From 790af5629fd98f2d853759ec1ef06ba587f2a131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 9 Oct 2025 16:08:25 +0000 Subject: [PATCH 01/95] Bump main to 2.9.0. --- projects/hipcub/CHANGELOG.md | 1 + projects/hipcub/cmake/Dependencies.cmake | 2 +- projects/hipcub/test/extra/CMakeLists.txt | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index 71fe8b706c9..d44bb814a61 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -21,6 +21,7 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project ### Changed * Updated the documentation on how to run hipCUB tests on multiple GPUs in parallel. +* Changed `CCCL_MINIMUM_VERSION` to `2.9.0` to align with CUB. ### Removed diff --git a/projects/hipcub/cmake/Dependencies.cmake b/projects/hipcub/cmake/Dependencies.cmake index 246a98af0b2..6c5eba8153e 100644 --- a/projects/hipcub/cmake/Dependencies.cmake +++ b/projects/hipcub/cmake/Dependencies.cmake @@ -373,7 +373,7 @@ endif(USER_BUILD_BENCHMARK) # CUB (only for CUDA platform) if(HIP_COMPILER STREQUAL "nvcc") - set(CCCL_MINIMUM_VERSION 2.8.2) + set(CCCL_MINIMUM_VERSION 2.9.0) if(NOT DOWNLOAD_CUB) find_package(CCCL ${CCCL_MINIMUM_VERSION} CONFIG) endif() diff --git a/projects/hipcub/test/extra/CMakeLists.txt b/projects/hipcub/test/extra/CMakeLists.txt index 8ab3eb57755..7804f549adc 100644 --- a/projects/hipcub/test/extra/CMakeLists.txt +++ b/projects/hipcub/test/extra/CMakeLists.txt @@ -42,7 +42,7 @@ include(VerifyCompiler) # CUB (only for CUDA platform) if(HIP_COMPILER STREQUAL "nvcc") - set(CCCL_MINIMUM_VERSION 2.8.2) + set(CCCL_MINIMUM_VERSION 2.9.0) if(NOT DOWNLOAD_CUB) find_package(CCCL ${CCCL_MINIMUM_VERSION} CONFIG) endif() From 718586a3419007137d02a5e632908edc012e45c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 9 Oct 2025 16:20:15 +0000 Subject: [PATCH 02/95] Bump main to 3.0.0. 
--- projects/hipcub/CHANGELOG.md | 2 +- projects/hipcub/cmake/Dependencies.cmake | 2 +- projects/hipcub/test/extra/CMakeLists.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index d44bb814a61..ab3956e15cd 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -21,7 +21,7 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project ### Changed * Updated the documentation on how to run hipCUB tests on multiple GPUs in parallel. -* Changed `CCCL_MINIMUM_VERSION` to `2.9.0` to align with CUB. +* Changed `CCCL_MINIMUM_VERSION` to `3.0.0` to align with CUB. ### Removed diff --git a/projects/hipcub/cmake/Dependencies.cmake b/projects/hipcub/cmake/Dependencies.cmake index 6c5eba8153e..c5c88c29f47 100644 --- a/projects/hipcub/cmake/Dependencies.cmake +++ b/projects/hipcub/cmake/Dependencies.cmake @@ -373,7 +373,7 @@ endif(USER_BUILD_BENCHMARK) # CUB (only for CUDA platform) if(HIP_COMPILER STREQUAL "nvcc") - set(CCCL_MINIMUM_VERSION 2.9.0) + set(CCCL_MINIMUM_VERSION 3.0.0) if(NOT DOWNLOAD_CUB) find_package(CCCL ${CCCL_MINIMUM_VERSION} CONFIG) endif() diff --git a/projects/hipcub/test/extra/CMakeLists.txt b/projects/hipcub/test/extra/CMakeLists.txt index 7804f549adc..082c64a9bea 100644 --- a/projects/hipcub/test/extra/CMakeLists.txt +++ b/projects/hipcub/test/extra/CMakeLists.txt @@ -42,7 +42,7 @@ include(VerifyCompiler) # CUB (only for CUDA platform) if(HIP_COMPILER STREQUAL "nvcc") - set(CCCL_MINIMUM_VERSION 2.9.0) + set(CCCL_MINIMUM_VERSION 3.0.0) if(NOT DOWNLOAD_CUB) find_package(CCCL ${CCCL_MINIMUM_VERSION} CONFIG) endif() From a4636e544880aea2704171b8bc4ea59952da4544 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 9 Oct 2025 16:39:35 +0000 Subject: [PATCH 03/95] Require C++17 for compiling hipCUB --- projects/hipcub/CMakeLists.txt | 6 ++---- projects/hipcub/hipcub/include/hipcub/config.hpp | 2 +- 2 files 
changed, 3 insertions(+), 5 deletions(-) diff --git a/projects/hipcub/CMakeLists.txt b/projects/hipcub/CMakeLists.txt index 43f997e9988..dd6ef46a795 100644 --- a/projects/hipcub/CMakeLists.txt +++ b/projects/hipcub/CMakeLists.txt @@ -47,10 +47,8 @@ endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -if (CMAKE_CXX_STANDARD EQUAL 14) - message(WARNING "C++14 will be deprecated in the next major release") -elseif(NOT CMAKE_CXX_STANDARD EQUAL 17) - message(FATAL_ERROR "Only C++14 and C++17 are supported") +if(NOT CMAKE_CXX_STANDARD EQUAL 17) + message(FATAL_ERROR "Only C++17 are supported") endif() # Set HIP flags diff --git a/projects/hipcub/hipcub/include/hipcub/config.hpp b/projects/hipcub/hipcub/include/hipcub/config.hpp index 18ad351a093..a8c45216926 100644 --- a/projects/hipcub/hipcub/include/hipcub/config.hpp +++ b/projects/hipcub/hipcub/include/hipcub/config.hpp @@ -204,7 +204,7 @@ END_HIPCUB_NAMESPACE #define HIPCUB_IF_CONSTEXPR constexpr #else #if defined(_MSC_VER) && !defined(__clang__) - // MSVC (and not Clang pretending to be MSVC) unconditionally exposes if constexpr (even in C++14 mode), + // MSVC (and not Clang pretending to be MSVC) unconditionally exposes if constexpr, // moreover it triggers warning C4127 (conditional expression is constant) when not using it. nvcc will // be calling cl.exe for host-side codegen. 
#define HIPCUB_IF_CONSTEXPR constexpr From bd794da3c226609e83540eb97c9b2f62b90cb50a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 9 Oct 2025 16:48:28 +0000 Subject: [PATCH 04/95] Drop `BinaryFlip` operator --- .../rocprim/thread/thread_operators.hpp | 24 ------------ projects/hipcub/rtest.xml | 2 +- .../hipcub/test_hipcub_thread_operators.cpp | 38 ------------------- 3 files changed, 1 insertion(+), 63 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp index 193817f486e..df8a143ad4d 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp @@ -256,30 +256,6 @@ struct ReduceByKeyOp } }; -template -struct BinaryFlip -{ - BinaryOpT binary_op; - - HIPCUB_HOST_DEVICE - explicit BinaryFlip(BinaryOpT binary_op) : binary_op(binary_op) - { - } - - template - HIPCUB_DEVICE auto operator()(T&& t, U&& u) -> decltype(auto) - { - return binary_op(std::forward(u), std::forward(t)); - } -}; - -template -HIPCUB_HOST_DEVICE -BinaryFlip MakeBinaryFlip(BinaryOpT binary_op) -{ - return BinaryFlip(binary_op); -} - namespace internal { diff --git a/projects/hipcub/rtest.xml b/projects/hipcub/rtest.xml index af8b5917109..427e10edc96 100644 --- a/projects/hipcub/rtest.xml +++ b/projects/hipcub/rtest.xml @@ -2,7 +2,7 @@ - + diff --git a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp index b024c526711..babcd7041a9 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp @@ -696,44 +696,6 @@ TYPED_TEST(HipcubNCThreadOperatorsTests, ReduceByKeyOp) } } -TYPED_TEST(HipcubNCThreadOperatorsTests, BinaryFlip) -{ - using input_type = typename 
TestFixture::input_type; - using output_type = typename TestFixture::output_type; - - const std::vector sizes = get_sizes(); - for(auto input_size : sizes) - { - // Generate data. - std::vector h_input(input_size); - std::iota(h_input.begin(), h_input.end(), static_cast(1)); - - // Scan function: BinaryFlip. - hipcub::Sum sum_op{}; - hipcub::BinaryFlip scan_op(sum_op); - - // Calculate expected results on host. - std::vector h_expected{}; - - // BinaryFlip's () operator is a device function, so cannot be called from the host function - // test_utils::host_inclusive_scan. We do the scan "manually". - output_type accum = h_input[0]; - h_expected.push_back(accum); - for(size_t i = 1; i < input_size; ++i) - { - // The host_inclusive_cast would do: - // - // accum = scan_op(accum, static_cast(h_input[i])); - // - // But for the BinaryFlip this is equivalent to: - accum = sum_op(static_cast(h_input[i]), accum); - h_expected.push_back(accum); - } - - scan_op_test(h_input, h_expected, scan_op, input_size); - } -} - // Unary operators tests. TYPED_TEST(HipcubNCThreadOperatorsTests, CastOp) From a9cfc515452f21240f4a154f71a8b279a589e3a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 13 Oct 2025 11:55:52 +0000 Subject: [PATCH 05/95] Deprecate `hipcub::Swap` --- projects/hipcub/CHANGELOG.md | 4 +++ .../backend/rocprim/thread/thread_sort.hpp | 31 ++++++++++++++----- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index ab3956e15cd..4fab8d7c32f 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -18,6 +18,10 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project * Added `generate_resource_spec.cpp` to the test directory. It is now built as a new target by CMake. It generates the resource spec file required by CTest when running tests in parallel. +### Removed + +* Deprecated `hipcub::Swap`, use `rocprim::swap` instead. 
+ ### Changed * Updated the documentation on how to run hipCUB tests on multiple GPUs in parallel. diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp index 8e7a97b66f8..15c3211c575 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp @@ -36,19 +36,28 @@ #include // IWYU pragma: export +#include + +#if defined(__HIP_PLATFORM_NVIDIA__) + #include +#endif + BEGIN_HIPCUB_NAMESPACE // Should be deprecated once hip::std::swap is available in this scope. template -HIPCUB_DEVICE -HIPCUB_FORCEINLINE void Swap(T& lhs, T& rhs) +#if defined(__HIP_PLATFORM_NVIDIA__) +HIPCUB_DEPRECATED_BECAUSE("Use cuda::std::swap") +#else +HIPCUB_DEPRECATED_BECAUSE("Use rocprim::swap") +#endif +HIPCUB_DEVICE HIPCUB_FORCEINLINE void Swap(T& lhs, T& rhs) { T temp = lhs; lhs = rhs; rhs = temp; } - /** * @brief Sorts data using odd-even sort method * @@ -97,11 +106,17 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], { if (compare_op(keys[j + 1], keys[j])) { - Swap(keys[j], keys[j + 1]); - if (!KEYS_ONLY) - { - Swap(items[j], items[j + 1]); - } + +#if defined(__HIP_PLATFORM_NVIDIA__) + using ::cuda::std::swap; +#else + using ::rocprim::swap; +#endif + swap(keys[j], keys[j + 1]); + if(!KEYS_ONLY) + { + swap(items[j], items[j + 1]); + } } } // inner loop } // outer loop From 340e1d6d8a012ea5537497a6e4842fa0cb37e30a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 15 Oct 2025 13:44:25 +0000 Subject: [PATCH 06/95] Drop hipCUB APIs with a debug_synchronous parameter --- .../cub/device/device_adjacent_difference.hpp | 90 ---- .../backend/cub/device/device_histogram.hpp | 200 -------- .../backend/cub/device/device_merge_sort.hpp | 130 ------ .../backend/cub/device/device_partition.hpp | 90 ---- .../backend/cub/device/device_radix_sort.hpp 
| 188 -------- .../backend/cub/device/device_reduce.hpp | 130 ------ .../cub/device/device_run_length_encode.hpp | 52 --- .../hipcub/backend/cub/device/device_scan.hpp | 293 ------------ .../device/device_segmented_radix_sort.hpp | 236 ---------- .../cub/device/device_segmented_reduce.hpp | 146 ------ .../cub/device/device_segmented_sort.hpp | 432 ------------------ .../backend/cub/device/device_select.hpp | 143 ------ .../hipcub/backend/cub/device/device_spmv.hpp | 32 -- 13 files changed, 2162 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp index 8b1c85246bf..7c8f96dd569 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp @@ -60,30 +60,6 @@ struct DeviceAdjacentDifference stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static HIPCUB_RUNTIME_FUNCTION hipError_t - SubtractLeftCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - InputIteratorT d_input, - OutputIteratorT d_output, - NumItemsT num_items, - DifferenceOpT difference_op, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SubtractLeftCopy(d_temp_storage, - temp_storage_bytes, - d_input, - d_output, - num_items, - difference_op, - stream); - } - template @@ -103,27 +79,6 @@ struct DeviceAdjacentDifference stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static HIPCUB_RUNTIME_FUNCTION hipError_t - SubtractLeft(void* d_temp_storage, - std::size_t& temp_storage_bytes, - RandomAccessIteratorT d_input, - NumItemsT num_items, - DifferenceOpT difference_op, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SubtractLeft(d_temp_storage, - 
temp_storage_bytes, - d_input, - num_items, - difference_op, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static HIPCUB_RUNTIME_FUNCTION hipError_t - SubtractRightCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - InputIteratorT d_input, - OutputIteratorT d_output, - NumItemsT num_items, - DifferenceOpT difference_op, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SubtractRightCopy(d_temp_storage, - temp_storage_bytes, - d_input, - d_output, - num_items, - difference_op, - stream); - } - template @@ -188,27 +119,6 @@ struct DeviceAdjacentDifference difference_op, stream)); } - - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static HIPCUB_RUNTIME_FUNCTION hipError_t - SubtractRight(void* d_temp_storage, - std::size_t& temp_storage_bytes, - RandomAccessIteratorT d_input, - NumItemsT num_items, - DifferenceOpT difference_op, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SubtractRight(d_temp_storage, - temp_storage_bytes, - d_input, - num_items, - difference_op, - stream); - } }; END_HIPCUB_NAMESPACE diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_histogram.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_histogram.hpp index 60018ad6854..995d3b61303 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_histogram.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_histogram.hpp @@ -61,31 +61,6 @@ struct DeviceHistogram stream)); } - template - HIPCUB_RUNTIME_FUNCTION HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static hipError_t - HistogramEven(void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_histogram, - int num_levels, - LevelT lower_level, - LevelT upper_level, - OffsetT num_samples, - hipStream_t stream, - bool debug_synchronous) - { - 
HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return HistogramEven(d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - lower_level, - upper_level, - num_samples, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t HistogramEven(void* d_temp_storage, size_t& temp_storage_bytes, @@ -112,35 +87,6 @@ struct DeviceHistogram stream)); } - template - HIPCUB_RUNTIME_FUNCTION HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static hipError_t - HistogramEven(void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_histogram, - int num_levels, - LevelT lower_level, - LevelT upper_level, - OffsetT num_row_samples, - OffsetT num_rows, - size_t row_stride_bytes, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return HistogramEven(d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - lower_level, - upper_level, - num_row_samples, - num_rows, - row_stride_bytes, - stream); - } - template - HIPCUB_RUNTIME_FUNCTION static hipError_t - MultiHistogramEven(void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], - int num_levels[NUM_ACTIVE_CHANNELS], - LevelT lower_level[NUM_ACTIVE_CHANNELS], - LevelT upper_level[NUM_ACTIVE_CHANNELS], - OffsetT num_row_pixels, - OffsetT num_rows, - size_t row_stride_bytes, - hipStream_t stream = 0) - { - return hipCUDAErrorTohipError( - ::cub::DeviceHistogram::MultiHistogramEven( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - lower_level, - upper_level, - num_row_pixels, - num_rows, - row_stride_bytes, - stream)); - } - - template - HIPCUB_RUNTIME_FUNCTION HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static hipError_t - MultiHistogramEven(void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], - int 
num_levels[NUM_ACTIVE_CHANNELS], - LevelT lower_level[NUM_ACTIVE_CHANNELS], - LevelT upper_level[NUM_ACTIVE_CHANNELS], - OffsetT num_row_pixels, - OffsetT num_rows, - size_t row_stride_bytes, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return MultiHistogramEven(d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - lower_level, - upper_level, - num_row_pixels, - num_rows, - row_stride_bytes, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t HistogramRange(void* d_temp_storage, size_t& temp_storage_bytes, @@ -289,29 +167,6 @@ struct DeviceHistogram stream)); } - template - HIPCUB_RUNTIME_FUNCTION HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static hipError_t - HistogramRange(void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_histogram, - int num_levels, - LevelT* d_levels, - OffsetT num_samples, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return HistogramRange(d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - d_levels, - num_samples, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t HistogramRange(void* d_temp_storage, size_t& temp_storage_bytes, @@ -336,33 +191,6 @@ struct DeviceHistogram stream)); } - template - HIPCUB_RUNTIME_FUNCTION HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static hipError_t - HistogramRange(void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_histogram, - int num_levels, - LevelT* d_levels, - OffsetT num_row_samples, - OffsetT num_rows, - size_t row_stride_bytes, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return HistogramRange(d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - d_levels, - num_row_samples, - num_rows, - row_stride_bytes, - stream); - } - template 
- HIPCUB_RUNTIME_FUNCTION HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static hipError_t - MultiHistogramRange(void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], - int num_levels[NUM_ACTIVE_CHANNELS], - LevelT* d_levels[NUM_ACTIVE_CHANNELS], - OffsetT num_pixels, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return MultiHistogramRange(d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - d_levels, - num_pixels, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairs(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - ValueIteratorT d_items, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortPairs(d_temp_storage, - temp_storage_bytes, - d_keys, - d_items, - num_items, - compare_op, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairsCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyInputIteratorT d_input_keys, - ValueInputIteratorT d_input_items, - KeyIteratorT d_output_keys, - ValueIteratorT d_output_items, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortPairsCopy(d_temp_storage, - temp_storage_bytes, - d_input_keys, - d_input_items, - d_output_keys, - d_output_items, - num_items, - compare_op, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeys(void* d_temp_storage, std::size_t& temp_storage_bytes, @@ -151,20 +100,6 @@ struct DeviceMergeSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeys(void* d_temp_storage, 
- std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeysCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyInputIteratorT d_input_keys, - KeyIteratorT d_output_keys, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream, - bool debug_synchronous) - - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortKeysCopy(d_temp_storage, - temp_storage_bytes, - d_input_keys, - d_output_keys, - num_items, - compare_op, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortPairs(void* d_temp_storage, std::size_t& temp_storage_bytes, @@ -230,27 +140,6 @@ struct DeviceMergeSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortPairs(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - ValueIteratorT d_items, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return StableSortPairs(d_temp_storage, - temp_storage_bytes, - d_keys, - d_items, - num_items, - compare_op, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortKeys(void* d_temp_storage, std::size_t& temp_storage_bytes, @@ -267,25 +156,6 @@ struct DeviceMergeSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortKeys(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream, - bool debug_synchronous) - { - 
HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return StableSortKeys(d_temp_storage, - temp_storage_bytes, - d_keys, - num_items, - compare_op, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t Flagged(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - FlagIterator d_flags, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return Flagged(d_temp_storage, - temp_storage_bytes, - d_in, - d_flags, - d_out, - d_num_selected_out, - num_items, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t If(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - NumItemsT num_items, - SelectOp select_op, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return If(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - d_num_selected_out, - num_items, - select_op, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t If(void* d_temp_storage, - std::size_t& temp_storage_bytes, - InputIteratorT d_in, - FirstOutputIteratorT d_first_part_out, - SecondOutputIteratorT d_second_part_out, - UnselectedOutputIteratorT d_unselected_out, - NumSelectedIteratorT d_num_selected_out, - NumItemsT num_items, - SelectFirstPartOp select_first_part_op, - SelectSecondPartOp select_second_part_op, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return If(d_temp_storage, - temp_storage_bytes, - d_in, - d_first_part_out, - d_second_part_out, - d_unselected_out, - d_num_selected_out, - num_items, - select_first_part_op, - select_second_part_op, - stream); - } 
}; END_HIPCUB_NAMESPACE diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_radix_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_radix_sort.hpp index f2b39690348..91129eacea8 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_radix_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_radix_sort.hpp @@ -65,33 +65,6 @@ struct DeviceRadixSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - NumItemsT num_items, - int begin_bit, - int end_bit, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortPairs(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - begin_bit, - end_bit, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static auto SortPairs(void* d_temp_storage, size_t& temp_storage_bytes, @@ -162,29 +135,6 @@ struct DeviceRadixSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - NumItemsT num_items, - int begin_bit, - int end_bit, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortPairs(d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static auto SortPairs(void* d_temp_storage, size_t& temp_storage_bytes, @@ -252,33 +202,6 @@ struct DeviceRadixSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairsDescending(void* 
d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - NumItemsT num_items, - int begin_bit, - int end_bit, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortPairsDescending(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - begin_bit, - end_bit, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static auto SortPairsDescending(void* d_temp_storage, size_t& temp_storage_bytes, @@ -352,29 +275,6 @@ struct DeviceRadixSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - NumItemsT num_items, - int begin_bit, - int end_bit, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortPairsDescending(d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static auto SortPairsDescending(void* d_temp_storage, size_t& temp_storage_bytes, @@ -439,29 +339,6 @@ struct DeviceRadixSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - NumItemsT num_items, - int begin_bit, - int end_bit, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortKeys(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - num_items, - begin_bit, - end_bit, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static auto SortKeys(void* d_temp_storage, size_t& temp_storage_bytes, @@ -522,27 +399,6 @@ struct DeviceRadixSort stream)); 
} - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - NumItemsT num_items, - int begin_bit, - int end_bit, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortKeys(d_temp_storage, - temp_storage_bytes, - d_keys, - num_items, - begin_bit, - end_bit, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static auto SortKeys(void* d_temp_storage, size_t& temp_storage_bytes, @@ -601,29 +457,6 @@ struct DeviceRadixSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - NumItemsT num_items, - int begin_bit, - int end_bit, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortKeysDescending(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - num_items, - begin_bit, - end_bit, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static auto SortKeysDescending(void* d_temp_storage, size_t& temp_storage_bytes, @@ -684,27 +517,6 @@ struct DeviceRadixSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - NumItemsT num_items, - int begin_bit, - int end_bit, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortKeysDescending(d_temp_storage, - temp_storage_bytes, - d_keys, - num_items, - begin_bit, - end_bit, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static auto SortKeysDescending(void* d_temp_storage, size_t& temp_storage_bytes, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp 
b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp index 8d81dce0e1e..7bb0d0df244 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp @@ -64,33 +64,6 @@ class DeviceReduce stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - Reduce(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumItemsT num_items, - ReduceOpT reduction_op, - T init, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return Reduce(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - reduction_op, - init, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t Sum(void* d_temp_storage, size_t& temp_storage_bytes, @@ -107,20 +80,6 @@ class DeviceReduce stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - Sum(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t Min(void* d_temp_storage, size_t& temp_storage_bytes, @@ -137,20 +96,6 @@ class DeviceReduce stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - Min(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); - } - template HIPCUB_RUNTIME_FUNCTION 
static hipError_t ArgMin(void* d_temp_storage, @@ -195,20 +140,6 @@ class DeviceReduce _CCCL_SUPPRESS_DEPRECATED_POP } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - ArgMin(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t Max(void* d_temp_storage, size_t& temp_storage_bytes, @@ -225,20 +156,6 @@ class DeviceReduce stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - Max(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t ArgMax(void* d_temp_storage, @@ -283,20 +200,6 @@ class DeviceReduce _CCCL_SUPPRESS_DEPRECATED_POP } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - ArgMax(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - ReduceByKey(void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - UniqueOutputIteratorT d_unique_out, - ValuesInputIteratorT d_values_in, - AggregatesOutputIteratorT 
d_aggregates_out, - NumRunsOutputIteratorT d_num_runs_out, - ReductionOpT reduction_op, - NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return ReduceByKey(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_unique_out, - d_values_in, - d_aggregates_out, - d_num_runs_out, - reduction_op, - num_items, - stream); - } }; END_HIPCUB_NAMESPACE diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_run_length_encode.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_run_length_encode.hpp index 824ec222874..b23721084ae 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_run_length_encode.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_run_length_encode.hpp @@ -63,32 +63,6 @@ class DeviceRunLengthEncode stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - Encode(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - UniqueOutputIteratorT d_unique_out, - LengthsOutputIteratorT d_counts_out, - NumRunsOutputIteratorT d_num_runs_out, - int num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return Encode(d_temp_storage, - temp_storage_bytes, - d_in, - d_unique_out, - d_counts_out, - d_num_runs_out, - num_items, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - NonTrivialRuns(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OffsetsOutputIteratorT d_offsets_out, - LengthsOutputIteratorT d_lengths_out, - NumRunsOutputIteratorT d_num_runs_out, - int num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return NonTrivialRuns(d_temp_storage, - temp_storage_bytes, - d_in, - d_offsets_out, - d_lengths_out, - d_num_runs_out, - 
num_items, - stream); - } }; END_HIPCUB_NAMESPACE diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp index 72ad11f7bc8..57abaaafec8 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp @@ -57,20 +57,6 @@ class DeviceScan stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t InclusiveSum(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t InclusiveSum(void* d_temp_storage, @@ -82,23 +68,6 @@ class DeviceScan return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t InclusiveSum(void* d_temp_storage, - size_t& temp_storage_bytes, - IteratorT d_data, - int num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return InclusiveSum(d_temp_storage, - temp_storage_bytes, - d_data, - num_items, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t InclusiveScan(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return InclusiveScan(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - num_items, - stream); - } - template 
HIPCUB_RUNTIME_FUNCTION static hipError_t InclusiveScan(void* d_temp_storage, @@ -163,25 +108,6 @@ class DeviceScan stream); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t InclusiveScan(void* d_temp_storage, - size_t& temp_storage_bytes, - IteratorT d_data, - ScanOpT scan_op, - NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return InclusiveScan(d_temp_storage, - temp_storage_bytes, - d_data, - scan_op, - num_items, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t ExclusiveSum(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t ExclusiveSum(void* d_temp_storage, @@ -249,23 +161,6 @@ class DeviceScan return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t ExclusiveSum(void* d_temp_storage, - size_t& temp_storage_bytes, - IteratorT d_data, - NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return ExclusiveSum(d_temp_storage, - temp_storage_bytes, - d_data, - num_items, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t ExclusiveScan(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - InitValueT init_value, - NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) - { - 
HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return ExclusiveScan(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - init_value, - num_items, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t ExclusiveScan(void* d_temp_storage, @@ -338,27 +206,6 @@ class DeviceScan stream); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t ExclusiveScan(void* d_temp_storage, - size_t& temp_storage_bytes, - IteratorT d_data, - ScanOpT scan_op, - InitValueT init_value, - NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return ExclusiveScan(d_temp_storage, - temp_storage_bytes, - d_data, - scan_op, - init_value, - num_items, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t ExclusiveScan(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - FutureValue init_value, - NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return ExclusiveScan(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - init_value, - num_items, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t ExclusiveScan(void* d_temp_storage, - size_t& temp_storage_bytes, - IteratorT d_data, - ScanOpT scan_op, - FutureValue init_value, - NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return ExclusiveScan(d_temp_storage, - temp_storage_bytes, - d_data, - scan_op, - init_value, - num_items, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t ExclusiveSumByKey(void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - 
ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - NumItemsT num_items, - EqualityOpT equality_op, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return ExclusiveSumByKey(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - num_items, - equality_op, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t ExclusiveScanByKey(void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - ScanOpT scan_op, - InitValueT init_value, - NumItemsT num_items, - EqualityOpT equality_op, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return ExclusiveScanByKey(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - scan_op, - init_value, - num_items, - equality_op, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t InclusiveSumByKey(void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - NumItemsT num_items, - EqualityOpT equality_op, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return InclusiveSumByKey(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - num_items, - equality_op, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - int begin_bit, - int 
end_bit, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortPairs(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t SortPairs(void* d_temp_storage, size_t& temp_storage_bytes, @@ -128,35 +95,6 @@ struct DeviceSegmentedRadixSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortPairs(d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t SortPairsDescending(void* d_temp_storage, size_t& temp_storage_bytes, @@ -188,39 +126,6 @@ struct DeviceSegmentedRadixSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortPairsDescending(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - num_segments, - 
d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t SortPairsDescending(void* d_temp_storage, size_t& temp_storage_bytes, @@ -248,35 +153,6 @@ struct DeviceSegmentedRadixSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortPairsDescending(d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeys(void* d_temp_storage, size_t& temp_storage_bytes, @@ -303,35 +179,6 @@ struct DeviceSegmentedRadixSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortKeys(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeys(void* d_temp_storage, size_t& temp_storage_bytes, @@ -356,33 +203,6 @@ struct DeviceSegmentedRadixSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - 
SortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortKeys(d_temp_storage, - temp_storage_bytes, - d_keys, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeysDescending(void* d_temp_storage, size_t& temp_storage_bytes, @@ -410,35 +230,6 @@ struct DeviceSegmentedRadixSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortKeysDescending(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeysDescending(void* d_temp_storage, size_t& temp_storage_bytes, @@ -463,33 +254,6 @@ struct DeviceSegmentedRadixSort end_bit, stream)); } - - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return 
SortKeysDescending(d_temp_storage, - temp_storage_bytes, - d_keys, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - stream); - } }; END_HIPCUB_NAMESPACE diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_reduce.hpp index 4f21fbfac37..041f3b6a303 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_reduce.hpp @@ -67,37 +67,6 @@ struct DeviceSegmentedReduce stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - Reduce(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - ReductionOp reduction_op, - T initial_value, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return Reduce(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - reduction_op, - initial_value, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t Sum(void* d_temp_storage, size_t& temp_storage_bytes, @@ -118,29 +87,6 @@ struct DeviceSegmentedReduce stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - Sum(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return Sum(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template 
HIPCUB_RUNTIME_FUNCTION static hipError_t Min(void* d_temp_storage, size_t& temp_storage_bytes, @@ -161,29 +107,6 @@ struct DeviceSegmentedReduce stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - Min(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return Min(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t ArgMin(void* d_temp_storage, size_t& temp_storage_bytes, @@ -204,29 +127,6 @@ struct DeviceSegmentedReduce stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - ArgMin(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return ArgMin(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t Max(void* d_temp_storage, size_t& temp_storage_bytes, @@ -247,29 +147,6 @@ struct DeviceSegmentedReduce stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - Max(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return Max(d_temp_storage, - 
temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t ArgMax(void* d_temp_storage, size_t& temp_storage_bytes, @@ -289,29 +166,6 @@ struct DeviceSegmentedReduce d_end_offsets, stream)); } - - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - ArgMax(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return ArgMax(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } }; END_HIPCUB_NAMESPACE diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp index b31bada2f27..3bc8e6e1130 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp @@ -61,31 +61,6 @@ struct DeviceSegmentedSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortKeys(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeysDescending(void* d_temp_storage, @@ -110,31 +85,6 
@@ struct DeviceSegmentedSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortKeysDescending(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeys(void* d_temp_storage, size_t& temp_storage_bytes, @@ -155,29 +105,6 @@ struct DeviceSegmentedSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortKeys(d_temp_storage, - temp_storage_bytes, - d_keys, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeysDescending(void* d_temp_storage, @@ -200,29 +127,6 @@ struct DeviceSegmentedSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortKeysDescending(d_temp_storage, - temp_storage_bytes, - 
d_keys, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortKeys(void* d_temp_storage, size_t& temp_storage_bytes, @@ -245,31 +149,6 @@ struct DeviceSegmentedSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return StableSortKeys(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortKeysDescending(void* d_temp_storage, @@ -294,31 +173,6 @@ struct DeviceSegmentedSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return StableSortKeysDescending(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortKeys(void* d_temp_storage, size_t& temp_storage_bytes, @@ -339,29 +193,6 @@ struct DeviceSegmentedSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& 
d_keys, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return StableSortKeys(d_temp_storage, - temp_storage_bytes, - d_keys, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortKeysDescending(void* d_temp_storage, @@ -384,29 +215,6 @@ struct DeviceSegmentedSort stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return StableSortKeysDescending(d_temp_storage, - temp_storage_bytes, - d_keys, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortPairs(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - 
const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortPairsDescending(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortPairs(d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return SortPairsDescending(d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - BeginOffsetIteratorT 
d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return StableSortPairs(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return StableSortPairsDescending(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return StableSortPairs(d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - 
hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return StableSortPairsDescending(d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } }; END_HIPCUB_NAMESPACE diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_select.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_select.hpp index 6812c5cfeb7..5d9ee18ec64 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_select.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_select.hpp @@ -66,32 +66,6 @@ class DeviceSelect stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t Flagged(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - FlagIterator d_flags, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - int64_t num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return Flagged(d_temp_storage, - temp_storage_bytes, - d_in, - d_flags, - d_out, - d_num_selected_out, - num_items, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t Flagged(void* d_temp_storage, @@ -112,27 +86,6 @@ class DeviceSelect stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t Flagged(void* d_temp_storage, - size_t& temp_storage_bytes, - IteratorT d_data, - FlagIterator d_flags, - NumSelectedIteratorT d_num_selected_out, - int64_t num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return Flagged(d_temp_storage, - temp_storage_bytes, - d_data, - d_flags, - d_num_selected_out, - num_items, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t If(void* d_temp_storage, - 
size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - int64_t num_items, - SelectOp select_op, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return If(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - d_num_selected_out, - num_items, - select_op, - stream); - } - template HIPCUB_RUNTIME_FUNCTION static hipError_t If(void* d_temp_storage, @@ -202,27 +129,6 @@ class DeviceSelect stream)); } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t If(void* d_temp_storage, - size_t& temp_storage_bytes, - IteratorT d_data, - NumSelectedIteratorT d_num_selected_out, - int64_t num_items, - SelectOp select_op, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return If(d_temp_storage, - temp_storage_bytes, - d_data, - d_num_selected_out, - num_items, - select_op, - stream); - } - template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t Unique(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - int64_t num_items, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - return Unique(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - d_num_selected_out, - num_items, - stream); - } - template - HIPCUB_RUNTIME_FUNCTION - static hipError_t UniqueByKey(void* d_temp_storage, - size_t& temp_storage_bytes, - KeyIteratorT d_keys_input, - ValueIteratorT d_values_input, - OutputKeyIteratorT d_keys_output, - OutputValueIteratorT d_values_output, - NumSelectedIteratorT d_num_selected_out, - NumItemsT num_items, - hipStream_t stream = 0) - { - return hipCUDAErrorTohipError(::cub::DeviceSelect::UniqueByKey(d_temp_storage, - temp_storage_bytes, - d_keys_input, - d_values_input, - 
d_keys_output, - d_values_output, - d_num_selected_out, - num_items, - stream)); - } - template - HIPCUB_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - CsrMV(void* d_temp_storage, - size_t& temp_storage_bytes, - ValueT* d_values, - int* d_row_offsets, - int* d_column_indices, - ValueT* d_vector_x, - ValueT* d_vector_y, - int num_rows, - int num_cols, - int num_nonzeros, - hipStream_t stream, - bool debug_synchronous) - { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); - _CCCL_SUPPRESS_DEPRECATED_PUSH - return CsrMV(d_temp_storage, - temp_storage_bytes, - d_values, - d_row_offsets, - d_column_indices, - d_vector_x, - d_vector_y, - num_rows, - num_cols, - num_nonzeros, - stream); - _CCCL_SUPPRESS_DEPRECATED_POP - } }; END_HIPCUB_NAMESPACE From d5e9910e92905f11636ed0b9c6c02594095b73c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 27 Oct 2025 13:41:59 +0000 Subject: [PATCH 07/95] Update deprecated warning for hipcub::DivideAndRoundUp --- .../hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp index 52ae8a3b3df..7f203cd8e70 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp @@ -196,8 +196,9 @@ using is_integral_or_enum = } -// CUB deprecated this API, and suggests to use `::cuda::ceil_div` instead, +// CUB removed this API, and suggests to use `::cuda::ceil_div` instead, // which is implemented in file `libcudacxx/include/cuda/__cmath/ceil_div.h`. +// Remove when hip::ceil_div is implemented. 
template HIPCUB_DEPRECATED_BECAUSE("Use hip::ceil_div instead from 'libhipcxx'") HIPCUB_HOST_DEVICE __forceinline__ constexpr NumeratorT From a5f87e42442468d20990009217f6190146ea1931 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 24 Nov 2025 12:13:23 +0000 Subject: [PATCH 08/95] Add `hip::std` support --- projects/hipcub/.clang-format | 6 ++ .../hipcub/hipcub/include/hipcub/config.hpp | 8 ++ .../hipcub/hipcub/include/hipcub/libcxx.hpp | 99 +++++++++++++++++++ 3 files changed, 113 insertions(+) create mode 100644 projects/hipcub/hipcub/include/hipcub/libcxx.hpp diff --git a/projects/hipcub/.clang-format b/projects/hipcub/.clang-format index 21f7c259808..ddb45540880 100644 --- a/projects/hipcub/.clang-format +++ b/projects/hipcub/.clang-format @@ -167,4 +167,10 @@ Macros: - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS=[[DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS___]] BreakAfterAttributes: Always +WhitespaceSensitiveMacros: [ + 'HIPCUB_HAS_INCLUDE', + '_HIPCUB_LIBCXX_INCLUDE', + '_HIPCUB_STD_INCLUDE' +] + --- diff --git a/projects/hipcub/hipcub/include/hipcub/config.hpp b/projects/hipcub/hipcub/include/hipcub/config.hpp index a8c45216926..9c9be1708df 100644 --- a/projects/hipcub/hipcub/include/hipcub/config.hpp +++ b/projects/hipcub/hipcub/include/hipcub/config.hpp @@ -35,6 +35,14 @@ // Version #include "hipcub_version.hpp" // IWYU pragma: export +// Manage std implementation +#include "libcxx.hpp" // IWYU pragma: export + +// For _CCCL_IMPLICIT_SYSTEM_HEADER +#if _HIPCUB_HAS_DEVICE_SYSTEM_STD + #include _HIPCUB_LIBCXX_INCLUDE(__cccl_config) // IWYU pragma: export +#endif + #define HIPCUB_NAMESPACE hipcub // Inline namespace (e.g. 
HIPCUB_300400_NS where 300400 is the hipCUB version) is used to diff --git a/projects/hipcub/hipcub/include/hipcub/libcxx.hpp b/projects/hipcub/hipcub/include/hipcub/libcxx.hpp new file mode 100644 index 00000000000..d141c8816d6 --- /dev/null +++ b/projects/hipcub/hipcub/include/hipcub/libcxx.hpp @@ -0,0 +1,99 @@ +// MIT License +// +// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +// This is a utility file that helps managing which +// 'std' implementation we're using. The provided +// macros are for internal use only and may change +// in future versions. +// +// Example usage: +// #include _HIPCUB_STD_INCLUDE(optional) +// using optional_int = _HIPCUB_STD::optional; + +// Version that we depend on. We can ignore patch for now +// since we're only interested in breaking (major) and +// features (minor). 
+#define _HIPCUB_REQUIRED_LIBCXX_VERSION_MAJOR 2 +#define _HIPCUB_REQUIRED_LIBCXX_VERSION_MINOR 8 + +#ifdef __has_include + #define HIPCUB_HAS_INCLUDE(_X) __has_include(_X) +#else + #define HIPCUB_HAS_INCLUDE(_X) 0 +#endif + +#define _HIPCUB_STRINGIFY_IMPL(x) #x +#define _HIPCUB_STRINGIFY(x) _HIPCUB_STRINGIFY_IMPL(x) + +// clang-format off + +// If the '::cuda::std' namespace from 'libcudacxx' or 'libhipcxx' is available. +#if HIPCUB_HAS_INCLUDE() + #include + // If version matches and '_CUDA_VSTD' is available. + #if _LIBCUDACXX_CUDA_API_VERSION_MAJOR == _HIPCUB_REQUIRED_LIBCXX_VERSION_MAJOR \ + && _LIBCUDACXX_CUDA_API_VERSION_MINOR >= _HIPCUB_REQUIRED_LIBCXX_VERSION_MINOR \ + && defined(_CUDA_VSTD) + #define _HIPCUB_LIBCXX_INCLUDE(LIB) _HIPCUB_STRINGIFY(cuda/LIB) + #define _HIPCUB_STD_INCLUDE(LIB) _HIPCUB_STRINGIFY(cuda/std/LIB) + #define _HIPCUB_LIBCXX ::cuda + #define _HIPCUB_STD _CUDA_VSTD + #define _HIPCUB_HAS_DEVICE_SYSTEM_STD 1 + #define _HIPCUB_STD_NAMESPACE_BEGIN _LIBCUDACXX_BEGIN_NAMESPACE_STD + #define _HIPCUB_STD_NAMESPACE_END _LIBCUDACXX_END_NAMESPACE_STD + #endif + +// Otherwise, if the '::hip::std' namespace from 'libhipcxx' is available. +#elif HIPCUB_HAS_INCLUDE() + #include + // If version matches and '_CUDA_VSTD' is available. + #if _LIBCUDACXX_CUDA_API_VERSION_MAJOR == _HIPCUB_REQUIRED_LIBCXX_VERSION_MAJOR \ + && _LIBCUDACXX_CUDA_API_VERSION_MINOR >= _HIPCUB_REQUIRED_LIBCXX_VERSION_MINOR \ + && defined(_CUDA_VSTD) + #define _HIPCUB_LIBCXX_INCLUDE(LIB) _HIPCUB_STRINGIFY(hip/LIB) + #define _HIPCUB_STD_INCLUDE(LIB) _HIPCUB_STRINGIFY(hip/std/LIB) + // In 'libhipcxx' the '::hip' namespace is synonymous with '::cuda'. + #define _HIPCUB_LIBCXX ::hip + // In 'libhipcxx' the macro '_CUDA_VSTD' is also defined. 
+ #define _HIPCUB_STD _CUDA_VSTD + #define _HIPCUB_HAS_DEVICE_SYSTEM_STD 1 + #define _HIPCUB_STD_NAMESPACE_BEGIN _LIBCUDACXX_BEGIN_NAMESPACE_STD + #define _HIPCUB_STD_NAMESPACE_END _LIBCUDACXX_END_NAMESPACE_STD + #endif +#endif + +// If 'libcudacxx' or 'libhipcxx' is not found, use fallback. +#ifndef _HIPCUB_HAS_DEVICE_SYSTEM_STD + #define _HIPCUB_LIBCXX_INCLUDE(LIB) _HIPCUB_STRINGIFY(LIB) + #define _HIPCUB_STD_INCLUDE(LIB) _HIPCUB_STRINGIFY(LIB) + #define _HIPCUB_LIBCXX + #define _HIPCUB_STD ::std + #define _HIPCUB_HAS_DEVICE_SYSTEM_STD 0 + #define _HIPCUB_STD_NAMESPACE_BEGIN \ + namespace std \ + { + #define _HIPCUB_STD_NAMESPACE_END } +#endif + +// clang-format on From ce70816d2faa37e85c674ea6489871322a136cbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 27 Oct 2025 15:11:03 +0000 Subject: [PATCH 09/95] Adds support for large number of items and large number of segments to ``DeviceSegmentedSort`` --- projects/hipcub/CHANGELOG.md | 1 + .../cub/device/device_segmented_sort.hpp | 331 +++++++++--------- 2 files changed, 176 insertions(+), 156 deletions(-) diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index 4fab8d7c32f..2eb7f022e0b 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -26,6 +26,7 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project * Updated the documentation on how to run hipCUB tests on multiple GPUs in parallel. * Changed `CCCL_MINIMUM_VERSION` to `3.0.0` to align with CUB. +* Added support for large `num_items` and `num_segments` in `DeviceSegmentedSort`. 
### Removed diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp index 3bc8e6e1130..a5ece248fcb 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp @@ -35,20 +35,32 @@ #include // IWYU pragma: export +#ifdef __HIP_PLATFORM_AMD__ + #include // IWYU pragma: export +using ::hip::std::int64_t; +#elif defined(__HIP_PLATFORM_NVIDIA__) + #include // IWYU pragma: export +using ::cuda::std::int64_t; +#else + #include +using ::std::int64_t; +#endif + BEGIN_HIPCUB_NAMESPACE struct DeviceSegmentedSort { template - HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + int64_t num_items, + int64_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceSegmentedSort::SortKeys(d_temp_storage, temp_storage_bytes, @@ -62,16 +74,16 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeysDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + int64_t 
num_items, + int64_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError( ::cub::DeviceSegmentedSort::SortKeysDescending(d_temp_storage, @@ -86,14 +98,15 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + int64_t num_items, + int64_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceSegmentedSort::SortKeys(d_temp_storage, temp_storage_bytes, @@ -106,15 +119,15 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeysDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + int64_t num_items, + int64_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError( ::cub::DeviceSegmentedSort::SortKeysDescending(d_temp_storage, @@ -128,15 +141,16 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - 
hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + int64_t num_items, + int64_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceSegmentedSort::StableSortKeys(d_temp_storage, temp_storage_bytes, @@ -150,16 +164,16 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeysDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + int64_t num_items, + int64_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError( ::cub::DeviceSegmentedSort::StableSortKeysDescending(d_temp_storage, @@ -174,14 +188,15 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + int64_t num_items, + int64_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceSegmentedSort::StableSortKeys(d_temp_storage, temp_storage_bytes, @@ -194,15 +209,15 @@ struct 
DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeysDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + int64_t num_items, + int64_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError( ::cub::DeviceSegmentedSort::StableSortKeysDescending(d_temp_storage, @@ -219,17 +234,18 @@ struct DeviceSegmentedSort typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT> - HIPCUB_RUNTIME_FUNCTION static hipError_t SortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + int64_t num_items, + int64_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes, @@ -248,18 +264,18 @@ struct DeviceSegmentedSort typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT> - HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, 
- int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairsDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + int64_t num_items, + int64_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError( ::cub::DeviceSegmentedSort::SortPairsDescending(d_temp_storage, @@ -279,15 +295,16 @@ struct DeviceSegmentedSort typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT> - HIPCUB_RUNTIME_FUNCTION static hipError_t SortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + DoubleBuffer& d_values, + int64_t num_items, + int64_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes, @@ -304,16 +321,16 @@ struct DeviceSegmentedSort typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT> - HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairsDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + 
DoubleBuffer& d_values, + int64_t num_items, + int64_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError( ::cub::DeviceSegmentedSort::SortPairsDescending(d_temp_storage, @@ -331,17 +348,18 @@ struct DeviceSegmentedSort typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT> - HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + int64_t num_items, + int64_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError( ::cub::DeviceSegmentedSort::StableSortPairs(d_temp_storage, @@ -361,18 +379,18 @@ struct DeviceSegmentedSort typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT> - HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortPairsDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + int64_t num_items, + int64_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + 
EndOffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError( ::cub::DeviceSegmentedSort::StableSortPairsDescending(d_temp_storage, @@ -392,15 +410,16 @@ struct DeviceSegmentedSort typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT> - HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + DoubleBuffer& d_values, + int64_t num_items, + int64_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError( ::cub::DeviceSegmentedSort::StableSortPairs(d_temp_storage, @@ -418,16 +437,16 @@ struct DeviceSegmentedSort typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT> - HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortPairsDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + DoubleBuffer& d_values, + int64_t num_items, + int64_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError( ::cub::DeviceSegmentedSort::StableSortPairsDescending(d_temp_storage, From fa0556564a61c894018e4e9372a6210c7c02e7fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Fri, 5 Dec 2025 
13:45:13 +0000 Subject: [PATCH 10/95] Deprecate hipcub::min and hipcub::max and replace internal uses of std::min and std::max with hip::std --- .../benchmark/benchmark_device_histogram.cpp | 4 +-- .../benchmark_device_reduce_by_key.cpp | 2 +- .../benchmark_device_run_length_encode.cpp | 4 +-- projects/hipcub/benchmark/benchmark_utils.hpp | 27 +++++++++++-------- .../benchmark/common_benchmark_header.hpp | 5 +++- .../example_device_partition_flagged.cpp | 4 +-- .../device/example_device_partition_if.cpp | 4 +-- .../device/example_device_select_flagged.cpp | 4 +-- .../device/example_device_select_if.cpp | 4 +-- .../device/example_device_select_unique.cpp | 4 +-- projects/hipcub/examples/example_utils.hpp | 6 +++-- .../backend/rocprim/grid/grid_even_share.hpp | 4 +-- .../hipcub/backend/rocprim/util_macro.hpp | 8 +++--- .../hipcub/test/hipcub/common_test_header.hpp | 5 +++- .../test_hipcub_block_adjacent_difference.cpp | 14 +++++----- .../test_hipcub_block_run_length_decode.cpp | 3 ++- .../test/hipcub/test_hipcub_block_shuffle.cpp | 6 ++--- .../hipcub/test_hipcub_device_histogram.cpp | 21 +++++++++------ .../test_hipcub_device_reduce_by_key.cpp | 2 +- .../test_hipcub_device_run_length_encode.cpp | 16 +++++------ .../test/hipcub/test_hipcub_device_scan.cpp | 3 ++- .../test_hipcub_device_segmented_reduce.cpp | 12 ++++----- .../hipcub/test_hipcub_thread_operators.cpp | 4 +-- .../test/hipcub/test_hipcub_util_ptx.cpp | 25 ++++++++--------- .../test/hipcub/test_utils_assertions.hpp | 6 ++--- .../hipcub/test_utils_data_generation.hpp | 4 +-- 26 files changed, 111 insertions(+), 90 deletions(-) diff --git a/projects/hipcub/benchmark/benchmark_device_histogram.cpp b/projects/hipcub/benchmark/benchmark_device_histogram.cpp index ded31e28f8c..4970862efde 100644 --- a/projects/hipcub/benchmark/benchmark_device_histogram.cpp +++ b/projects/hipcub/benchmark/benchmark_device_histogram.cpp @@ -54,7 +54,7 @@ std::vector std::default_random_engine gen(rd()); std::vector data(size); 
std::generate(data.begin(), - data.begin() + std::min(size, max_random_size), + data.begin() + _HIPCUB_STD::min(size, max_random_size), [&]() { // Reduce entropy by applying bitwise AND to random bits @@ -69,7 +69,7 @@ std::vector }); for(size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); + std::copy_n(data.begin(), _HIPCUB_STD::min(size - i, max_random_size), data.begin() + i); } return data; } diff --git a/projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp b/projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp index 0d9160f4c9c..7cdf1f0501a 100644 --- a/projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp +++ b/projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp @@ -58,7 +58,7 @@ void run_benchmark(benchmark::State& state, while(offset < size) { const size_t key_count = key_counts[unique_count % key_counts.size()]; - const size_t end = std::min(size, offset + key_count); + const size_t end = _HIPCUB_STD::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { keys_input[i] = unique_count; diff --git a/projects/hipcub/benchmark/benchmark_device_run_length_encode.cpp b/projects/hipcub/benchmark/benchmark_device_run_length_encode.cpp index f0c858528a5..e8a21bf497a 100644 --- a/projects/hipcub/benchmark/benchmark_device_run_length_encode.cpp +++ b/projects/hipcub/benchmark/benchmark_device_run_length_encode.cpp @@ -54,7 +54,7 @@ void run_encode_benchmark(benchmark::State& state, while(offset < size) { const size_t key_count = key_counts[runs_count % key_counts.size()]; - const size_t end = std::min(size, offset + key_count); + const size_t end = _HIPCUB_STD::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { input[i] = runs_count; @@ -157,7 +157,7 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, while(offset < size) { const size_t key_count = key_counts[runs_count % key_counts.size()]; - 
const size_t end = std::min(size, offset + key_count); + const size_t end = _HIPCUB_STD::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { input[i] = runs_count; diff --git a/projects/hipcub/benchmark/benchmark_utils.hpp b/projects/hipcub/benchmark/benchmark_utils.hpp index 489cf8de852..7d4085a4c41 100644 --- a/projects/hipcub/benchmark/benchmark_utils.hpp +++ b/projects/hipcub/benchmark/benchmark_utils.hpp @@ -59,11 +59,11 @@ inline auto std::uniform_int_distribution distribution(min, max); std::vector data(size); std::generate(data.begin(), - data.begin() + std::min(size, max_random_size), + data.begin() + _HIPCUB_STD::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); + std::copy_n(data.begin(), _HIPCUB_STD::min(size - i, max_random_size), data.begin() + i); } return data; } @@ -78,11 +78,11 @@ inline auto std::uniform_real_distribution distribution(min, max); std::vector data(size); std::generate(data.begin(), - data.begin() + std::min(size, max_random_size), + data.begin() + _HIPCUB_STD::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); + std::copy_n(data.begin(), _HIPCUB_STD::min(size - i, max_random_size), data.begin() + i); } return data; } @@ -96,11 +96,11 @@ inline std::vector std::bernoulli_distribution distribution(p); std::vector data(size); std::generate(data.begin(), - data.begin() + std::min(size, max_random_size), + data.begin() + _HIPCUB_STD::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); + std::copy_n(data.begin(), _HIPCUB_STD::min(size 
- i, max_random_size), data.begin() + i); } return data; } @@ -390,7 +390,8 @@ std::vector while(keys_start_index < size) { const size_t new_segment_length = segment_length_distribution(prng); - const size_t new_segment_end = std::min(size, keys_start_index + new_segment_length); + const size_t new_segment_end + = _HIPCUB_STD::min(size, keys_start_index + new_segment_length); const T key = key_distribution(prng); std::fill(std::next(keys.begin(), keys_start_index), std::next(keys.begin(), new_segment_end), @@ -433,10 +434,12 @@ inline auto generate_random_data_n( using dis_type = typename std::conditional<(sizeof(T) == 1), short, T>::type; std::uniform_int_distribution distribution((T)min, (T)max); - std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); + std::generate_n(it, + _HIPCUB_STD::min(size, max_random_size), + [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(it, std::min(size - i, max_random_size), it + i); + std::copy_n(it, _HIPCUB_STD::min(size - i, max_random_size), it + i); } return it + size; } @@ -453,10 +456,12 @@ inline auto generate_random_data_n(OutputIterator it, using T = typename std::iterator_traits::value_type; std::uniform_real_distribution distribution((T)min, (T)max); - std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); + std::generate_n(it, + _HIPCUB_STD::min(size, max_random_size), + [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(it, std::min(size - i, max_random_size), it + i); + std::copy_n(it, _HIPCUB_STD::min(size - i, max_random_size), it + i); } return it + size; } diff --git a/projects/hipcub/benchmark/common_benchmark_header.hpp b/projects/hipcub/benchmark/common_benchmark_header.hpp index a632840a815..c85b24cb0c2 100644 --- a/projects/hipcub/benchmark/common_benchmark_header.hpp +++ 
b/projects/hipcub/benchmark/common_benchmark_header.hpp @@ -22,7 +22,6 @@ #include #include -#include #include #include #include @@ -43,6 +42,10 @@ // HIP API #include +#include + +#include _HIPCUB_LIBCXX_INCLUDE(cmath) + // benchmark_utils.hpp should only be included by this header. // The following definition is used as guard in benchmark_utils.hpp // Including benchmark_utils.hpp by itself will cause a compile error. diff --git a/projects/hipcub/examples/device/example_device_partition_flagged.cpp b/projects/hipcub/examples/device/example_device_partition_flagged.cpp index f56cae1d6e8..35e4f529c86 100644 --- a/projects/hipcub/examples/device/example_device_partition_flagged.cpp +++ b/projects/hipcub/examples/device/example_device_partition_flagged.cpp @@ -82,10 +82,10 @@ void Initialize( unsigned short repeat; RandomBits(repeat); repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short)))); - repeat = std::max(1, repeat); + repeat = _HIPCUB_STD::max(1, repeat); int j = i; - while (j < std::min(i + repeat, num_items)) + while(j < _HIPCUB_STD::min(i + repeat, num_items)) { h_flags[j] = 0; h_in[j] = key; diff --git a/projects/hipcub/examples/device/example_device_partition_if.cpp b/projects/hipcub/examples/device/example_device_partition_if.cpp index 7516f9a40d7..f12c8188f37 100644 --- a/projects/hipcub/examples/device/example_device_partition_if.cpp +++ b/projects/hipcub/examples/device/example_device_partition_if.cpp @@ -93,10 +93,10 @@ void Initialize( unsigned short repeat; RandomBits(repeat); repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short)))); - repeat = std::max(1, repeat); + repeat = _HIPCUB_STD::max(1, repeat); int j = i; - while (j < std::min(i + repeat, num_items)) + while(j < _HIPCUB_STD::min(i + repeat, num_items)) { h_in[j] = key; j++; diff --git a/projects/hipcub/examples/device/example_device_select_flagged.cpp b/projects/hipcub/examples/device/example_device_select_flagged.cpp index 
75c39789dee..ba52705f01b 100644 --- a/projects/hipcub/examples/device/example_device_select_flagged.cpp +++ b/projects/hipcub/examples/device/example_device_select_flagged.cpp @@ -83,10 +83,10 @@ void Initialize( unsigned short repeat; RandomBits(repeat); repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short)))); - repeat = std::max(1, repeat); + repeat = _HIPCUB_STD::max(1, repeat); int j = i; - while (j < std::min(i + repeat, num_items)) + while(j < _HIPCUB_STD::min(i + repeat, num_items)) { h_flags[j] = 0; h_in[j] = key; diff --git a/projects/hipcub/examples/device/example_device_select_if.cpp b/projects/hipcub/examples/device/example_device_select_if.cpp index 40a81bc15bc..0493d3c0c47 100644 --- a/projects/hipcub/examples/device/example_device_select_if.cpp +++ b/projects/hipcub/examples/device/example_device_select_if.cpp @@ -93,10 +93,10 @@ void Initialize( unsigned short repeat; RandomBits(repeat); repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short)))); - repeat = std::max(1, repeat); + repeat = _HIPCUB_STD::max(1, repeat); int j = i; - while (j < std::max(i + repeat, num_items)) + while(j < _HIPCUB_STD::max(i + repeat, num_items)) { h_in[j] = key; j++; diff --git a/projects/hipcub/examples/device/example_device_select_unique.cpp b/projects/hipcub/examples/device/example_device_select_unique.cpp index d923b29cdb4..196850ed2f9 100644 --- a/projects/hipcub/examples/device/example_device_select_unique.cpp +++ b/projects/hipcub/examples/device/example_device_select_unique.cpp @@ -80,10 +80,10 @@ void Initialize( unsigned short repeat; RandomBits(repeat); repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short)))); - repeat = std::max(1, repeat); + repeat = _HIPCUB_STD::max(1, repeat); int j = i; - while (j < std::min(i + repeat, num_items)) + while(j < _HIPCUB_STD::min(i + repeat, num_items)) { h_in[j] = key; j++; diff --git a/projects/hipcub/examples/example_utils.hpp 
b/projects/hipcub/examples/example_utils.hpp index deedbc937ad..1248a854cab 100644 --- a/projects/hipcub/examples/example_utils.hpp +++ b/projects/hipcub/examples/example_utils.hpp @@ -38,6 +38,8 @@ #include #include +#include _HIPCUB_STD_INCLUDE(functional) + #define AssertEquals(a, b) if ((a) != (b)) { std::cerr << "\n(" << __FILE__ << ": " << __LINE__ << ")\n"; exit(1);} #define HIP_CHECK(condition) \ @@ -573,8 +575,8 @@ void RandomBits( int current_bit = j * WORD_BYTES * 8; unsigned int word = 0xffffffff; - word &= 0xffffffff << std::max(0, begin_bit - current_bit); - word &= 0xffffffff >> std::max(0, (current_bit + (WORD_BYTES * 8)) - end_bit); + word &= 0xffffffff << _HIPCUB_STD::max(0, begin_bit - current_bit); + word &= 0xffffffff >> _HIPCUB_STD::max(0, (current_bit + (WORD_BYTES * 8)) - end_bit); for (int i = 0; i <= entropy_reduction; i++) { diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp index 8311ec12c0b..6fd55cc767f 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp @@ -123,7 +123,7 @@ struct GridEvenShare HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH this->total_tiles = static_cast(hipcub::DivideAndRoundUp(num_items_, tile_items)); HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP - this->grid_size = min(total_tiles, max_grid_size); + this->grid_size = _HIPCUB_STD::min(total_tiles, max_grid_size); int avg_tiles_per_block = total_tiles / grid_size; // leftover grains go to big blocks: this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); @@ -154,7 +154,7 @@ struct GridEvenShare { // This thread block gets a normal share of grains (avg_tiles_per_block) block_offset = normal_base_offset + (block_id * normal_share_items); - block_end = min(num_items, block_offset + normal_share_items); + block_end = 
_HIPCUB_STD::min(num_items, block_offset + normal_share_items); } // Else default past-the-end } diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp index 0278f5a5cea..34243854161 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp @@ -44,16 +44,16 @@ BEGIN_HIPCUB_NAMESPACE #ifndef DOXYGEN_SHOULD_SKIP_THIS #define HIPCUB_PREVENT_MACRO_SUBSTITUTION template -constexpr __host__ __device__ -auto min HIPCUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) +HIPCUB_DEPRECATED_BECAUSE("Use hip::std::min from instead") +constexpr __host__ __device__ auto min HIPCUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) -> decltype(t < u ? std::forward(t) : std::forward(u)) { return t < u ? std::forward(t) : std::forward(u); } template -constexpr __host__ __device__ -auto max HIPCUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) +HIPCUB_DEPRECATED_BECAUSE("Use hip::std::max from instead") +constexpr __host__ __device__ auto max HIPCUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) -> decltype(t < u ? std::forward(u) : std::forward(t)) { return t < u ? std::forward(u) : std::forward(t); diff --git a/projects/hipcub/test/hipcub/common_test_header.hpp b/projects/hipcub/test/hipcub/common_test_header.hpp index d770db32d71..ec53dc785ea 100755 --- a/projects/hipcub/test/hipcub/common_test_header.hpp +++ b/projects/hipcub/test/hipcub/common_test_header.hpp @@ -22,7 +22,6 @@ #include #include -#include #include #include #include @@ -41,6 +40,10 @@ // HIP API #include +#include + +#include _HIPCUB_LIBCXX_INCLUDE(cmath) + // test_utils.hpp should only be included by this header. // The following definition is used as guard in test_utils.hpp // Including test_utils.hpp by itself will cause a compile error. 
diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp index 43ce5be89a4..f7c68035263 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp @@ -337,8 +337,8 @@ TYPED_TEST(HipcubBlockAdjacentDifferenceSubtract, SubtractLeft) ASSERT_NO_FATAL_FAILURE( test_utils::assert_near(output, expected, - std::max(test_utils::precision::value, - test_utils::precision::value))); + _HIPCUB_STD::max(test_utils::precision::value, + test_utils::precision::value))); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); @@ -458,8 +458,8 @@ TYPED_TEST(HipcubBlockAdjacentDifferenceSubtract, SubtractLeftPartialTile) ASSERT_NO_FATAL_FAILURE( test_utils::assert_near(output, expected, - std::max(test_utils::precision::value, - test_utils::precision::value))); + _HIPCUB_STD::max(test_utils::precision::value, + test_utils::precision::value))); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_tile_sizes)); @@ -561,8 +561,8 @@ TYPED_TEST(HipcubBlockAdjacentDifferenceSubtract, SubtractRight) ASSERT_NO_FATAL_FAILURE( test_utils::assert_near(output, expected, - std::max(test_utils::precision::value, - test_utils::precision::value))); + _HIPCUB_STD::max(test_utils::precision::value, + test_utils::precision::value))); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); @@ -682,7 +682,7 @@ TYPED_TEST(HipcubBlockAdjacentDifferenceSubtract, SubtractRightPartialTile) // clang-format off ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, is_add_op::value - ? std::max(test_utils::precision::value, test_utils::precision::value) + ? _HIPCUB_STD::max(test_utils::precision::value, test_utils::precision::value) : std::is_same::value ? 
0 : test_utils::precision::value)); diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp index 02250d54623..c0d9a00cb70 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp @@ -141,7 +141,8 @@ TYPED_TEST(HipcubBlockRunLengthDecodeTest, TestDecode) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); const LengthT max_run_length = static_cast( - std::min(1000ll, static_cast(std::numeric_limits::max()))); + _HIPCUB_STD::min(1000ll, + static_cast(_HIPCUB_STD::numeric_limits::max()))); size_t num_runs = runs_per_thread * block_size; auto run_items = test_utils::get_random_data(num_runs, diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_shuffle.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_shuffle.cpp index ea5b4376251..b81581552a9 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_shuffle.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_shuffle.cpp @@ -99,8 +99,8 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockOffset) { unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; - int distance - = rand() % std::min(size_t(10), block_size / 2) - std::min(size_t(10), block_size / 2); + int distance = rand() % _HIPCUB_STD::min(size_t(10), block_size / 2) + - _HIPCUB_STD::min(size_t(10), block_size / 2); SCOPED_TRACE(testing::Message() << "with seed= " << seed_value << " & distance = " << distance); // Generate data @@ -183,7 +183,7 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockRotate) { unsigned int seed_value = seed_index < random_seeds_count ? 
rand() : seeds[seed_index - random_seeds_count]; - int distance = rand() % std::min(size_t(5), block_size / 2); + int distance = rand() % _HIPCUB_STD::min(size_t(5), block_size / 2); SCOPED_TRACE(testing::Message() << "with seed= " << seed_value << " & distance = " << distance); // Generate data diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp index 1433f69e56a..2b6dd1d4d54 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp @@ -73,9 +73,11 @@ inline auto get_random_samples(size_t size, U min, U max, unsigned int seed_valu return test_utils::get_random_data( size, static_cast( - std::max(min1 - d / 10, static_cast(std::numeric_limits::lowest()))), + _HIPCUB_STD::max(min1 - d / 10, + static_cast(_HIPCUB_STD::numeric_limits::lowest()))), static_cast( - std::min(max1 + d / 10, static_cast(std::numeric_limits::max()))), + _HIPCUB_STD::min(max1 + d / 10, + static_cast(_HIPCUB_STD::numeric_limits::max()))), seed_value); } @@ -89,8 +91,11 @@ inline auto get_random_samples(size_t size, U min, U max, unsigned int seed_valu return test_utils::get_random_data( size, static_cast( - std::max(min1 - d / 10, static_cast(std::numeric_limits::lowest()))), - static_cast(std::min(max1 + d / 10, static_cast(std::numeric_limits::max()))), + _HIPCUB_STD::max(min1 - d / 10, + static_cast(_HIPCUB_STD::numeric_limits::lowest()))), + static_cast( + _HIPCUB_STD::min(max1 + d / 10, + static_cast(_HIPCUB_STD::numeric_limits::max()))), seed_value); } @@ -195,7 +200,7 @@ TYPED_TEST(HipcubDeviceHistogramEven, Even) const size_t row_stride = columns + std::get<2>(dim); const size_t row_stride_bytes = row_stride * sizeof(sample_type); - const size_t size = std::max(1, rows * row_stride); + const size_t size = _HIPCUB_STD::max(1, rows * row_stride); for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; 
seed_index++) { @@ -522,7 +527,7 @@ TYPED_TEST(HipcubDeviceHistogramRange, Range) const size_t row_stride = columns + std::get<2>(dim); const size_t row_stride_bytes = row_stride * sizeof(sample_type); - const size_t size = std::max(1, rows * row_stride); + const size_t size = _HIPCUB_STD::max(1, rows * row_stride); for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { @@ -787,7 +792,7 @@ TYPED_TEST(HipcubDeviceHistogramMultiEven, MultiEven) const size_t row_stride = columns * channels + std::get<2>(dim); const size_t row_stride_bytes = row_stride * sizeof(sample_type); - const size_t size = std::max(1, rows * row_stride); + const size_t size = _HIPCUB_STD::max(1, rows * row_stride); for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { @@ -1089,7 +1094,7 @@ TYPED_TEST(HipcubDeviceHistogramMultiRange, MultiRange) const size_t row_stride = columns * channels + std::get<2>(dim); const size_t row_stride_bytes = row_stride * sizeof(sample_type); - const size_t size = std::max(1, rows * row_stride); + const size_t size = _HIPCUB_STD::max(1, rows * row_stride); for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp index 33f86c43560..894cf3871af 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp @@ -136,7 +136,7 @@ TYPED_TEST(HipcubDeviceReduceByKey, ReduceByKey) const size_t key_count = key_count_dis(gen); current_key += key_delta_dis(gen); - const size_t end = std::min(size, offset + key_count); + const size_t end = _HIPCUB_STD::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { keys_input[i] = test_utils::convert_to_device(current_key); diff --git 
a/projects/hipcub/test/hipcub/test_hipcub_device_run_length_encode.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_run_length_encode.cpp index 797676b286c..d32210327b8 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_run_length_encode.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_run_length_encode.cpp @@ -123,7 +123,7 @@ TYPED_TEST(HipcubDeviceRunLengthEncode, Encode) size_t key_count = key_count_dis(gen); current_key += key_delta_dis(gen); - const size_t end = std::min(size, offset + key_count); + const size_t end = _HIPCUB_STD::min(size, offset + key_count); key_count = end - offset; for(size_t i = offset; i < end; i++) { @@ -296,7 +296,7 @@ TYPED_TEST(HipcubDeviceRunLengthEncode, NonTrivialRuns) } current_key += key_delta_dis(gen); - const size_t end = std::min(size, offset + key_count); + const size_t end = _HIPCUB_STD::min(size, offset + key_count); key_count = end - offset; for(size_t i = offset; i < end; i++) { @@ -321,12 +321,12 @@ TYPED_TEST(HipcubDeviceRunLengthEncode, NonTrivialRuns) offset_type* d_offsets_output; count_type* d_counts_output; count_type* d_runs_count_output; - HIP_CHECK(test_common_utils::hipMallocHelper(&d_offsets_output, - std::max(1, runs_count_expected) - * sizeof(offset_type))); - HIP_CHECK(test_common_utils::hipMallocHelper(&d_counts_output, - std::max(1, runs_count_expected) - * sizeof(count_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &d_offsets_output, + _HIPCUB_STD::max(1, runs_count_expected) * sizeof(offset_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &d_counts_output, + _HIPCUB_STD::max(1, runs_count_expected) * sizeof(count_type))); HIP_CHECK(test_common_utils::hipMallocHelper(&d_runs_count_output, sizeof(count_type))); size_t temporary_storage_bytes = 0; diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp index ee585fd590d..e601b00602c 100644 --- 
a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp @@ -105,7 +105,8 @@ std::vector while(keys_start_index < size) { const size_t new_segment_length = segment_length_distribution(prng); - const size_t new_segment_end = std::min(size, keys_start_index + new_segment_length); + const size_t new_segment_end + = _HIPCUB_STD::min(size, keys_start_index + new_segment_length); const T key = key_distribution(prng); std::fill(std::next(keys.begin(), keys_start_index), std::next(keys.begin(), new_segment_end), diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp index 3d5c02575b8..27b0069424c 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp @@ -125,7 +125,7 @@ TYPED_TEST(HipcubDeviceSegmentedReduceOp, Reduce) const size_t segment_length = segment_length_dis(gen); offsets.push_back(offset); - const size_t end = std::min(size, offset + segment_length); + const size_t end = _HIPCUB_STD::min(size, offset + segment_length); max_segment_length = std::max(max_segment_length, end - offset); result_type aggregate = init; @@ -324,7 +324,7 @@ TYPED_TEST(HipcubDeviceSegmentedReduce, Sum) const size_t segment_length = segment_length_dis(gen); offsets.push_back(offset); - const size_t end = std::min(size, offset + segment_length); + const size_t end = _HIPCUB_STD::min(size, offset + segment_length); max_segment_length = std::max(max_segment_length, end - offset); result_type aggregate = init; @@ -481,7 +481,7 @@ TYPED_TEST(HipcubDeviceSegmentedReduce, Min) const size_t segment_length = segment_length_dis(gen); offsets.push_back(offset); - const size_t end = std::min(size, offset + segment_length); + const size_t end = _HIPCUB_STD::min(size, offset + segment_length); max_segment_length = std::max(max_segment_length, end - 
offset); result_type aggregate = init; @@ -638,7 +638,7 @@ TYPED_TEST(HipcubDeviceSegmentedReduce, Max) const size_t segment_length = segment_length_dis(gen); offsets.push_back(offset); - const size_t end = std::min(size, offset + segment_length); + const size_t end = _HIPCUB_STD::min(size, offset + segment_length); max_segment_length = std::max(max_segment_length, end - offset); result_type aggregate = init; @@ -848,7 +848,7 @@ void test_argminmax(typename TestFixture::params::input_type empty_value) offsets.push_back(offset); Iterator x(&values_input[offset]); - const size_t end = std::min(size, offset + segment_length); + const size_t end = _HIPCUB_STD::min(size, offset + segment_length); max_segment_length = std::max(max_segment_length, end - offset); if(offset < end) { @@ -1186,7 +1186,7 @@ TEST(HipcubDeviceSegmentedReduceLargeIndicesTests, LargeIndices) const size_t segment_length = segment_length_dis(gen); offsets.push_back(offset); - const offset_type end = std::min(size, offset + segment_length); + const offset_type end = _HIPCUB_STD::min(size, offset + segment_length); output_type aggregate = init; aggregate = reduce_op(aggregate, gauss_sum(end) - gauss_sum(offset)); aggregates_expected.push_back(aggregate); diff --git a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp index babcd7041a9..7e2c48bba8a 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp @@ -511,7 +511,7 @@ TYPED_TEST(HipcubNCThreadOperatorsTests, ReduceBySegmentOp) pair_type init(0, 0); for(size_t offset = 0; offset < input_size; offset += segment_size) { - const size_t end = std::min(input_size, offset + segment_size); + const size_t end = _HIPCUB_STD::min(input_size, offset + segment_size); pair_type aggregate = init; for(size_t i = offset; i < end; ++i) { @@ -528,7 +528,7 @@ TYPED_TEST(HipcubNCThreadOperatorsTests, 
ReduceBySegmentOp) std::vector output{}; for(size_t offset = 0; offset < input_size; offset += segment_size) { - const size_t end = std::min(input_size, offset + segment_size); + const size_t end = _HIPCUB_STD::min(input_size, offset + segment_size); pair_type aggregate = init; for(size_t i = offset; i < end; ++i) { diff --git a/projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp b/projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp index c583c0b9e39..a68cb5a09fb 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp @@ -156,9 +156,9 @@ TYPED_TEST(HipcubUtilPtxTests, ShuffleUp) std::vector output(input.size()); auto src_offsets = test_utils::get_random_data( - std::max(1, logical_warp_size / 2), + _HIPCUB_STD::max(1, logical_warp_size / 2), 1U, - std::max(1, logical_warp_size - 1), + _HIPCUB_STD::max(1, logical_warp_size - 1), seed_value + seed_value_addition); T* device_data; @@ -267,9 +267,9 @@ TYPED_TEST(HipcubUtilPtxTests, ShuffleDown) std::vector output(input.size()); auto src_offsets = test_utils::get_random_data( - std::max(1, logical_warp_size / 2), + _HIPCUB_STD::max(1, logical_warp_size / 2), 1U, - std::max(1, logical_warp_size - 1), + _HIPCUB_STD::max(1, logical_warp_size - 1), seed_value + seed_value_addition); T* device_data; @@ -379,10 +379,11 @@ TYPED_TEST(HipcubUtilPtxTests, ShuffleIndex) seed_value); std::vector output(input.size()); - auto src_offsets = test_utils::get_random_data(hardware_warp_size / logical_warp_size, - 0, - std::max(1, logical_warp_size - 1), - seed_value + seed_value_addition); + auto src_offsets + = test_utils::get_random_data(hardware_warp_size / logical_warp_size, + 0, + _HIPCUB_STD::max(1, logical_warp_size - 1), + seed_value + seed_value_addition); // Calculate expected results on host std::vector expected(size, test_utils::convert_to_device(0)); @@ -481,9 +482,9 @@ TEST(HipcubUtilPtxTests, ShuffleUpCustomStruct) } auto src_offsets = 
test_utils::get_random_data( - std::max(1, logical_warp_size / 2), + _HIPCUB_STD::max(1, logical_warp_size / 2), 1U, - std::max(1, logical_warp_size - 1), + _HIPCUB_STD::max(1, logical_warp_size - 1), seed_value + seed_value_addition); T* device_data; @@ -592,9 +593,9 @@ TEST(HipcubUtilPtxTests, ShuffleUpCustomAlignedStruct) } auto src_offsets = test_utils::get_random_data( - std::max(1, logical_warp_size / 2), + _HIPCUB_STD::max(1, logical_warp_size / 2), 1U, - std::max(1, logical_warp_size - 1), + _HIPCUB_STD::max(1, logical_warp_size - 1), seed_value + seed_value_addition); T* device_data; diff --git a/projects/hipcub/test/hipcub/test_utils_assertions.hpp b/projects/hipcub/test/hipcub/test_utils_assertions.hpp index 08903f7ce5c..e95bf497f76 100644 --- a/projects/hipcub/test/hipcub/test_utils_assertions.hpp +++ b/projects/hipcub/test/hipcub/test_utils_assertions.hpp @@ -48,7 +48,7 @@ template inline void assert_eq(const std::vector& result, const std::vector& expected, const size_t max_length = SIZE_MAX) { if(max_length == SIZE_MAX || max_length > expected.size()) ASSERT_EQ(result.size(), expected.size()); - for(size_t i = 0; i < std::min(result.size(), max_length); i++) + for(size_t i = 0; i < _HIPCUB_STD::min(result.size(), max_length); i++) { if(bit_equal(result[i], expected[i])) continue; // Check to also regard equality of NaN's, -NaN, +inf, -inf as correct. @@ -74,7 +74,7 @@ inline void assert_eq(const std::vector& result, const std::vector& expect inline void assert_eq(const std::vector& result, const std::vector& expected, const size_t max_length = SIZE_MAX) { if(max_length == SIZE_MAX || max_length > expected.size()) ASSERT_EQ(result.size(), expected.size()); - for(size_t i = 0; i < std::min(result.size(), max_length); i++) + for(size_t i = 0; i < _HIPCUB_STD::min(result.size(), max_length); i++) { if(bit_equal(result[i], expected[i])) continue; // Check to also regard equality of NaN's, -NaN, +inf, -inf as correct. 
ASSERT_EQ(test_utils::native_half(result[i]), test_utils::native_half(expected[i])) << "where index = " << i; @@ -84,7 +84,7 @@ inline void assert_eq(const std::vector& result, const std::ve inline void assert_eq(const std::vector& result, const std::vector& expected, const size_t max_length = SIZE_MAX) { if(max_length == SIZE_MAX || max_length > expected.size()) ASSERT_EQ(result.size(), expected.size()); - for(size_t i = 0; i < std::min(result.size(), max_length); i++) + for(size_t i = 0; i < _HIPCUB_STD::min(result.size(), max_length); i++) { if(bit_equal(result[i], expected[i])) continue; // Check to also regard equality of NaN's, -NaN, +inf, -inf as correct. ASSERT_EQ(test_utils::native_bfloat16(result[i]), test_utils::native_bfloat16(expected[i])) << "where index = " << i; diff --git a/projects/hipcub/test/hipcub/test_utils_data_generation.hpp b/projects/hipcub/test/hipcub/test_utils_data_generation.hpp index 75f0010fe18..698adf5c32f 100644 --- a/projects/hipcub/test/hipcub/test_utils_data_generation.hpp +++ b/projects/hipcub/test/hipcub/test_utils_data_generation.hpp @@ -460,11 +460,11 @@ inline std::vector get_random_data01(size_t size, float p, int seed_value) std::bernoulli_distribution distribution(p); std::vector data(size); std::generate(data.begin(), - data.begin() + std::min(size, max_random_size), + data.begin() + _HIPCUB_STD::min(size, max_random_size), [&]() { return convert_to_device(distribution(gen)); }); for(size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); + std::copy_n(data.begin(), _HIPCUB_STD::min(size - i, max_random_size), data.begin() + i); } return data; } From 6e6b96db4134508419c2783a974ad8aee63f02c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Fri, 5 Dec 2025 17:07:08 +0000 Subject: [PATCH 11/95] Remove deprecated hipcub::min and hipcub::max --- projects/hipcub/CHANGELOG.md | 1 + 
.../backend/rocprim/grid/grid_even_share.hpp | 2 +- .../hipcub/backend/rocprim/util_macro.hpp | 20 ------------------- .../test_hipcub_block_adjacent_difference.cpp | 2 +- 4 files changed, 3 insertions(+), 22 deletions(-) diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index 2eb7f022e0b..498bf8214a1 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -20,6 +20,7 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project ### Removed +* Removed deprecated `hipcub::max` and `hipcub::min`. Use `hip::std::max` and `hip::std::min` instead. * Deprecated `hipcub::Swap`, use `rocprim::swap` instead. ### Changed diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp index 6fd55cc767f..a3432cd8e57 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp @@ -154,7 +154,7 @@ struct GridEvenShare { // This thread block gets a normal share of grains (avg_tiles_per_block) block_offset = normal_base_offset + (block_id * normal_share_items); - block_end = _HIPCUB_STD::min(num_items, block_offset + normal_share_items); + block_end = _HIPCUB_STD::min(num_items, block_offset + normal_share_items); } // Else default past-the-end } diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp index 34243854161..d0d2203b2b9 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp @@ -41,26 +41,6 @@ BEGIN_HIPCUB_NAMESPACE * @{ */ -#ifndef DOXYGEN_SHOULD_SKIP_THIS - #define HIPCUB_PREVENT_MACRO_SUBSTITUTION -template -HIPCUB_DEPRECATED_BECAUSE("Use hip::std::min from instead") -constexpr
__host__ __device__ auto min HIPCUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) - -> decltype(t < u ? std::forward(t) : std::forward(u)) -{ - return t < u ? std::forward(t) : std::forward(u); -} - -template -HIPCUB_DEPRECATED_BECAUSE("Use hip::std::max from instead") -constexpr __host__ __device__ auto max HIPCUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) - -> decltype(t < u ? std::forward(u) : std::forward(t)) -{ - return t < u ? std::forward(u) : std::forward(t); -} - #undef HIPCUB_PREVENT_MACRO_SUBSTITUTION -#endif - /// Deprecated since rocm [7.1] #ifndef HIPCUB_MAX /// Select maximum(a, b) diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp index f7c68035263..b4c0e1a7f7d 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp @@ -682,7 +682,7 @@ TYPED_TEST(HipcubBlockAdjacentDifferenceSubtract, SubtractRightPartialTile) // clang-format off ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, is_add_op::value - ? _HIPCUB_STD::max(test_utils::precision::value, test_utils::precision::value) + ? _HIPCUB_STD::max(test_utils::precision::value, test_utils::precision::value) : std::is_same::value ? 
0 : test_utils::precision::value)); From 17d1aec6240ebc68e2523d11734535e90ce1c226 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 30 Oct 2025 15:29:21 +0000 Subject: [PATCH 12/95] Replace rocprim counting iterator to thrust for nvcc compatibility --- projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp index 2b6dd1d4d54..7e4def5e4c9 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp @@ -33,6 +33,8 @@ #include #include +#include + // rows, columns, (row_stride - columns * Channels) std::vector> get_dims() { @@ -392,7 +394,7 @@ TYPED_TEST(HipcubDeviceHistogramEvenOverflow, EvenOverflow) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - auto d_input = rocprim::counting_iterator{0UL}; + auto d_input = thrust::counting_iterator(0); counter_type* d_histogram; HIP_CHECK(test_common_utils::hipMallocHelper(&d_histogram, bins * sizeof(counter_type))); From b3550fd4f095b9c71ff41075723fa13c64bb896e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 3 Nov 2025 13:28:46 +0000 Subject: [PATCH 13/95] Deprecate and replace `HIPCUB_IS_INT128_ENABLED` --- projects/hipcub/CHANGELOG.md | 1 + .../backend/rocprim/device/device_histogram.hpp | 2 +- .../include/hipcub/backend/rocprim/util_ptx.hpp | 2 +- .../include/hipcub/backend/rocprim/util_type.hpp | 12 ++++++++++-- .../test/hipcub/test_hipcub_block_radix_sort.cpp | 2 +- .../test/hipcub/test_hipcub_device_radix_sort.cpp.in | 2 +- .../hipcub/test/hipcub/test_utils_assertions.hpp | 4 ++-- .../test/hipcub/test_utils_data_generation.hpp | 4 ++-- 8 files changed, 19 insertions(+), 10 deletions(-) diff --git a/projects/hipcub/CHANGELOG.md 
b/projects/hipcub/CHANGELOG.md index 498bf8214a1..b886f4d280a 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -22,6 +22,7 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project * Removed deprecated `hipcub:max` and `hipcub:min`. Use `hip::std::max` and `hip::std::min` instead. * Deprecated `hipcub::Swap`, use `rocprim::swap` instead. +* Deprecated `HIPCUB_IS_INT128_ENABLED`, use `_CCCL_HAS_INT128()` instead. ### Changed diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp index 0dd5d978cb3..e1422c0f2a6 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp @@ -70,7 +70,7 @@ struct int_arithmetic_t using type = ::std::conditional_t< sizeof(SampleT) + sizeof(CommonT) <= sizeof(uint32_t), uint32_t, -#if HIPCUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() ::std::conditional_t<(::std::is_same::value || ::std::is_same::value), CommonT, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp index 3df30881bcd..8582d4376b7 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp @@ -225,7 +225,7 @@ HIPCUB_FORCEINLINE unsigned int return detail::unsigned_bit_extract(source, bit_start, num_bits); } -#if HIPCUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() /** * Bitfield-extract for 128-bit types. 
*/ diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp index 7f203cd8e70..dcb2456deb8 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp @@ -53,8 +53,16 @@ using NullType = ::rocprim::empty_type; #endif +// This API needs to be deprecated once libhipcxx is available. +#if defined(__SIZEOF_INT128__) + #define _CCCL_HAS_INT128() 1 +#else + #define _CCCL_HAS_INT128() 0 +#endif + #ifndef HIPCUB_IS_INT128_ENABLED - #define HIPCUB_IS_INT128_ENABLED 1 + // Deprecated [Since 4.2] + #define HIPCUB_IS_INT128_ENABLED _CCCL_HAS_INT128() #endif // !defined(HIPCUB_IS_INT128_ENABLED) template struct @@ -746,7 +754,7 @@ template <> struct NumericTraits : BaseTraits struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; - #if HIPCUB_IS_INT128_ENABLED + #if _CCCL_HAS_INT128() template<> struct NumericTraits<__uint128_t> { diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp index f5abe06154b..0388c295793 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp @@ -60,7 +60,7 @@ class HipcubBlockRadixSort : public ::testing::Test using Params = ::testing::Types< // Power of 2 BlockSize -#if HIPCUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() params<__int128_t, __int128_t, 64U, 1>, params<__uint128_t, __uint128_t, 64U, 1>, #endif diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.cpp.in b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.cpp.in index 94daf870e63..7cac5df9e1e 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.cpp.in +++ b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.cpp.in @@ -50,7 +50,7 @@ #endif #if 
HIPCUB_TEST_TYPE_SLICE == 0 -#if HIPCUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() INSTANTIATE(params<__uint128_t, __uint128_t, true >) INSTANTIATE(params<__int128_t, __int128_t, true >) #endif diff --git a/projects/hipcub/test/hipcub/test_utils_assertions.hpp b/projects/hipcub/test/hipcub/test_utils_assertions.hpp index e95bf497f76..7e6b5e31b6c 100644 --- a/projects/hipcub/test/hipcub/test_utils_assertions.hpp +++ b/projects/hipcub/test/hipcub/test_utils_assertions.hpp @@ -254,7 +254,7 @@ inline void assert_bit_eq(const std::vector& result, const std::vector& ex } } -#if HIPCUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() inline void assert_bit_eq(const std::vector<__int128_t>& result, const std::vector<__int128_t>& expected) { @@ -328,7 +328,7 @@ inline void assert_bit_eq(const std::vector<__uint128_t>& result, } } } -#endif //HIPCUB_IS_INT128_ENABLED +#endif //_CCCL_HAS_INT128() /// Compile-time assertion for type equality of two objects. template diff --git a/projects/hipcub/test/hipcub/test_utils_data_generation.hpp b/projects/hipcub/test/hipcub/test_utils_data_generation.hpp index 698adf5c32f..8edfebb34d4 100644 --- a/projects/hipcub/test/hipcub/test_utils_data_generation.hpp +++ b/projects/hipcub/test/hipcub/test_utils_data_generation.hpp @@ -132,7 +132,7 @@ class numeric_limits : public std::numeric_limits }; // End of extended numeric_limits -#if HIPCUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template using is_int128 = std::is_same<__int128_t, typename std::remove_cv::type>; template @@ -142,7 +142,7 @@ template using is_int128 = std::false_type; template using is_uint128 = std::false_type; -#endif // HIPCUB_IS_INT128_ENABLED +#endif // _CCCL_HAS_INT128() template using is_half = std::is_same::type>; From 7b1b7f6851c832f57974178a4f1e57165932c5e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 3 Nov 2025 13:40:22 +0000 Subject: [PATCH 14/95] Adds support for large num items to ``DeviceMerge`` --- projects/hipcub/CHANGELOG.md | 2 
+- .../hipcub/backend/cub/device/device_merge.hpp | 11 +++++++---- .../backend/cub/device/device_segmented_sort.hpp | 10 +--------- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index b886f4d280a..9afcd2a460c 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -28,7 +28,7 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project * Updated the documentation on how to run hipCUB tests on multiple GPUs in parallel. * Changed `CCCL_MINIMUM_VERSION` to `3.0.0` to align with CUB. -* Add support for large num_items `DeviceSegmentedSort`. +* Add support for large num_items `DeviceMerge` and `DeviceSegmentedSort`. ### Removed diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge.hpp index f314f5a128e..1489266704a 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge.hpp @@ -33,6 +33,9 @@ #include // IWYU pragma: export +#include // IWYU pragma: export +using ::cuda::std::int64_t; + BEGIN_HIPCUB_NAMESPACE struct DeviceMerge @@ -46,9 +49,9 @@ struct DeviceMerge static hipError_t MergeKeys(void* d_temp_storage, std::size_t& temp_storage_bytes, KeyIteratorIn1 keys_in1, - int num_keys1, + int64_t num_keys1, KeyIteratorIn2 keys_in2, - int num_keys2, + int64_t num_keys2, KeyIteratorOut keys_out, CompareOp compare_op = {}, hipStream_t stream = 0) @@ -77,10 +80,10 @@ struct DeviceMerge std::size_t& temp_storage_bytes, KeyIteratorIn1 keys_in1, ValueIteratorIn1 values_in1, - int num_keys1, + int64_t num_keys1, KeyIteratorIn2 keys_in2, ValueIteratorIn2 values_in2, - int num_keys2, + int64_t num_keys2, KeyIteratorOut keys_out, ValueIteratorOut values_out, CompareOp compare_op = {}, diff --git 
a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp index a5ece248fcb..718788bda0e 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp @@ -35,16 +35,8 @@ #include // IWYU pragma: export -#ifdef __HIP_PLATFORM_AMD__ - #include // IWYU pragma: export -using ::hip::std::int64_t; -#elif defined(__HIP_PLATFORM_NVIDIA__) - #include // IWYU pragma: export +#include // IWYU pragma: export using ::cuda::std::int64_t; -#else - #include -using ::std::int64_t; -#endif BEGIN_HIPCUB_NAMESPACE From db132890cf9f242bd78000080597e2a73bb0d46c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 3 Nov 2025 14:08:26 +0000 Subject: [PATCH 15/95] Remove ``DeviceSpmv`` --- projects/hipcub/CHANGELOG.md | 3 +- projects/hipcub/benchmark/CMakeLists.txt | 1 - .../benchmark/benchmark_device_spmv.cpp | 269 ---- .../hipcub/backend/cub/device/device_spmv.hpp | 108 -- .../include/hipcub/backend/cub/hipcub.hpp | 1 - .../backend/rocprim/device/device_spmv.hpp | 196 --- .../include/hipcub/backend/rocprim/hipcub.hpp | 1 - .../include/hipcub/device/device_spmv.hpp | 39 - projects/hipcub/rtest.xml | 2 +- projects/hipcub/test/hipcub/CMakeLists.txt | 1 - .../hipcub/experimental/sparse_matrix.hpp | 1240 ----------------- .../test/hipcub/test_hipcub_device_spmv.cpp | 292 ---- 12 files changed, 3 insertions(+), 2150 deletions(-) delete mode 100644 projects/hipcub/benchmark/benchmark_device_spmv.cpp delete mode 100644 projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_spmv.hpp delete mode 100644 projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_spmv.hpp delete mode 100644 projects/hipcub/hipcub/include/hipcub/device/device_spmv.hpp delete mode 100644 projects/hipcub/test/hipcub/experimental/sparse_matrix.hpp 
delete mode 100644 projects/hipcub/test/hipcub/test_hipcub_device_spmv.cpp diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index 9afcd2a460c..565d6a466c3 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -20,7 +20,8 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project ### Removed -* Removed deprecated `hipcub:max` and `hipcub:min`. Use `hip::std::max` and `hip::std::min` instead. +* Removed `DeviceSpmv`, which was deprecated from CUB after CCCL's 2.8.0 release. Use `hipSPARSE` or `rocSPARSE` libraries instead. +* Removed `hipcub:max` and `hipcub:min`, which were deprecated. Use `hip::std::max` and `hip::std::min` instead. * Deprecated `hipcub::Swap`, use `rocprim::swap` instead. * Deprecated `HIPCUB_IS_INT128_ENABLED`, use `_CCCL_HAS_INT128()` instead. diff --git a/projects/hipcub/benchmark/CMakeLists.txt b/projects/hipcub/benchmark/CMakeLists.txt index 3e8f663d1a1..cf109273bed 100644 --- a/projects/hipcub/benchmark/CMakeLists.txt +++ b/projects/hipcub/benchmark/CMakeLists.txt @@ -100,7 +100,6 @@ add_hipcub_benchmark(benchmark_device_segmented_sort.cpp) add_hipcub_benchmark(benchmark_device_segmented_radix_sort.cpp) add_hipcub_benchmark(benchmark_device_segmented_reduce.cpp) add_hipcub_benchmark(benchmark_device_select.cpp) -add_hipcub_benchmark(benchmark_device_spmv.cpp) add_hipcub_benchmark(benchmark_warp_exchange.cpp) add_hipcub_benchmark(benchmark_warp_load.cpp) add_hipcub_benchmark(benchmark_warp_reduce.cpp) diff --git a/projects/hipcub/benchmark/benchmark_device_spmv.cpp b/projects/hipcub/benchmark/benchmark_device_spmv.cpp deleted file mode 100644 index fcdb1ab9be9..00000000000 --- a/projects/hipcub/benchmark/benchmark_device_spmv.cpp +++ /dev/null @@ -1,269 +0,0 @@ -// MIT License -// -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
- -#include "common_benchmark_header.hpp" - -// HIP API -#include - -#ifndef DEFAULT_N -const size_t DEFAULT_N = 1024 * 32; -#endif - -const unsigned int batch_size = 10; -const unsigned int warmup_size = 5; - -template -void run_benchmark(benchmark::State& state, - size_t size, - const hipStream_t stream, - float probability) -{ - const T rand_min = T(1); - const T rand_max = T(10); - - // generate a lexicograhically sorted list of (row, column) index tuples - // number of nonzeroes cannot be guaranteed as duplicates may exist - const int num_nonzeroes_attempt = static_cast( - std::min(static_cast(INT_MAX), - static_cast(probability * static_cast(size * size)))); - std::vector> indices(num_nonzeroes_attempt); - { - std::vector flat_indices - = benchmark_utils::get_random_data(2 * num_nonzeroes_attempt, - 0, - size - 1, - 2 * num_nonzeroes_attempt); - for(int i = 0; i < num_nonzeroes_attempt; i++) - { - indices[i] = std::make_pair(flat_indices[2 * i], flat_indices[2 * i + 1]); - } - std::sort(indices.begin(), indices.end()); - } - - // generate the compressed sparse rows matrix - std::pair prev_cell = std::make_pair(-1, -1); - int num_nonzeroes = 0; - std::vector row_offsets(size + 1); - // this vector might be too large, but doing the allocation now eliminates a - // scan - std::vector column_indices(num_nonzeroes_attempt); - row_offsets[0] = 0; - int last_row_written = 0; - for(int i = 0; i < num_nonzeroes_attempt; i++) - { - if(indices[i] != prev_cell) - { - // update the row offets if we go to the next row (or skip some) - if(indices[i].first != last_row_written) - { - for(int j = last_row_written + 1; j <= indices[i].first; j++) - { - row_offsets[j] = num_nonzeroes; - } - last_row_written = indices[i].first; - } - - column_indices[num_nonzeroes++] = indices[i].second; - - prev_cell = indices[i]; - } - } - // fill in the entries for any missing rows - for(int j = last_row_written + 1; j < static_cast(size) + 1; j++) - { - row_offsets[j] = num_nonzeroes; - } - - 
// generate the random data once the actual number of nonzeroes are known - std::vector values = benchmark_utils::get_random_data(num_nonzeroes, rand_min, rand_max); - - std::vector vector_x = benchmark_utils::get_random_data(size, rand_min, rand_max); - - T* d_values; - int* d_row_offsets; - int* d_column_indices; - T* d_vector_x; - T* d_vector_y; - HIP_CHECK(hipMalloc(&d_values, values.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_row_offsets, row_offsets.size() * sizeof(int))); - HIP_CHECK(hipMalloc(&d_column_indices, num_nonzeroes * sizeof(int))); - HIP_CHECK(hipMalloc(&d_vector_x, vector_x.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_vector_y, size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_values, values.data(), values.size() * sizeof(T), hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_row_offsets, - row_offsets.data(), - row_offsets.size() * sizeof(int), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_column_indices, - column_indices.data(), - num_nonzeroes * sizeof(int), - hipMemcpyHostToDevice)); - HIP_CHECK( - hipMemcpy(d_vector_x, vector_x.data(), vector_x.size() * sizeof(T), hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - // Allocate temporary storage memory - size_t temp_storage_size_bytes; - - // Get size of d_temp_storage - HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH - HIP_CHECK(hipcub::DeviceSpmv::CsrMV(nullptr, - temp_storage_size_bytes, - d_values, - d_row_offsets, - d_column_indices, - d_vector_x, - d_vector_y, - size, - size, - num_nonzeroes, - stream)); - HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP - HIP_CHECK(hipDeviceSynchronize()); - - // allocate temporary storage - void* d_temp_storage = nullptr; - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH - HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage, - temp_storage_size_bytes, - d_values, - d_row_offsets, - d_column_indices, - 
d_vector_x, - d_vector_y, - size, - size, - num_nonzeroes, - stream)); - HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH - } - HIP_CHECK(hipDeviceSynchronize()); - - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - for(size_t i = 0; i < batch_size; i++) - { - HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH - HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage, - temp_storage_size_bytes, - d_values, - d_row_offsets, - d_column_indices, - d_vector_x, - d_vector_y, - size, - size, - num_nonzeroes, - stream)); - HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds - = std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * (num_nonzeroes + size) * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * (num_nonzeroes + size)); - - HIP_CHECK(hipFree(d_temp_storage)); - HIP_CHECK(hipFree(d_vector_y)); - HIP_CHECK(hipFree(d_vector_x)); - HIP_CHECK(hipFree(d_column_indices)); - HIP_CHECK(hipFree(d_row_offsets)); - HIP_CHECK(hipFree(d_values)); - HIP_CHECK(hipDeviceSynchronize()); -} - -#define CREATE_BENCHMARK(T, p) \ - benchmark::RegisterBenchmark( \ - std::string("device_spmv_CsrMV.").c_str(), \ - &run_benchmark, \ - size, \ - stream, \ - p) - -#define BENCHMARK_TYPE(type) \ - CREATE_BENCHMARK(type, 1.0e-6f), CREATE_BENCHMARK(type, 1.0e-5f), \ - CREATE_BENCHMARK(type, 1.0e-4f), CREATE_BENCHMARK(type, 1.0e-3f), \ - CREATE_BENCHMARK(type, 1.0e-2f) - -int main(int argc, char* argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - // HIP 
- hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - - std::cout << "benchmark_device_spmv" << std::endl; - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks = { - BENCHMARK_TYPE(int), - BENCHMARK_TYPE(unsigned int), - BENCHMARK_TYPE(float), - BENCHMARK_TYPE(double), - }; - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } - } - - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - - return 0; -} diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_spmv.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_spmv.hpp deleted file mode 100644 index 336e6185452..00000000000 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_spmv.hpp +++ /dev/null @@ -1,108 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -#ifndef HIPCUB_CUB_DEVICE_DEVICE_SPMV_HPP_ -#define HIPCUB_CUB_DEVICE_DEVICE_SPMV_HPP_ - -#include "../../../config.hpp" -#include "../../../util_deprecated.hpp" - -#include // IWYU pragma: export -#include // IWYU pragma: export - -BEGIN_HIPCUB_NAMESPACE - -class HIPCUB_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DeviceSpmv -{ - -public: - template ///< Signed integer type for sequence offsets - struct HIPCUB_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") SpmvParams - { - ValueT* - d_values; ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. - OffsetT* - d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values - OffsetT* - d_column_indices; ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. 
(Indices are zero-valued.) - ValueT* - d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector x - ValueT* - d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector y - int num_rows; ///< Number of rows of matrix A. - int num_cols; ///< Number of columns of matrix A. - int num_nonzeros; ///< Number of nonzero elements of matrix A. - ValueT alpha; ///< Alpha multiplicand - ValueT beta; ///< Beta addend-multiplicand - - ::cub::TexObjInputIterator t_vector_x; - }; - - template - HIPCUB_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") - HIPCUB_RUNTIME_FUNCTION static hipError_t CsrMV(void* d_temp_storage, - size_t& temp_storage_bytes, - ValueT* d_values, - int* d_row_offsets, - int* d_column_indices, - ValueT* d_vector_x, - ValueT* d_vector_y, - int num_rows, - int num_cols, - int num_nonzeros, - hipStream_t stream = 0) - { - _CCCL_SUPPRESS_DEPRECATED_PUSH - ::cub::SpmvParams spmv_params; - _CCCL_SUPPRESS_DEPRECATED_POP - spmv_params.d_values = d_values; - spmv_params.d_row_end_offsets = d_row_offsets + 1; - spmv_params.d_column_indices = d_column_indices; - spmv_params.d_vector_x = d_vector_x; - spmv_params.d_vector_y = d_vector_y; - spmv_params.num_rows = num_rows; - spmv_params.num_cols = num_cols; - spmv_params.num_nonzeros = num_nonzeros; - spmv_params.alpha = 1.0; - spmv_params.beta = 0.0; - - _CCCL_SUPPRESS_DEPRECATED_PUSH - return static_cast( - ::cub::DispatchSpmv::Dispatch(d_temp_storage, - temp_storage_bytes, - spmv_params, - stream)); - _CCCL_SUPPRESS_DEPRECATED_POP - } -}; - -END_HIPCUB_NAMESPACE - -#endif // HIPCUB_CUB_DEVICE_DEVICE_SELECT_HPP_ diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/hipcub.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/hipcub.hpp index 2f8157c7bb4..3a35a7a63cf 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/hipcub.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/hipcub.hpp @@ -67,7 +67,6 
@@ #include "device/device_segmented_reduce.hpp" #include "device/device_segmented_sort.hpp" #include "device/device_select.hpp" -#include "device/device_spmv.hpp" #include "device/device_transform.hpp" // Grid diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_spmv.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_spmv.hpp deleted file mode 100644 index e5c75322dfb..00000000000 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_spmv.hpp +++ /dev/null @@ -1,196 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SPMV_HPP_ -#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SPMV_HPP_ - -#include "../../../config.hpp" -#include "../../../util_deprecated.hpp" - -#include "../iterator/tex_obj_input_iterator.hpp" - -#include "../util_sync.hpp" - -#include - -BEGIN_HIPCUB_NAMESPACE - -class HIPCUB_DEPRECATED_BECAUSE("Use the hipSPARSE library instead") DeviceSpmv -{ - -public: - template ///< Signed integer type for sequence offsets - struct HIPCUB_DEPRECATED_BECAUSE("Use the rocSPARSE library instead") SpmvParams - { - ValueT* - d_values; ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. - OffsetT* - d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values - OffsetT* - d_column_indices; ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) - ValueT* - d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector x - ValueT* - d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector y - int num_rows; ///< Number of rows of matrix A. - int num_cols; ///< Number of columns of matrix A. - int num_nonzeros; ///< Number of nonzero elements of matrix A. 
- ValueT alpha; ///< Alpha multiplicand - ValueT beta; ///< Beta addend-multiplicand - - ::hipcub::TexObjInputIterator t_vector_x; - }; - -static constexpr uint32_t CsrMVKernel_MaxThreads = 256; - -template -static __global__ void -CsrMVKernel(SpmvParams spmv_params) -{ - __shared__ ValueT partial; - - const int32_t row_id = blockIdx.x; - - if(threadIdx.x == 0) - { - partial = spmv_params.beta * spmv_params.d_vector_y[row_id]; - } - __syncthreads(); - - int32_t row_offset = (row_id == 0) ? (0) : (spmv_params.d_row_end_offsets[row_id - 1]); - for(uint32_t thread_offset = 0; thread_offset < spmv_params.num_cols / blockDim.x; - thread_offset++) - { - int32_t offset = row_offset + thread_offset * blockDim.x + threadIdx.x; - - if(offset < spmv_params.d_row_end_offsets[row_id]) - { - ValueT t_value = - spmv_params.alpha * - spmv_params.d_values[offset] * - spmv_params.d_vector_x[spmv_params.d_column_indices[offset]]; - - atomicAdd(&partial, t_value); - - __syncthreads(); - - if(threadIdx.x == 0) - { - spmv_params.d_vector_y[row_id] = partial; - } - } - } -} - -template -HIPCUB_DEPRECATED_BECAUSE("Use the rocSPARSE library instead") -HIPCUB_RUNTIME_FUNCTION static hipError_t CsrMV(void* d_temp_storage, - size_t& temp_storage_bytes, - ValueT* d_values, - int* d_row_offsets, - int* d_column_indices, - ValueT* d_vector_x, - ValueT* d_vector_y, - int num_rows, - int num_cols, - int num_nonzeros, - hipStream_t stream = 0) -{ - HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH - SpmvParams spmv_params; - HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP - spmv_params.d_values = d_values; - spmv_params.d_row_end_offsets = d_row_offsets + 1; - spmv_params.d_column_indices = d_column_indices; - spmv_params.d_vector_x = d_vector_x; - spmv_params.d_vector_y = d_vector_y; - spmv_params.num_rows = num_rows; - spmv_params.num_cols = num_cols; - spmv_params.num_nonzeros = num_nonzeros; - spmv_params.alpha = 1.0; - spmv_params.beta = 0.0; - - if(d_temp_storage == nullptr) - { - // Make sure user won't try to 
allocate 0 bytes memory, because - // hipMalloc will return nullptr when size is zero. - temp_storage_bytes = 4; - return hipError_t(0); - } else - { - size_t block_size = min(num_cols, static_cast(DeviceSpmv::CsrMVKernel_MaxThreads)); - size_t grid_size = num_rows; - - std::chrono::high_resolution_clock::time_point start; - if HIPCUB_IF_CONSTEXPR(HIPCUB_DETAIL_DEBUG_SYNC_VALUE) - { - start = std::chrono::high_resolution_clock::now(); - } - HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH - CsrMVKernel<<>>(spmv_params); - HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH - HIPCUB_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("CsrMV", block_size * grid_size, start); - } - return hipSuccess; -} - -template -HIPCUB_DEPRECATED_BECAUSE("Use the rocSPARSE library instead") -HIPCUB_RUNTIME_FUNCTION static hipError_t CsrMV(void* d_temp_storage, - size_t& temp_storage_bytes, - ValueT* d_values, - int* d_row_offsets, - int* d_column_indices, - ValueT* d_vector_x, - ValueT* d_vector_y, - int num_rows, - int num_cols, - int num_nonzeros, - hipStream_t stream, - bool /*debug_synchronous*/) -{ - return CsrMV(d_temp_storage, - temp_storage_bytes, - d_values, - d_row_offsets, - d_column_indices, - d_vector_x, - d_vector_y, - num_rows, - num_cols, - num_nonzeros, - stream); -} -}; - -END_HIPCUB_NAMESPACE - -#endif // HIPCUB_CUB_DEVICE_DEVICE_SELECT_HPP_ - diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp index ad8e1131179..21ae7ff17c4 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp @@ -66,7 +66,6 @@ #include "device/device_segmented_reduce.hpp" #include "device/device_segmented_sort.hpp" #include "device/device_select.hpp" -#include "device/device_spmv.hpp" #include "device/device_transform.hpp" // Grid diff --git a/projects/hipcub/hipcub/include/hipcub/device/device_spmv.hpp 
b/projects/hipcub/hipcub/include/hipcub/device/device_spmv.hpp deleted file mode 100644 index b32fc2811d3..00000000000 --- a/projects/hipcub/hipcub/include/hipcub/device/device_spmv.hpp +++ /dev/null @@ -1,39 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2020-2025, Advanced Micro Devices, Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -#ifndef HIPCUB_DEVICE_DEVICE_SPMV_HPP_ -#define HIPCUB_DEVICE_DEVICE_SPMV_HPP_ - -#ifdef __HIP_PLATFORM_AMD__ - #include "../backend/rocprim/device/device_spmv.hpp" // IWYU pragma: export -#elif defined(__HIP_PLATFORM_NVIDIA__) - #include "../backend/cub/device/device_spmv.hpp" // IWYU pragma: export -#endif - -#endif // HIPCUB_DEVICE_DEVICE_SELECT_HPP_ diff --git a/projects/hipcub/rtest.xml b/projects/hipcub/rtest.xml index 427e10edc96..7156f0dcbd3 100644 --- a/projects/hipcub/rtest.xml +++ b/projects/hipcub/rtest.xml @@ -5,7 +5,7 @@ - + {CTEST_FILTER} {CTEST_REGEX} diff --git a/projects/hipcub/test/hipcub/CMakeLists.txt b/projects/hipcub/test/hipcub/CMakeLists.txt index f62d3d85d22..0bbe882c3b2 100644 --- a/projects/hipcub/test/hipcub/CMakeLists.txt +++ b/projects/hipcub/test/hipcub/CMakeLists.txt @@ -257,7 +257,6 @@ add_hipcub_test_parallel("hipcub.DeviceSegmentedRadixSort" test_hipcub_device_se add_hipcub_test("hipcub.DeviceSegmentedReduce" test_hipcub_device_segmented_reduce.cpp) add_hipcub_test_parallel("hipcub.DeviceSegmentedSort" test_hipcub_device_segmented_sort.cpp.in) add_hipcub_test("hipcub.DeviceSelect" test_hipcub_device_select.cpp) -add_hipcub_test("hipcub.DeviceSpmv" test_hipcub_device_spmv.cpp) add_hipcub_test("hipcub.DeviceTransform" test_hipcub_device_transform.cpp) add_hipcub_test("hipcub.DevicePartition" test_hipcub_device_partition.cpp) 
add_hipcub_test("hipcub.Grid" test_hipcub_grid.cpp) diff --git a/projects/hipcub/test/hipcub/experimental/sparse_matrix.hpp b/projects/hipcub/test/hipcub/experimental/sparse_matrix.hpp deleted file mode 100644 index 941c21cae52..00000000000 --- a/projects/hipcub/test/hipcub/experimental/sparse_matrix.hpp +++ /dev/null @@ -1,1240 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/****************************************************************************** - * Matrix data structures and parsing logic - ******************************************************************************/ - -#pragma once - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CUB_MKL - #include - #include -#endif - -using namespace std; - -/****************************************************************************** - * COO matrix type - ******************************************************************************/ - -struct GraphStats -{ - int num_rows; - int num_cols; - int num_nonzeros; - - double diag_dist_mean; // mean - double diag_dist_std_dev; // sample std dev - double pearson_r; // coefficient of variation - - double row_length_mean; // mean - double row_length_std_dev; // sample std_dev - double row_length_variation; // coefficient of variation - double row_length_skewness; // skewness - - void Display(bool show_labels = true) - { - if (show_labels) - printf("\n" - "\t num_rows: %d\n" - "\t num_cols: %d\n" - "\t num_nonzeros: %d\n" - "\t diag_dist_mean: %.2f\n" - "\t diag_dist_std_dev: %.2f\n" - "\t pearson_r: %f\n" - "\t row_length_mean: %.5f\n" - "\t row_length_std_dev: %.5f\n" - "\t row_length_variation: %.5f\n" - "\t row_length_skewness: %.5f\n", - num_rows, - num_cols, - num_nonzeros, - 
diag_dist_mean, - diag_dist_std_dev, - pearson_r, - row_length_mean, - row_length_std_dev, - row_length_variation, - row_length_skewness); - else - printf( - "%d, " - "%d, " - "%d, " - "%.2f, " - "%.2f, " - "%f, " - "%.5f, " - "%.5f, " - "%.5f, " - "%.5f, ", - num_rows, - num_cols, - num_nonzeros, - diag_dist_mean, - diag_dist_std_dev, - pearson_r, - row_length_mean, - row_length_std_dev, - row_length_variation, - row_length_skewness); - } -}; - - - -/****************************************************************************** - * COO matrix type - ******************************************************************************/ - - -/** - * COO matrix type. A COO matrix is just a vector of edge tuples. Tuples are sorted - * first by row, then by column. - */ -template -struct CooMatrix -{ - //--------------------------------------------------------------------- - // Type definitions and constants - //--------------------------------------------------------------------- - - // COO edge tuple - struct CooTuple - { - OffsetT row; - OffsetT col; - ValueT val; - - CooTuple() : row(OffsetT()), col(OffsetT()), val(ValueT()) {} - CooTuple(OffsetT row, OffsetT col) : row(row), col(col) {} - CooTuple(OffsetT row, OffsetT col, ValueT val) : row(row), col(col), val(val) {} - - /** - * Comparator for sorting COO sparse format num_nonzeros - */ - bool operator<(const CooTuple &other) const - { - if ((row < other.row) || ((row == other.row) && (col < other.col))) - { - return true; - } - - return false; - } - }; - - - //--------------------------------------------------------------------- - // Data members - //--------------------------------------------------------------------- - - // Fields - int num_rows; - int num_cols; - int num_nonzeros; - CooTuple* coo_tuples; - - //--------------------------------------------------------------------- - // Methods - //--------------------------------------------------------------------- - - // Constructor - CooMatrix() : num_rows(0), 
num_cols(0), num_nonzeros(0), coo_tuples(nullptr) {} - - /** - * Clear - */ - void Clear() - { - if (coo_tuples) delete[] coo_tuples; - coo_tuples = nullptr; - } - - - // Destructor - ~CooMatrix() - { - Clear(); - } - - - // Display matrix to stdout - void Display() - { - cout << "COO Matrix (" << num_rows << " rows, " << num_cols << " columns, " << num_nonzeros << " non-zeros):\n"; - cout << "Ordinal, Row, Column, Value\n"; - for (int i = 0; i < num_nonzeros; i++) - { - cout << '\t' << i << ',' << coo_tuples[i].row << ',' << coo_tuples[i].col << ',' << coo_tuples[i].val << "\n"; - } - } - - - /** - * Builds a symmetric COO sparse from an asymmetric CSR matrix. - */ - template - void InitCsrSymmetric(CsrMatrixT &csr_matrix) - { - if (coo_tuples) - { - fprintf(stderr, "Matrix already constructed\n"); - exit(1); - } - - num_rows = csr_matrix.num_cols; - num_cols = csr_matrix.num_rows; - num_nonzeros = csr_matrix.num_nonzeros * 2; - coo_tuples = new CooTuple[num_nonzeros]; - - for (OffsetT row = 0; row < csr_matrix.num_rows; ++row) - { - for (OffsetT nonzero = csr_matrix.row_offsets[row]; nonzero < csr_matrix.row_offsets[row + 1]; ++nonzero) - { - coo_tuples[nonzero].row = row; - coo_tuples[nonzero].col = csr_matrix.column_indices[nonzero]; - coo_tuples[nonzero].val = csr_matrix.values[nonzero]; - - coo_tuples[csr_matrix.num_nonzeros + nonzero].row = coo_tuples[nonzero].col; - coo_tuples[csr_matrix.num_nonzeros + nonzero].col = coo_tuples[nonzero].row; - coo_tuples[csr_matrix.num_nonzeros + nonzero].val = csr_matrix.values[nonzero]; - - } - } - - // Sort by rows, then columns - std::stable_sort(coo_tuples, coo_tuples + num_nonzeros); - } - - /** - * Builds a COO sparse from a relabeled CSR matrix. 
- */ - template - void InitCsrRelabel(CsrMatrixT &csr_matrix, const OffsetT* relabel_indices) - { - if (coo_tuples) - { - fprintf(stderr, "Matrix already constructed\n"); - exit(1); - } - - num_rows = csr_matrix.num_rows; - num_cols = csr_matrix.num_cols; - num_nonzeros = csr_matrix.num_nonzeros; - coo_tuples = new CooTuple[num_nonzeros]; - - for (OffsetT row = 0; row < num_rows; ++row) - { - for (OffsetT nonzero = csr_matrix.row_offsets[row]; nonzero < csr_matrix.row_offsets[row + 1]; ++nonzero) - { - coo_tuples[nonzero].row = relabel_indices[row]; - coo_tuples[nonzero].col = relabel_indices[csr_matrix.column_indices[nonzero]]; - coo_tuples[nonzero].val = csr_matrix.values[nonzero]; - } - } - - // Sort by rows, then columns - std::stable_sort(coo_tuples, coo_tuples + num_nonzeros); - } - - - - /** - * Builds a METIS COO sparse from the given file. - */ - void InitMetis(const string& /*metis_filename*/) const - { - if (coo_tuples) - { - fprintf(stderr, "Matrix already constructed\n"); - exit(1); - } - - // TODO - } - - - /** - * Builds a MARKET COO sparse from the given file. - */ - void InitMarket( - const string& market_filename, - ValueT default_value = 1.0, - bool verbose = false) - { - if (verbose) { - printf("Reading... "); fflush(stdout); - } - - if (coo_tuples) - { - fprintf(stderr, "Matrix already constructed\n"); - exit(1); - } - - std::ifstream ifs; - ifs.open(market_filename.c_str(), std::ifstream::in); - if (!ifs.good()) - { - fprintf(stderr, "Error opening file\n"); - exit(1); - } - - bool array = false; - bool symmetric = false; - bool skew = false; - int current_edge = -1; - char line[1024]; - - if (verbose) { - printf("Parsing... 
"); fflush(stdout); - } - - while (true) - { - ifs.getline(line, 1024); - if (!ifs.good()) - { - // Done - break; - } - - if (line[0] == '%') - { - // Comment - if (line[1] == '%') - { - // Banner - symmetric = (strstr(line, "symmetric") != nullptr); - skew = (strstr(line, "skew") != nullptr); - array = (strstr(line, "array") != nullptr); - - if (verbose) { - printf("(symmetric: %d, skew: %d, array: %d) ", symmetric, skew, array); fflush(stdout); - } - } - } - else if (current_edge == -1) - { - // Problem description - int nparsed = sscanf(line, "%d %d %d", &num_rows, &num_cols, &num_nonzeros); - if ((!array) && (nparsed == 3)) - { - if (symmetric) - num_nonzeros *= 2; - - // Allocate coo matrix - coo_tuples = new CooTuple[num_nonzeros]; - current_edge = 0; - - } - else if (array && (nparsed == 2)) - { - // Allocate coo matrix - num_nonzeros = num_rows * num_cols; - coo_tuples = new CooTuple[num_nonzeros]; - current_edge = 0; - } - else - { - fprintf(stderr, "Error parsing MARKET matrix: invalid problem description: %s\n", line); - exit(1); - } - - } - else - { - // Edge - if (current_edge >= num_nonzeros) - { - fprintf(stderr, "Error parsing MARKET matrix: encountered more than %d num_nonzeros\n", num_nonzeros); - exit(1); - } - - int row, col; - double val; - - if (array) - { - if (sscanf(line, "%lf", &val) != 1) - { - fprintf(stderr, "Error parsing MARKET matrix: badly formed current_edge: '%s' at edge %d\n", line, current_edge); - exit(1); - } - col = (current_edge / num_rows); - row = (current_edge - (num_rows * col)); - - coo_tuples[current_edge] = CooTuple(row, col, val); // Convert indices to zero-based - } - else - { - // Parse nonzero (note: using strtol and strtod is 2x faster than sscanf or istream parsing) - char *l = line; - char* t = nullptr; - - // parse row - row = strtol(l, &t, 0); - if (t == l) - { - fprintf(stderr, "Error parsing MARKET matrix: badly formed row at edge %d\n", current_edge); - exit(1); - } - l = t; - - // parse col - col = 
strtol(l, &t, 0); - if (t == l) - { - fprintf(stderr, "Error parsing MARKET matrix: badly formed col at edge %d\n", current_edge); - exit(1); - } - l = t; - - // parse val - val = strtod(l, &t); - if (t == l) - { - val = default_value; - } -/* - int nparsed = sscanf(line, "%d %d %lf", &row, &col, &val); - if (nparsed == 2) - { - // No value specified - val = default_value; - - } - else if (nparsed != 3) - { - fprintf(stderr, "Error parsing MARKET matrix 1: badly formed current_edge: %d parsed at edge %d\n", nparsed, current_edge); - exit(1); - } -*/ - - coo_tuples[current_edge] = CooTuple(row - 1, col - 1, val); // Convert indices to zero-based - - } - - current_edge++; - - if (symmetric && (row != col)) - { - coo_tuples[current_edge].row = coo_tuples[current_edge - 1].col; - coo_tuples[current_edge].col = coo_tuples[current_edge - 1].row; - coo_tuples[current_edge].val = coo_tuples[current_edge - 1].val * (skew ? -1 : 1); - current_edge++; - } - } - } - - // Adjust nonzero count (nonzeros along the diagonal aren't reversed) - num_nonzeros = current_edge; - - if (verbose) { - printf("done. Ordering..."); fflush(stdout); - } - - // Sort by rows, then columns - std::stable_sort(coo_tuples, coo_tuples + num_nonzeros); - - if (verbose) { - printf("done. 
"); fflush(stdout); - } - - ifs.close(); - } - - - /** - * Builds a dense matrix - */ - int InitDense(OffsetT num_rows, - OffsetT num_cols, - ValueT default_value = 1.0, - bool /*verbose*/ = false) - { - if (coo_tuples) - { - fprintf(stderr, "Matrix already constructed\n"); - exit(1); - } - - this->num_rows = num_rows; - this->num_cols = num_cols; - - num_nonzeros = num_rows * num_cols; - coo_tuples = new CooTuple[num_nonzeros]; - - for (OffsetT row = 0; row < num_rows; ++row) - { - for (OffsetT col = 0; col < num_cols; ++col) - { - coo_tuples[(row * num_cols) + col] = CooTuple(row, col, default_value); - } - } - - // Sort by rows, then columns - std::stable_sort(coo_tuples, coo_tuples + num_nonzeros); - - return 0; - } - - /** - * Builds a wheel COO sparse matrix having spokes spokes. - */ - int InitWheel(OffsetT spokes, ValueT default_value = 1.0, bool /*verbose*/ = false) - { - if (coo_tuples) - { - fprintf(stderr, "Matrix already constructed\n"); - exit(1); - } - - num_rows = spokes + 1; - num_cols = num_rows; - num_nonzeros = spokes * 2; - coo_tuples = new CooTuple[num_nonzeros]; - - // Add spoke num_nonzeros - int current_edge = 0; - for (OffsetT i = 0; i < spokes; i++) - { - coo_tuples[current_edge] = CooTuple(0, i + 1, default_value); - current_edge++; - } - - // Add rim - for (OffsetT i = 0; i < spokes; i++) - { - OffsetT dest = (i + 1) % spokes; - coo_tuples[current_edge] = CooTuple(i + 1, dest + 1, default_value); - current_edge++; - } - - // Sort by rows, then columns - std::stable_sort(coo_tuples, coo_tuples + num_nonzeros); - - return 0; - } - - - /** - * Builds a square 2D grid CSR matrix. Interior num_vertices have degree 5 when including - * a self-loop. - * - * Returns 0 on success, 1 on failure. 
- */ - int InitGrid2d(OffsetT width, bool self_loop, ValueT default_value = 1.0) - { - if (coo_tuples) - { - fprintf(stderr, "Matrix already constructed\n"); - exit(1); - } - - int interior_nodes = (width - 2) * (width - 2); - int edge_nodes = (width - 2) * 4; - int corner_nodes = 4; - num_rows = width * width; - num_cols = num_rows; - num_nonzeros = (interior_nodes * 4) + (edge_nodes * 3) + (corner_nodes * 2); - - if (self_loop) - num_nonzeros += num_rows; - - coo_tuples = new CooTuple[num_nonzeros]; - int current_edge = 0; - - for (OffsetT j = 0; j < width; j++) - { - for (OffsetT k = 0; k < width; k++) - { - OffsetT me = (j * width) + k; - - // West - OffsetT neighbor = (j * width) + (k - 1); - if (k - 1 >= 0) { - coo_tuples[current_edge] = CooTuple(me, neighbor, default_value); - current_edge++; - } - - // East - neighbor = (j * width) + (k + 1); - if (k + 1 < width) { - coo_tuples[current_edge] = CooTuple(me, neighbor, default_value); - current_edge++; - } - - // North - neighbor = ((j - 1) * width) + k; - if (j - 1 >= 0) { - coo_tuples[current_edge] = CooTuple(me, neighbor, default_value); - current_edge++; - } - - // South - neighbor = ((j + 1) * width) + k; - if (j + 1 < width) { - coo_tuples[current_edge] = CooTuple(me, neighbor, default_value); - current_edge++; - } - - if (self_loop) - { - neighbor = me; - coo_tuples[current_edge] = CooTuple(me, neighbor, default_value); - current_edge++; - } - } - } - - // Sort by rows, then columns, update dims - std::stable_sort(coo_tuples, coo_tuples + num_nonzeros); - - return 0; - } - - - /** - * Builds a square 3D grid COO sparse matrix. Interior num_vertices have degree 7 when including - * a self-loop. Values are uninitialized, coo_tuples are sorted. 
- */ - int InitGrid3d(OffsetT width, bool self_loop, ValueT default_value = 1.0) - { - if (coo_tuples) - { - fprintf(stderr, "Matrix already constructed\n"); - return -1; - } - - OffsetT interior_nodes = (width - 2) * (width - 2) * (width - 2); - OffsetT face_nodes = (width - 2) * (width - 2) * 6; - OffsetT edge_nodes = (width - 2) * 12; - OffsetT corner_nodes = 8; - num_cols = width * width * width; - num_rows = num_cols; - num_nonzeros = (interior_nodes * 6) + (face_nodes * 5) + (edge_nodes * 4) + (corner_nodes * 3); - - if (self_loop) - num_nonzeros += num_rows; - - coo_tuples = new CooTuple[num_nonzeros]; - int current_edge = 0; - - for (OffsetT i = 0; i < width; i++) - { - for (OffsetT j = 0; j < width; j++) - { - for (OffsetT k = 0; k < width; k++) - { - - OffsetT me = (i * width * width) + (j * width) + k; - - // Up - OffsetT neighbor = (i * width * width) + (j * width) + (k - 1); - if (k - 1 >= 0) { - coo_tuples[current_edge] = CooTuple(me, neighbor, default_value); - current_edge++; - } - - // Down - neighbor = (i * width * width) + (j * width) + (k + 1); - if (k + 1 < width) { - coo_tuples[current_edge] = CooTuple(me, neighbor, default_value); - current_edge++; - } - - // West - neighbor = (i * width * width) + ((j - 1) * width) + k; - if (j - 1 >= 0) { - coo_tuples[current_edge] = CooTuple(me, neighbor, default_value); - current_edge++; - } - - // East - neighbor = (i * width * width) + ((j + 1) * width) + k; - if (j + 1 < width) { - coo_tuples[current_edge] = CooTuple(me, neighbor, default_value); - current_edge++; - } - - // North - neighbor = ((i - 1) * width * width) + (j * width) + k; - if (i - 1 >= 0) { - coo_tuples[current_edge] = CooTuple(me, neighbor, default_value); - current_edge++; - } - - // South - neighbor = ((i + 1) * width * width) + (j * width) + k; - if (i + 1 < width) { - coo_tuples[current_edge] = CooTuple(me, neighbor, default_value); - current_edge++; - } - - if (self_loop) - { - neighbor = me; - coo_tuples[current_edge] = 
CooTuple(me, neighbor, default_value); - current_edge++; - } - } - } - } - - // Sort by rows, then columns, update dims - std::stable_sort(coo_tuples, coo_tuples + num_nonzeros); - - return 0; - } -}; - - - -/****************************************************************************** - * COO matrix type - ******************************************************************************/ - - -/** - * CSR sparse format matrix - */ -template< - typename ValueT, - typename OffsetT> -struct CsrMatrix -{ - int num_rows; - int num_cols; - int num_nonzeros; - OffsetT* row_offsets; - OffsetT* column_indices; - ValueT* values; - bool numa_malloc; - - /** - * Constructor - */ - CsrMatrix() - : num_rows(0) - , num_cols(0) - , num_nonzeros(0) - , row_offsets(nullptr) - , column_indices(nullptr) - , values(nullptr) - { -#ifdef CUB_MKL - numa_malloc = ((numa_available() >= 0) && (numa_num_task_nodes() > 1)); -#else - numa_malloc = false; -#endif - } - - - /** - * Clear - */ - void Clear() - { -#ifdef CUB_MKL - if (numa_malloc) - { - numa_free(row_offsets, sizeof(OffsetT) * (num_rows + 1)); - numa_free(values, sizeof(ValueT) * num_nonzeros); - numa_free(column_indices, sizeof(OffsetT) * num_nonzeros); - } - else - { - if (row_offsets) mkl_free(row_offsets); - if (column_indices) mkl_free(column_indices); - if (values) mkl_free(values); - } - -#else - if (row_offsets) delete[] row_offsets; - if (column_indices) delete[] column_indices; - if (values) delete[] values; -#endif - - row_offsets = nullptr; - column_indices = nullptr; - values = nullptr; - } - - /** - * Destructor - */ - ~CsrMatrix() - { - Clear(); - } - - GraphStats Stats() const - { - GraphStats stats; - stats.num_rows = num_rows; - stats.num_cols = num_cols; - stats.num_nonzeros = num_nonzeros; - - // - // Compute diag-distance statistics - // - - OffsetT samples = 0; - double mean = 0.0; - double ss_tot = 0.0; - - for (OffsetT row = 0; row < num_rows; ++row) - { - OffsetT nz_idx_start = row_offsets[row]; - OffsetT 
nz_idx_end = row_offsets[row + 1]; - - for (int nz_idx = nz_idx_start; nz_idx < nz_idx_end; ++nz_idx) - { - OffsetT col = column_indices[nz_idx]; - double x = (col > row) ? col - row : row - col; - - samples++; - double delta = x - mean; - mean = mean + (delta / samples); - ss_tot += delta * (x - mean); - } - } - stats.diag_dist_mean = mean; - double variance = ss_tot / samples; - stats.diag_dist_std_dev = sqrt(variance); - - - // - // Compute deming statistics - // - - samples = 0; - double mean_x = 0.0; - double mean_y = 0.0; - double ss_x = 0.0; - double ss_y = 0.0; - - for (OffsetT row = 0; row < num_rows; ++row) - { - OffsetT nz_idx_start = row_offsets[row]; - OffsetT nz_idx_end = row_offsets[row + 1]; - - for (int nz_idx = nz_idx_start; nz_idx < nz_idx_end; ++nz_idx) - { - OffsetT col = column_indices[nz_idx]; - - samples++; - double x = col; - double y = row; - double delta; - - delta = x - mean_x; - mean_x = mean_x + (delta / samples); - ss_x += delta * (x - mean_x); - - delta = y - mean_y; - mean_y = mean_y + (delta / samples); - ss_y += delta * (y - mean_y); - } - } - - samples = 0; - double s_xy = 0.0; - double s_xxy = 0.0; - double s_xyy = 0.0; - for (OffsetT row = 0; row < num_rows; ++row) - { - OffsetT nz_idx_start = row_offsets[row]; - OffsetT nz_idx_end = row_offsets[row + 1]; - - for (int nz_idx = nz_idx_start; nz_idx < nz_idx_end; ++nz_idx) - { - OffsetT col = column_indices[nz_idx]; - - samples++; - double x = col; - double y = row; - - double xy = (x - mean_x) * (y - mean_y); - double xxy = (x - mean_x) * (x - mean_x) * (y - mean_y); - double xyy = (x - mean_x) * (y - mean_y) * (y - mean_y); - double delta; - - delta = xy - s_xy; - s_xy = s_xy + (delta / samples); - - delta = xxy - s_xxy; - s_xxy = s_xxy + (delta / samples); - - delta = xyy - s_xyy; - s_xyy = s_xyy + (delta / samples); - } - } - - // double s_xx = ss_x / num_nonzeros; - // double s_yy = ss_y / num_nonzeros; - - stats.pearson_r = (num_nonzeros * s_xy) / (sqrt(ss_x) * sqrt(ss_y)); 
- - - // - // Compute row-length statistics - // - - // Sample mean - stats.row_length_mean = double(num_nonzeros) / num_rows; - variance = 0.0; - stats.row_length_skewness = 0.0; - for (OffsetT row = 0; row < num_rows; ++row) - { - OffsetT length = row_offsets[row + 1] - row_offsets[row]; - double delta = double(length) - stats.row_length_mean; - variance += (delta * delta); - stats.row_length_skewness += (delta * delta * delta); - } - variance /= num_rows; - stats.row_length_std_dev = sqrt(variance); - stats.row_length_skewness = (stats.row_length_skewness / num_rows) / pow(stats.row_length_std_dev, 3.0); - stats.row_length_variation = stats.row_length_std_dev / stats.row_length_mean; - - return stats; - } - - /** - * Build CSR matrix from sorted COO matrix - */ - void FromCoo(const CooMatrix &coo_matrix) - { - num_rows = coo_matrix.num_rows; - num_cols = coo_matrix.num_cols; - num_nonzeros = coo_matrix.num_nonzeros; - -#ifdef CUB_MKL - - if (numa_malloc) - { - numa_set_strict(1); -// numa_set_bind_policy(1); - -// values = (ValueT*) numa_alloc_interleaved(sizeof(ValueT) * num_nonzeros); -// row_offsets = (OffsetT*) numa_alloc_interleaved(sizeof(OffsetT) * (num_rows + 1)); -// column_indices = (OffsetT*) numa_alloc_interleaved(sizeof(OffsetT) * num_nonzeros); - - row_offsets = (OffsetT*) numa_alloc_onnode(sizeof(OffsetT) * (num_rows + 1), 0); - column_indices = (OffsetT*) numa_alloc_onnode(sizeof(OffsetT) * num_nonzeros, 0); - values = (ValueT*) numa_alloc_onnode(sizeof(ValueT) * num_nonzeros, 1); - } - else - { - values = (ValueT*) mkl_malloc(sizeof(ValueT) * num_nonzeros, 4096); - row_offsets = (OffsetT*) mkl_malloc(sizeof(OffsetT) * (num_rows + 1), 4096); - column_indices = (OffsetT*) mkl_malloc(sizeof(OffsetT) * num_nonzeros, 4096); - - } - -#else - row_offsets = new OffsetT[num_rows + 1]; - column_indices = new OffsetT[num_nonzeros]; - values = new ValueT[num_nonzeros]; -#endif - - OffsetT prev_row = -1; - for (OffsetT current_edge = 0; current_edge < 
num_nonzeros; current_edge++) - { - OffsetT current_row = coo_matrix.coo_tuples[current_edge].row; - - // Fill in rows up to and including the current row - for (OffsetT row = prev_row + 1; row <= current_row; row++) - { - row_offsets[row] = current_edge; - } - prev_row = current_row; - - column_indices[current_edge] = coo_matrix.coo_tuples[current_edge].col; - values[current_edge] = coo_matrix.coo_tuples[current_edge].val; - } - - // Fill out any trailing edgeless vertices (and the end-of-list element) - for (OffsetT row = prev_row + 1; row <= num_rows; row++) - { - row_offsets[row] = num_nonzeros; - } - } - - - /** - * Display log-histogram to stdout - */ - void DisplayHistogram() - { - // Initialize - int log_counts[9]; - for (int i = 0; i < 9; i++) - { - log_counts[i] = 0; - } - - // Scan - int max_log_length = -1; - for (OffsetT row = 0; row < num_rows; row++) - { - OffsetT length = row_offsets[row + 1] - row_offsets[row]; - - int log_length = -1; - while (length > 0) - { - length /= 10; - log_length++; - } - if (log_length > max_log_length) - { - max_log_length = log_length; - } - - log_counts[log_length + 1]++; - } - printf("CSR matrix (%d rows, %d columns, %d non-zeros):\n", (int) num_rows, (int) num_cols, (int) num_nonzeros); - for (int i = -1; i < max_log_length + 1; i++) - { - printf("\tDegree 1e%d: \t%d (%.2f%%)\n", i, log_counts[i + 1], (float) log_counts[i + 1] * 100.0 / num_cols); - } - fflush(stdout); - } - - - /** - * Display matrix to stdout - */ - void Display() - { - printf("Input Matrix:\n"); - for (OffsetT row = 0; row < num_rows; row++) - { - printf("%d [@%d, #%d]: ", row, row_offsets[row], row_offsets[row + 1] - row_offsets[row]); - for (OffsetT current_edge = row_offsets[row]; current_edge < row_offsets[row + 1]; current_edge++) - { - printf("%d (%f), ", column_indices[current_edge], values[current_edge]); - } - printf("\n"); - } - fflush(stdout); - } - - -}; - - - 
-/****************************************************************************** - * Matrix transformations - ******************************************************************************/ - -// Comparator for ordering rows by degree (lowest first), then by row-id (lowest first) -template -struct OrderByLow -{ - OffsetT* row_degrees; - OrderByLow(OffsetT* row_degrees) : row_degrees(row_degrees) {} - - bool operator()(const OffsetT &a, const OffsetT &b) const - { - if (row_degrees[a] < row_degrees[b]) - return true; - else if (row_degrees[a] > row_degrees[b]) - return false; - else - return (a < b); - } -}; - -// Comparator for ordering rows by degree (highest first), then by row-id (lowest first) -template -struct OrderByHigh -{ - OffsetT* row_degrees; - OrderByHigh(OffsetT* row_degrees) : row_degrees(row_degrees) {} - - bool operator()(const OffsetT &a, const OffsetT &b) const - { - if (row_degrees[a] > row_degrees[b]) - return true; - else if (row_degrees[a] < row_degrees[b]) - return false; - else - return (a < b); - } -}; - - - -/** - * Reverse Cuthill-McKee - */ -template -void RcmRelabel( - CsrMatrix& matrix, - OffsetT* relabel_indices) -{ - // Initialize row degrees - OffsetT* row_degrees_in = new OffsetT[matrix.num_rows]; - OffsetT* row_degrees_out = new OffsetT[matrix.num_rows]; - for (OffsetT row = 0; row < matrix.num_rows; ++row) - { - row_degrees_in[row] = 0; - row_degrees_out[row] = matrix.row_offsets[row + 1] - matrix.row_offsets[row]; - } - for (OffsetT nonzero = 0; nonzero < matrix.num_nonzeros; ++nonzero) - { - row_degrees_in[matrix.column_indices[nonzero]]++; - } - - // Initialize unlabeled set - using UnlabeledSet = std::set>; - typename UnlabeledSet::key_compare unlabeled_comp(row_degrees_in); - UnlabeledSet unlabeled(unlabeled_comp); - for (OffsetT row = 0; row < matrix.num_rows; ++row) - { - relabel_indices[row] = -1; - unlabeled.insert(row); - } - - // Initialize queue set - std::deque q; - - // Process unlabeled vertices (traverse connected 
components) - OffsetT relabel_idx = 0; - while (!unlabeled.empty()) - { - // Seed the unvisited frontier queue with the unlabeled vertex of lowest-degree - OffsetT vertex = *unlabeled.begin(); - q.push_back(vertex); - - while (!q.empty()) - { - vertex = q.front(); - q.pop_front(); - - if (relabel_indices[vertex] == -1) - { - // Update this vertex - unlabeled.erase(vertex); - relabel_indices[vertex] = relabel_idx; - relabel_idx++; - - // Sort neighbors by degree - OrderByLow neighbor_comp(row_degrees_in); - std::sort( - matrix.column_indices + matrix.row_offsets[vertex], - matrix.column_indices + matrix.row_offsets[vertex + 1], - neighbor_comp); - - // Inspect neighbors, adding to the out frontier if unlabeled - for (OffsetT neighbor_idx = matrix.row_offsets[vertex]; - neighbor_idx < matrix.row_offsets[vertex + 1]; - ++neighbor_idx) - { - OffsetT neighbor = matrix.column_indices[neighbor_idx]; - q.push_back(neighbor); - } - } - } - } - -/* - // Reverse labels - for (int row = 0; row < matrix.num_rows; ++row) - { - relabel_indices[row] = matrix.num_rows - relabel_indices[row] - 1; - } -*/ - - // Cleanup - if (row_degrees_in) delete[] row_degrees_in; - if (row_degrees_out) delete[] row_degrees_out; -} - - -/** - * Reverse Cuthill-McKee - */ -template -void RcmRelabel( - CsrMatrix& matrix, - bool verbose = false) -{ - // Do not process if not square - if (matrix.num_cols != matrix.num_rows) - { - if (verbose) { - printf("RCM transformation ignored (not square)\n"); fflush(stdout); - } - return; - } - - // Initialize relabel indices - OffsetT* relabel_indices = new OffsetT[matrix.num_rows]; - - if (verbose) { - printf("RCM relabeling... "); fflush(stdout); - } - - RcmRelabel(matrix, relabel_indices); - - if (verbose) { - printf("done. Reconstituting... 
"); fflush(stdout); - } - - // Create a COO matrix from the relabel indices - CooMatrix coo_matrix; - coo_matrix.InitCsrRelabel(matrix, relabel_indices); - - // Reconstitute the CSR matrix from the sorted COO tuples - if (relabel_indices) delete[] relabel_indices; - matrix.Clear(); - matrix.FromCoo(coo_matrix); - - if (verbose) { - printf("done. "); fflush(stdout); - } -} diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_spmv.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_spmv.cpp deleted file mode 100644 index 5b145995420..00000000000 --- a/projects/hipcub/test/hipcub/test_hipcub_device_spmv.cpp +++ /dev/null @@ -1,292 +0,0 @@ -// MIT License -// -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
- -#include "experimental/sparse_matrix.hpp" - -#include -#include - -#include "common_test_header.hpp" -#include "test_utils_assertions.hpp" - -hipcub::CachingDeviceAllocator g_allocator; - -static constexpr float alpha_const = 1.0f; -static constexpr float beta_const = 0.0f; - -// Params for tests -template -struct DeviceSpmvParams -{ - using value_type = Type; - static constexpr int32_t grid_2d = Grid2D; - static constexpr int32_t grid_3d = Grid3D; - static constexpr int32_t wheel = Wheel; - static constexpr int32_t dense = Dense; - static constexpr bool use_graphs = UseGraphs; -}; - -// --------------------------------------------------------- -// Test for scan ops taking single input value -// --------------------------------------------------------- - -template -class HipcubDeviceSpmvTests : public ::testing::Test -{ -public: - using value_type = typename Params::value_type; - static constexpr int32_t grid_2d = Params::grid_2d; - static constexpr int32_t grid_3d = Params::grid_3d; - static constexpr int32_t wheel = Params::wheel; - static constexpr int32_t dense = Params::dense; - static constexpr bool use_graphs = Params::use_graphs; -}; - -using HipcubDeviceSpmvTestsParams = ::testing::Types, - DeviceSpmvParams>; - -template -static void generate_matrix(CooMatrix& coo_matrix, - int32_t grid2d, - int32_t grid3d, - int32_t wheel, - int32_t dense) -{ - if(grid2d > 0) - { - // Generate 2D lattice - coo_matrix.InitGrid2d(grid2d, false); - } - else if(grid3d > 0) - { - // Generate 3D lattice - coo_matrix.InitGrid3d(grid3d, false); - } - else if(wheel > 0) - { - // Generate wheel graph - coo_matrix.InitWheel(wheel); - } - else if(dense > 0) - { -#if 0 - // Generate dense graph - OffsetType size = 1 << 24; // 16M nnz - args.GetCmdLineArgument("size", size); - - OffsetType rows = size / dense; - printf("dense_%d_x_%d, ", rows, dense); fflush(stdout); - coo_matrix.InitDense(rows, dense); -#endif - } -} - -template -void SpmvGold(CsrMatrix& a, - const T* vector_x, - 
const T* vector_y_in, - T* vector_y_out, - T alpha, - T beta) -{ - for(OffsetType row = 0; row < a.num_rows; ++row) - { - T partial = beta * vector_y_in[row]; - for(OffsetType offset = a.row_offsets[row]; offset < a.row_offsets[row + 1]; ++offset) - { - partial += alpha * a.values[offset] * vector_x[a.column_indices[offset]]; - } - vector_y_out[row] = partial; - } -} - -TYPED_TEST_SUITE(HipcubDeviceSpmvTests, HipcubDeviceSpmvTestsParams); - -TYPED_TEST(HipcubDeviceSpmvTests, Spmv) -{ - int device_id = test_common_utils::obtain_device_from_ctest(); - SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); - HIP_CHECK(hipSetDevice(device_id)); - - using T = typename TestFixture::value_type; - using OffsetType = int32_t; - constexpr int32_t grid_2d = TestFixture::grid_2d; - constexpr int32_t grid_3d = TestFixture::grid_3d; - constexpr int32_t wheel = TestFixture::wheel; - constexpr int32_t dense = TestFixture::dense; - - hipStream_t stream = 0; // default - if(TestFixture::use_graphs) - { - // Default stream does not support hipGraph stream capture, so create one - HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); - } - - CooMatrix coo_matrix; - generate_matrix(coo_matrix, grid_2d, grid_3d, wheel, dense); - - // Convert to CSR - CsrMatrix csr_matrix; - csr_matrix.FromCoo(coo_matrix); - - // Allocate input and output vectors - T* vector_x = new T[csr_matrix.num_cols]; - T* vector_y_in = new T[csr_matrix.num_rows]; - T* vector_y_out = new T[csr_matrix.num_rows]; - - for(int col = 0; col < csr_matrix.num_cols; ++col) - vector_x[col] = 1.0; - - for(int row = 0; row < csr_matrix.num_rows; ++row) - vector_y_in[row] = 1.0; - - // Compute reference answer - SpmvGold(csr_matrix, vector_x, vector_y_in, vector_y_out, alpha_const, beta_const); - - HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH - // Allocate and initialize GPU problem - hipcub::DeviceSpmv::SpmvParams params{}; - HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP - - HIP_CHECK( - 
g_allocator.DeviceAllocate((void**)¶ms.d_values, sizeof(T) * csr_matrix.num_nonzeros)); - HIP_CHECK(g_allocator.DeviceAllocate((void**)¶ms.d_row_end_offsets, - sizeof(OffsetType) * (csr_matrix.num_rows + 1))); - HIP_CHECK(g_allocator.DeviceAllocate((void**)¶ms.d_column_indices, - sizeof(OffsetType) * csr_matrix.num_nonzeros)); - HIP_CHECK( - g_allocator.DeviceAllocate((void**)¶ms.d_vector_x, sizeof(T) * csr_matrix.num_cols)); - HIP_CHECK( - g_allocator.DeviceAllocate((void**)¶ms.d_vector_y, sizeof(T) * csr_matrix.num_rows)); - - params.num_rows = csr_matrix.num_rows; - params.num_cols = csr_matrix.num_cols; - params.num_nonzeros = csr_matrix.num_nonzeros; - params.alpha = alpha_const; - params.beta = beta_const; - - HIP_CHECK(hipMemcpy(params.d_values, - csr_matrix.values, - sizeof(T) * csr_matrix.num_nonzeros, - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(params.d_row_end_offsets, - csr_matrix.row_offsets, - sizeof(OffsetType) * (csr_matrix.num_rows + 1), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(params.d_column_indices, - csr_matrix.column_indices, - sizeof(OffsetType) * csr_matrix.num_nonzeros, - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(params.d_vector_x, - vector_x, - sizeof(T) * csr_matrix.num_cols, - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(params.d_vector_y, - vector_y_in, - sizeof(T) * csr_matrix.num_rows, - hipMemcpyHostToDevice)); - - // Allocate temporary storage - size_t temp_storage_bytes = 0; - void* d_temp_storage = nullptr; - - // Get amount of temporary storage needed - HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH - HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage, - temp_storage_bytes, - params.d_values, - params.d_row_end_offsets, - params.d_column_indices, - params.d_vector_x, - params.d_vector_y, - params.num_rows, - params.num_cols, - params.num_nonzeros, - stream)); - HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP - - // Allocate - //HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes); - 
HIP_CHECK(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - test_utils::GraphHelper gHelper; - if(TestFixture::use_graphs) - gHelper.startStreamCapture(stream); - - HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH - HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage, - temp_storage_bytes, - params.d_values, - params.d_row_end_offsets, - params.d_column_indices, - params.d_vector_x, - params.d_vector_y, - params.num_rows, - params.num_cols, - params.num_nonzeros, - stream)); - HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP - - if(TestFixture::use_graphs) - gHelper.createAndLaunchGraph(stream); - - HIP_CHECK(hipMemcpy(vector_y_in, - params.d_vector_y, - sizeof(T) * params.num_rows, - hipMemcpyDeviceToHost)); - - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - const auto max_row_len = csr_matrix.num_cols * csr_matrix.num_rows; - const float diff = max_row_len * test_utils::precision::value; - - for(int32_t i = 0; i < csr_matrix.num_rows; i++) - { - ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(vector_y_in[i], vector_y_out[i], diff)) - << "where index = " << i; - } - - if(TestFixture::use_graphs) - { - gHelper.cleanupGraphHelper(); - HIP_CHECK(hipStreamDestroy(stream)); - } - - // De-allocate input and output vectors - delete[] vector_x; - delete[] vector_y_in; - delete[] vector_y_out; - - HIP_CHECK(g_allocator.DeviceFree(params.d_values)); - HIP_CHECK(g_allocator.DeviceFree(params.d_row_end_offsets)); - HIP_CHECK(g_allocator.DeviceFree(params.d_column_indices)); - HIP_CHECK(g_allocator.DeviceFree(params.d_vector_x)); - HIP_CHECK(g_allocator.DeviceFree(params.d_vector_y)); - HIP_CHECK(g_allocator.DeviceFree(d_temp_storage)); -} From c95af95d865b7a7f03fb2c4bf717e98bdfa4c3cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 4 Dec 2025 13:44:08 +0000 Subject: [PATCH 16/95] Adds support for large number of segments to DeviceSegmentedReduce --- 
.../benchmark_device_segmented_reduce.cpp | 20 +- .../cub/device/device_segmented_reduce.hpp | 106 ++++----- .../device/device_segmented_reduce.hpp | 212 +++++++++--------- .../test_hipcub_device_segmented_reduce.cpp | 32 +-- 4 files changed, 195 insertions(+), 175 deletions(-) diff --git a/projects/hipcub/benchmark/benchmark_device_segmented_reduce.cpp b/projects/hipcub/benchmark/benchmark_device_segmented_reduce.cpp index 5ba4a94284f..815acb0078c 100644 --- a/projects/hipcub/benchmark/benchmark_device_segmented_reduce.cpp +++ b/projects/hipcub/benchmark/benchmark_device_segmented_reduce.cpp @@ -152,7 +152,14 @@ struct Benchmark static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) { - hipError_t (*ptr_to_sum)(void*, size_t&, T*, T*, int, OffsetType*, OffsetType*, hipStream_t) + hipError_t (*ptr_to_sum)(void*, + size_t&, + T*, + T*, + _HIPCUB_STD::int64_t, + OffsetType*, + OffsetType*, + hipStream_t) = &hipcub::DeviceSegmentedReduce::Sum; run_benchmark(state, desired_segments, stream, size, ptr_to_sum); } @@ -164,7 +171,14 @@ struct Benchmark static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) { - hipError_t (*ptr_to_min)(void*, size_t&, T*, T*, int, OffsetType*, OffsetType*, hipStream_t) + hipError_t (*ptr_to_min)(void*, + size_t&, + T*, + T*, + _HIPCUB_STD::int64_t, + OffsetType*, + OffsetType*, + hipStream_t) = &hipcub::DeviceSegmentedReduce::Min; run_benchmark(state, desired_segments, stream, size, ptr_to_min); } @@ -184,7 +198,7 @@ struct Benchmark size_t&, T*, KeyValue*, - int, + _HIPCUB_STD::int64_t, OffsetType*, OffsetType*, hipStream_t) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_reduce.hpp index 041f3b6a303..1edb929c8b0 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_reduce.hpp +++ 
b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_reduce.hpp @@ -44,16 +44,17 @@ struct DeviceSegmentedReduce typename OffsetIteratorT, typename ReductionOp, typename T> - HIPCUB_RUNTIME_FUNCTION static hipError_t Reduce(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - ReductionOp reduction_op, - T initial_value, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t Reduce(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + ReductionOp reduction_op, + T initial_value, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, @@ -68,14 +69,15 @@ struct DeviceSegmentedReduce } template - HIPCUB_RUNTIME_FUNCTION static hipError_t Sum(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t Sum(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, @@ -88,14 +90,15 @@ struct DeviceSegmentedReduce } template - HIPCUB_RUNTIME_FUNCTION static hipError_t Min(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + 
static hipError_t Min(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, @@ -108,14 +111,15 @@ struct DeviceSegmentedReduce } template - HIPCUB_RUNTIME_FUNCTION static hipError_t ArgMin(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t ArgMin(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, @@ -128,14 +132,15 @@ struct DeviceSegmentedReduce } template - HIPCUB_RUNTIME_FUNCTION static hipError_t Max(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t Max(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, @@ -148,14 +153,15 @@ struct DeviceSegmentedReduce } template - HIPCUB_RUNTIME_FUNCTION static hipError_t ArgMax(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int 
num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t ArgMax(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp index 8f649447120..d698db38dec 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp @@ -188,16 +188,16 @@ struct DeviceSegmentedReduce typename ReductionOp, typename T> HIPCUB_RUNTIME_FUNCTION - static hipError_t Reduce(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - ReductionOp reduction_op, - T initial_value, - hipStream_t stream = 0) + static hipError_t Reduce(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + ReductionOp reduction_op, + T initial_value, + hipStream_t stream = 0) { return ::rocprim::segmented_reduce( d_temp_storage, @@ -218,17 +218,17 @@ struct DeviceSegmentedReduce typename ReductionOp, typename T> HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t Reduce(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT 
d_begin_offsets, - OffsetIteratorT d_end_offsets, - ReductionOp reduction_op, - T initial_value, - hipStream_t stream, - bool debug_synchronous) + static hipError_t Reduce(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + ReductionOp reduction_op, + T initial_value, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return Reduce(d_temp_storage, @@ -245,14 +245,14 @@ struct DeviceSegmentedReduce template HIPCUB_RUNTIME_FUNCTION - static hipError_t Sum(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + static hipError_t Sum(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { using input_type = typename std::iterator_traits::value_type; @@ -270,15 +270,15 @@ struct DeviceSegmentedReduce template HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t Sum(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + static hipError_t Sum(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return Sum(d_temp_storage, @@ -293,14 +293,14 @@ struct DeviceSegmentedReduce template 
HIPCUB_RUNTIME_FUNCTION - static hipError_t Min(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + static hipError_t Min(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { using input_type = typename std::iterator_traits::value_type; @@ -318,15 +318,15 @@ struct DeviceSegmentedReduce template HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t Min(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + static hipError_t Min(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return Min(d_temp_storage, @@ -341,14 +341,14 @@ struct DeviceSegmentedReduce template HIPCUB_RUNTIME_FUNCTION - static hipError_t ArgMin(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + static hipError_t ArgMin(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { using OffsetT = int; using T = typename std::iterator_traits::value_type; @@ -382,15 +382,15 
@@ struct DeviceSegmentedReduce template HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t ArgMin(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + static hipError_t ArgMin(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return ArgMin(d_temp_storage, @@ -405,14 +405,14 @@ struct DeviceSegmentedReduce template HIPCUB_RUNTIME_FUNCTION - static hipError_t Max(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + static hipError_t Max(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { using input_type = typename std::iterator_traits::value_type; @@ -430,15 +430,15 @@ struct DeviceSegmentedReduce template HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t Max(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + static hipError_t Max(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + 
hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return Max(d_temp_storage, @@ -453,14 +453,14 @@ struct DeviceSegmentedReduce template HIPCUB_RUNTIME_FUNCTION - static hipError_t ArgMax(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + static hipError_t ArgMax(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { using OffsetT = int; using T = typename std::iterator_traits::value_type; @@ -494,15 +494,15 @@ struct DeviceSegmentedReduce template HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION - static hipError_t ArgMax(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + static hipError_t ArgMax(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return ArgMax(d_temp_storage, diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp index 27b0069424c..7b7d0725fad 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp @@ -752,14 +752,14 @@ TYPED_TEST(HipcubDeviceSegmentedReduce, Max) struct ArgMinDispatch { template - auto operator()(void* 
d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream) const + auto operator()(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream) const { return hipcub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, @@ -775,14 +775,14 @@ struct ArgMinDispatch struct ArgMaxDispatch { template - auto operator()(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream) const + auto operator()(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + _HIPCUB_STD::int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream) const { return hipcub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, From 0351bd554ed19f2090fc78a0a851b01758c448ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 3 Nov 2025 14:21:36 +0000 Subject: [PATCH 17/95] Remove ``LEGACY_PTX_ARCH`` --- projects/hipcub/CHANGELOG.md | 3 ++- .../hipcub/backend/cub/agent/single_pass_scan_operators.hpp | 3 +-- .../backend/rocprim/agent/single_pass_scan_operators.hpp | 2 -- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index 565d6a466c3..63b70c96fcb 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -20,7 +20,8 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project ### Removed -* Removed `DeviceSpmv`, which was deprecated from CUB after CCCL's 2.8.0 release. 
Use `hipSPARSE` or `rocSPARSE` libraries instead. +* Removed `DeviceSpmv`, which was removed from CUB after CCCL's 2.8.0 release. Use `hipSPARSE` or `rocSPARSE` libraries instead. +* Removed `LEGACY_PTX_ARCH`. * Removed `hipcub:max` and `hipcub:min`, which were deprecated. Use `hip::std::max` and `hip::std::min` instead. * Deprecated `hipcub::Swap`, use `rocprim::swap` instead. * Deprecated `HIPCUB_IS_INT128_ENABLED`, use `_CCCL_HAS_INT128()` instead. diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/agent/single_pass_scan_operators.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/agent/single_pass_scan_operators.hpp index 8e223cbbf47..03a98e16b1c 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/agent/single_pass_scan_operators.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/agent/single_pass_scan_operators.hpp @@ -46,10 +46,9 @@ using BlockScanRunningPrefixOp = cub::BlockScanRunningPrefixOp; template> using TilePrefixCallbackOp - = cub::TilePrefixCallbackOp; + = cub::TilePrefixCallbackOp; template[optional] Unused (deprecated). * \tparam DelayConstructorT [optional] Unused (CUB's implementation detail). */ template */> class TilePrefixCallbackOp { From d6f4a57b7a3611eada9a417e858eb3bf6e9fecb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Fri, 14 Nov 2025 14:53:22 +0000 Subject: [PATCH 18/95] Fix hipCUB trait deprecations --- .../hipcub/include/hipcub/backend/rocprim/util_type.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp index dcb2456deb8..3f713c5768e 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp @@ -501,7 +501,7 @@ struct Uninitialized * This enum is deprecated, please use instead. 
Or if you have * libhipcxx, please use the type_traits system in libhipcxx. */ -enum HIPCUB_DEPRECATED_BECAUSE("Use instead.") Category +enum Category { NOT_A_NUMBER, SIGNED_INTEGER, @@ -547,8 +547,8 @@ struct BaseTraits enum { - PRIMITIVE HIPCUB_DEPRECATED_BECAUSE("Use instead.") = true, - nullptr_TYPE = false, + PRIMITIVE HIPCUB_DEPRECATED_BECAUSE("Use instead.") = true, + HIPCUB_DEPRECATED nullptr_TYPE = false, }; using key_codec = decltype(::rocprim::traits::get().template radix_key_codec()); From 5ab3fb9a8c25c64212e62db9c3e5ba3f5fdac3ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 13 Nov 2025 14:57:18 +0000 Subject: [PATCH 19/95] Drop deprecated entities from util_type --- .../block/radix_rank_sort_operations.hpp | 5 +- .../hipcub/backend/rocprim/util_type.hpp | 240 ++++++------------ projects/hipcub/test/hipcub/bfloat16.hpp | 17 +- projects/hipcub/test/hipcub/half.hpp | 13 +- .../hipcub/test_hipcub_device_radix_sort.hpp | 10 +- .../hipcub/test_utils_sort_comparator.hpp | 27 +- 6 files changed, 107 insertions(+), 205 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp index 0ead2289625..b9660713250 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp @@ -42,6 +42,8 @@ #include // IWYU pragma: export #include // IWYU pragma: export +#include + BEGIN_HIPCUB_NAMESPACE /** \brief Twiddling keys for radix sort. 
*/ @@ -92,8 +94,7 @@ struct RadixSortTwiddle enum { - HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH FLOAT_KEY = TraitsT::CATEGORY == FLOATING_POINT, - HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP + FLOAT_KEY = std::is_floating_point::value, }; static __device__ __forceinline__ UnsignedBits ProcessFloatMinusZero(UnsignedBits key) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp index 3f713c5768e..9d3df462afb 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp @@ -60,11 +60,6 @@ using NullType = ::rocprim::empty_type; #define _CCCL_HAS_INT128() 0 #endif -#ifndef HIPCUB_IS_INT128_ENABLED - // Deprecated [Since 4.2] - #define HIPCUB_IS_INT128_ENABLED _CCCL_HAS_INT128() -#endif // !defined(HIPCUB_IS_INT128_ENABLED) - template struct [[deprecated("[Since 1.16] If is deprecated use std::conditional instead.")]] If { @@ -155,12 +150,6 @@ struct DoubleBuffer } }; -template -struct HIPCUB_DEPRECATED_BECAUSE("Use ::std::integral_constant instead") Int2Type -{ - enum {VALUE = A}; -}; - #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template< @@ -512,45 +501,21 @@ enum Category /** * \brief Basic type traits */ -HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH -template +template struct BaseTraits -{ - /// Category - HIPCUB_DEPRECATED_BECAUSE("Use instead.") - static constexpr Category CATEGORY = _CATEGORY; - enum - { - PRIMITIVE HIPCUB_DEPRECATED_BECAUSE("Use instead.") = _PRIMITIVE, - nullptr_TYPE = _nullptr_TYPE, - }; -}; -HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP +{}; /** * Basic type traits (unsigned primitive specialization) */ -HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH -template -struct BaseTraits +template +struct BaseTraits { - using UnsignedBits = _UnsignedBits; - HIPCUB_DEPRECATED_BECAUSE("Use instead.") - static constexpr Category CATEGORY = UNSIGNED_INTEGER; + using UnsignedBits = 
_UnsignedBits; static constexpr UnsignedBits LOWEST_KEY = UnsignedBits(0); static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1); - enum - { - PRIMITIVE HIPCUB_DEPRECATED_BECAUSE("Use instead.") = true, - HIPCUB_DEPRECATED nullptr_TYPE = false, - }; - using key_codec = decltype(::rocprim::traits::get().template radix_key_codec()); static HIPCUB_HOST_DEVICE __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) @@ -579,29 +544,19 @@ struct BaseTraits return retval; } }; -HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP /** * Basic type traits (signed primitive specialization) */ -HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH -template -struct BaseTraits +template +struct BaseTraits { using UnsignedBits = _UnsignedBits; - HIPCUB_DEPRECATED_BECAUSE("Use instead.") - static constexpr Category CATEGORY = SIGNED_INTEGER; static constexpr UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); static constexpr UnsignedBits LOWEST_KEY = HIGH_BIT; static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; - enum - { - PRIMITIVE HIPCUB_DEPRECATED_BECAUSE("Use instead.") = true, - nullptr_TYPE = false, - }; - using key_codec = decltype(::rocprim::traits::get().template radix_key_codec()); static HIPCUB_HOST_DEVICE __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) @@ -626,91 +581,21 @@ struct BaseTraits return reinterpret_cast(retval); } }; -HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP - -// This API needs to be deprecated once libhipcxx is available. -template -struct FpLimits; - -// This API needs to be deprecated once libhipcxx is available. -template <> -struct FpLimits -{ - static HIPCUB_HOST_DEVICE __forceinline__ float Max() { - return std::numeric_limits::max(); - } - - static HIPCUB_HOST_DEVICE __forceinline__ float Lowest() { - return std::numeric_limits::max() * float(-1); - } -}; - -// This API needs to be deprecated once libhipcxx is available. 
-template <> -struct FpLimits -{ - static HIPCUB_HOST_DEVICE __forceinline__ double Max() { - return std::numeric_limits::max(); - } - - static HIPCUB_HOST_DEVICE __forceinline__ double Lowest() { - return std::numeric_limits::max() * double(-1); - } -}; - -// This API needs to be deprecated once libhipcxx is available. -template <> -struct FpLimits<__half> -{ - static HIPCUB_HOST_DEVICE __forceinline__ __half Max() { - unsigned short max_word = 0x7BFF; - return reinterpret_cast<__half&>(max_word); - } - - static HIPCUB_HOST_DEVICE __forceinline__ __half Lowest() { - unsigned short lowest_word = 0xFBFF; - return reinterpret_cast<__half&>(lowest_word); - } -}; - -// This API needs to be deprecated once libhipcxx is available. -template <> -struct FpLimits -{ - static HIPCUB_HOST_DEVICE __forceinline__ hip_bfloat16 Max() { - unsigned short max_word = 0x7F7F; - return reinterpret_cast(max_word); - } - - static HIPCUB_HOST_DEVICE __forceinline__ hip_bfloat16 Lowest() { - unsigned short lowest_word = 0xFF7F; - return reinterpret_cast(lowest_word); - } -}; /** * Basic type traits (fp primitive specialization) */ -HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH -template -struct BaseTraits +template +struct BaseTraits { using UnsignedBits = _UnsignedBits; - HIPCUB_DEPRECATED_BECAUSE("Use instead.") - static constexpr Category CATEGORY = FLOATING_POINT; static constexpr UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); static constexpr UnsignedBits LOWEST_KEY = UnsignedBits(-1); static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; using key_codec = decltype(::rocprim::traits::get().template radix_key_codec()); - enum - { - PRIMITIVE HIPCUB_DEPRECATED_BECAUSE("Use instead.") = true, - nullptr_TYPE = false, - }; - static HIPCUB_HOST_DEVICE __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) { UnsignedBits mask = (key & HIGH_BIT) ? 
UnsignedBits(-1) : HIGH_BIT; @@ -723,36 +608,72 @@ struct BaseTraits return key ^ mask; }; - static HIPCUB_HOST_DEVICE __forceinline__ T Max() { - return FpLimits::Max(); + static HIPCUB_HOST_DEVICE __forceinline__ + T Max() + { + return std::numeric_limits::max(); } - static HIPCUB_HOST_DEVICE __forceinline__ T Lowest() { - return FpLimits::Lowest(); + static HIPCUB_HOST_DEVICE __forceinline__ + T Lowest() + { + return std::numeric_limits::lowest(); } }; -HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP /** * \brief Numeric type traits */ -HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH -template struct NumericTraits : BaseTraits {}; +template +struct NumericTraits : BaseTraits +{}; -template <> struct NumericTraits : BaseTraits {}; +template<> +struct NumericTraits : BaseTraits +{}; -template <> struct NumericTraits : BaseTraits<(std::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; +template<> +struct NumericTraits + : BaseTraits<(std::numeric_limits::is_signed) ? 
SIGNED_INTEGER : UNSIGNED_INTEGER, + unsigned char, + char> +{}; +template<> +struct NumericTraits : BaseTraits +{}; +template<> +struct NumericTraits : BaseTraits +{}; +template<> +struct NumericTraits : BaseTraits +{}; +template<> +struct NumericTraits : BaseTraits +{}; +template<> +struct NumericTraits : BaseTraits +{}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; +template<> +struct NumericTraits + : BaseTraits +{}; +template<> +struct NumericTraits + : BaseTraits +{}; +template<> +struct NumericTraits + : BaseTraits +{}; +template<> +struct NumericTraits + : BaseTraits +{}; +template<> +struct NumericTraits + : BaseTraits +{}; #if _CCCL_HAS_INT128() template<> @@ -761,14 +682,9 @@ struct NumericTraits<__uint128_t> using T = __uint128_t; using UnsignedBits = __uint128_t; - static constexpr Category CATEGORY = UNSIGNED_INTEGER; static constexpr UnsignedBits LOWEST_KEY = UnsignedBits(0); static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1); - HIPCUB_DEPRECATED_BECAUSE("Use instead.") - static constexpr bool PRIMITIVE = false; - static constexpr bool nullptr_TYPE = false; - using key_codec = decltype(::rocprim::traits::get().template radix_key_codec()); static __host__ __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) @@ -798,15 +714,10 @@ struct NumericTraits<__int128_t> using T = __int128_t; using UnsignedBits = __uint128_t; - static constexpr Category CATEGORY = SIGNED_INTEGER; static constexpr UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); static constexpr UnsignedBits LOWEST_KEY = HIGH_BIT; static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; - HIPCUB_DEPRECATED_BECAUSE("Use instead.") - static constexpr bool PRIMITIVE = false; - static constexpr bool nullptr_TYPE = false; - using 
key_codec = decltype(::rocprim::traits::get().template radix_key_codec()); static __host__ __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) @@ -833,13 +744,24 @@ struct NumericTraits<__int128_t> }; #endif -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits<__half> : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; +template<> +struct NumericTraits : BaseTraits +{}; +template<> +struct NumericTraits : BaseTraits +{}; +template<> +struct NumericTraits<__half> : BaseTraits +{}; +template<> +struct NumericTraits + : BaseTraits +{}; -template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; -HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP +template<> +struct NumericTraits + : BaseTraits::VolatileWord, bool> +{}; /** * \brief Type traits diff --git a/projects/hipcub/test/hipcub/bfloat16.hpp b/projects/hipcub/test/hipcub/bfloat16.hpp index cc63e155b53..06e034d6b9d 100644 --- a/projects/hipcub/test/hipcub/bfloat16.hpp +++ b/projects/hipcub/test/hipcub/bfloat16.hpp @@ -95,8 +95,8 @@ struct bfloat16_t /// Constructor from unsigned long long int template::value - && (!std::is_same::value)>::type> + std::is_same_v + && (!std::is_same_v)>::type> __host__ __device__ __forceinline__ bfloat16_t(T a) { *this = bfloat16_t(float(a)); @@ -278,20 +278,15 @@ inline std::ostream& operator<<(std::ostream &out, const bfloat16_t &x) * Traits overloads ******************************************************************************/ -template <> -struct hipcub::FpLimits -{ - static __host__ __device__ __forceinline__ bfloat16_t Max() { return bfloat16_t::max(); } - - static __host__ __device__ __forceinline__ bfloat16_t Lowest() { return bfloat16_t::lowest(); } -}; - #if defined(__HIP_PLATFORM_NVIDIA__) _CCCL_SUPPRESS_DEPRECATED_PUSH #else HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH #endif -template <> struct hipcub::NumericTraits : hipcub::BaseTraits {}; +template<> +struct 
hipcub::NumericTraits + : hipcub::BaseTraits +{}; #if defined(__HIP_PLATFORM_NVIDIA__) _CCCL_SUPPRESS_DEPRECATED_POP #else diff --git a/projects/hipcub/test/hipcub/half.hpp b/projects/hipcub/test/hipcub/half.hpp index f1fa31eebcd..7bae9b418f6 100644 --- a/projects/hipcub/test/hipcub/half.hpp +++ b/projects/hipcub/test/hipcub/half.hpp @@ -331,20 +331,15 @@ inline std::ostream& operator<<(std::ostream &out, const half_t &x) * Traits overloads ******************************************************************************/ -template <> -struct hipcub::FpLimits -{ - static __host__ __device__ __forceinline__ half_t Max() { return half_t::max(); } - - static __host__ __device__ __forceinline__ half_t Lowest() { return half_t::lowest(); } -}; - #if defined(__HIP_PLATFORM_NVIDIA__) _CCCL_SUPPRESS_DEPRECATED_PUSH #else HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH #endif -template <> struct hipcub::NumericTraits : hipcub::BaseTraits {}; +template<> +struct hipcub::NumericTraits + : hipcub::BaseTraits +{}; #if defined(__HIP_PLATFORM_NVIDIA__) _CCCL_SUPPRESS_DEPRECATED_POP #else diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp index 9c041765180..70934d35940 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp @@ -87,9 +87,8 @@ class HipcubDeviceRadixSort : public ::testing::Test TYPED_TEST_SUITE_P(HipcubDeviceRadixSort); template -auto generate_key_input(size_t size, unsigned int seed_value) HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH - -> std::enable_if_t::CATEGORY == hipcub::FLOATING_POINT, - std::vector> HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP +auto generate_key_input(size_t size, unsigned int seed_value) + -> std::enable_if_t::value, std::vector> { auto result = test_utils::get_random_data(size, test_utils::numeric_limits::min(), @@ -100,9 +99,8 @@ auto generate_key_input(size_t size, unsigned int seed_value) 
HIPCUB_CLANG_SUPPR } template -auto generate_key_input(size_t size, unsigned int seed_value) HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH - -> std::enable_if_t::CATEGORY != hipcub::FLOATING_POINT, - std::vector> HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP +auto generate_key_input(size_t size, unsigned int seed_value) + -> std::enable_if_t::value, std::vector> { using inner_t = typename test_utils::inner_type::type; return test_utils::get_random_data(size, diff --git a/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp b/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp index 27fc65c4076..cf0927de945 100644 --- a/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp +++ b/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp @@ -45,11 +45,7 @@ namespace detail template::CATEGORY == hipcub::SIGNED_INTEGER - || hipcub::NumericTraits::CATEGORY == hipcub::UNSIGNED_INTEGER, - int> HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP - = 0> + std::enable_if_t::value, int> = 0> Key to_bits(const Key key) { static constexpr Key radix_mask_upper @@ -63,10 +59,8 @@ Key to_bits(const Key key) template::CATEGORY == hipcub::FLOATING_POINT, int> - = 0> -HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP auto to_bits(const Key key) + std::enable_if_t::value, int> = 0> +auto to_bits(const Key key) { using unsigned_bits_type = typename hipcub::NumericTraits::UnsignedBits; @@ -112,15 +106,12 @@ auto to_bits(const Key& key) auto bit_key_lower = static_cast(to_bits<0, sizeof(key.y) * 8>(key.y)); // Flip sign bit to properly order signed types - HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH - if(::hipcub::NumericTraits::CATEGORY == hipcub::SIGNED_INTEGER) - HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP - { - constexpr auto sign_bit = static_cast(1) - << (sizeof(inner_t) * 8 - 1); - bit_key_upper ^= sign_bit; - bit_key_lower ^= sign_bit; - } + if(std::is_signed::value) + { + constexpr auto sign_bit = static_cast(1) << (sizeof(inner_t) * 8 - 1); + bit_key_upper ^= sign_bit; + bit_key_lower ^= sign_bit; + } // Create the 
result containing both parts const auto bit_key From 6a621f0dbc60b08528d2a4b5e61ed95fea84c1c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 13 Nov 2025 15:04:44 +0000 Subject: [PATCH 20/95] Drop ``GridBarrier`` --- projects/hipcub/CHANGELOG.md | 1 + .../include/hipcub/grid/grid_barrier.hpp | 40 ----------- .../hipcub/test/hipcub/test_hipcub_grid.cpp | 71 ------------------- 3 files changed, 1 insertion(+), 111 deletions(-) delete mode 100644 projects/hipcub/hipcub/include/hipcub/grid/grid_barrier.hpp diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index 63b70c96fcb..c292f0d6a72 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -21,6 +21,7 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project ### Removed * Removed `DeviceSpmv`, which was removed from CUB after CCCL's 2.8.0 release. Use `hipSPARSE` or `rocSPARSE` libraries instead. +* Removed `GridBarrier`. * Removed `LEGACY_PTX_ARCH`. * Removed `hipcub:max` and `hipcub:min`, which were deprecated. Use `hip::std::max` and `hip::std::min` instead. * Deprecated `hipcub::Swap`, use `rocprim::swap` instead. diff --git a/projects/hipcub/hipcub/include/hipcub/grid/grid_barrier.hpp b/projects/hipcub/hipcub/include/hipcub/grid/grid_barrier.hpp deleted file mode 100644 index 8c8863be647..00000000000 --- a/projects/hipcub/hipcub/include/hipcub/grid/grid_barrier.hpp +++ /dev/null @@ -1,40 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021-2025, Advanced Micro Devices, Inc. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -#ifndef HIPCUB_GRID_GRID_BARRIER_HPP_ -#define HIPCUB_GRID_GRID_BARRIER_HPP_ - -#ifdef __HIP_PLATFORM_AMD__ - #include "../backend/rocprim/grid/grid_barrier.hpp" // IWYU pragma: export -#elif defined(__HIP_PLATFORM_NVIDIA__) - #include "../backend/cub/grid/grid_barrier.hpp" // IWYU pragma: export - #include "../config.hpp" -#endif - -#endif // HIPCUB_GRID_GRID_BARRIER_HPP_ diff --git a/projects/hipcub/test/hipcub/test_hipcub_grid.cpp b/projects/hipcub/test/hipcub/test_hipcub_grid.cpp index b7ff8715c98..654bf550e32 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_grid.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_grid.cpp @@ -32,80 +32,9 @@ #include #include -#include #include #include -#if defined(__HIP_PLATFORM_NVIDIA__) -_CCCL_SUPPRESS_DEPRECATED_PUSH -#else -HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH -#endif -__global__ -void KernelGridBarrier(hipcub::GridBarrier global_barrier, int iterations) -#if defined(__HIP_PLATFORM_NVIDIA__) - _CCCL_SUPPRESS_DEPRECATED_POP -#else - HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP -#endif -{ - for (int i = 0; i < iterations; i++) - { - global_barrier.Sync(); - } -} - -TEST(HipcubGridTests, GridBarrier) -{ - int device_id = test_common_utils::obtain_device_from_ctest(); - SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); - HIP_CHECK(hipSetDevice(device_id)); - - constexpr int32_t block_size = 256; - // NOTE increasing iterations will cause huge latency for tests - constexpr int32_t iterations = 3; - int32_t grid_size = -1; - - int32_t sm_count; - int32_t max_block_threads; - int32_t max_sm_occupancy; - - HIP_CHECK(hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, device_id)); - HIP_CHECK(hipDeviceGetAttribute(&max_block_threads, hipDeviceAttributeMaxThreadsPerBlock, device_id)); - - HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_sm_occupancy, - KernelGridBarrier, - 
HIPCUB_HOST_WARP_THREADS, - 0)); - - int32_t occupancy = std::min((max_block_threads / block_size), max_sm_occupancy); - - if (grid_size == -1) - { - grid_size = occupancy * sm_count; - } - else - { - occupancy = grid_size / sm_count; - } -#if defined(__HIP_PLATFORM_NVIDIA__) - _CCCL_SUPPRESS_DEPRECATED_PUSH -#else - HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH -#endif - hipcub::GridBarrierLifetime global_barrier; -#if defined(__HIP_PLATFORM_NVIDIA__) - _CCCL_SUPPRESS_DEPRECATED_POP -#else - HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP -#endif - HIP_CHECK(global_barrier.Setup(grid_size)); - - KernelGridBarrier<<>>(global_barrier, iterations); - HIP_CHECK(hipGetLastError()); -} - template< int32_t BlockSize, class T, From ef75cb4ce1c26288c2de51c3e2b165b6620abc41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 13 Nov 2025 15:14:57 +0000 Subject: [PATCH 21/95] Replace pre-c++17 traits with modern ones --- .../benchmark_block_adjacent_difference.cpp | 10 +- projects/hipcub/benchmark/benchmark_utils.hpp | 3 +- .../hipcub/benchmark/benchmark_warp_scan.cpp | 7 +- .../rocprim/device/device_histogram.hpp | 18 +-- .../device/device_segmented_reduce.hpp | 8 +- .../rocprim/thread/thread_operators.hpp | 2 +- projects/hipcub/test/hipcub/half.hpp | 4 +- .../test_hipcub_block_adjacent_difference.cpp | 10 +- .../test_hipcub_block_discontinuity.cpp | 24 ++-- .../test_hipcub_block_load_store.kernels.hpp | 4 +- .../hipcub/test_hipcub_block_radix_rank.cpp | 2 +- .../test/hipcub/test_hipcub_device_for.cpp | 2 +- .../test/hipcub/test_hipcub_device_reduce.cpp | 8 +- .../test/hipcub/test_hipcub_device_scan.cpp | 36 +++--- .../test/hipcub/test_hipcub_device_select.cpp | 6 +- .../test/hipcub/test_hipcub_warp_exchange.cpp | 2 +- projects/hipcub/test/hipcub/test_utils.hpp | 119 +++++++++--------- .../test/hipcub/test_utils_assertions.hpp | 28 +++-- .../hipcub/test_utils_data_generation.hpp | 20 +-- .../hipcub/test_utils_thread_operators.hpp | 18 +-- 20 files changed, 168 
insertions(+), 163 deletions(-) diff --git a/projects/hipcub/benchmark/benchmark_block_adjacent_difference.cpp b/projects/hipcub/benchmark/benchmark_block_adjacent_difference.cpp index 7c7ac6bc193..3679076910a 100644 --- a/projects/hipcub/benchmark/benchmark_block_adjacent_difference.cpp +++ b/projects/hipcub/benchmark/benchmark_block_adjacent_difference.cpp @@ -221,8 +221,8 @@ template auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) - -> std::enable_if_t::value - && !std::is_same::value> + -> std::enable_if_t + && !std::is_same_v> { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto num_blocks = (N + items_per_block - 1) / items_per_block; @@ -271,8 +271,8 @@ template auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) - -> std::enable_if_t::value - || std::is_same::value> + -> std::enable_if_t + || std::is_same_v> { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto num_blocks = (N + items_per_block - 1) / items_per_block; @@ -352,7 +352,7 @@ void add_benchmarks(const std::string& name, BENCHMARK_TYPE(long long, 256, false), BENCHMARK_TYPE(double, 256, false)}; - if(!std::is_same::value) + if(!std::is_same_v) { bs.insert(bs.end(), {BENCHMARK_TYPE(int, 256, true), diff --git a/projects/hipcub/benchmark/benchmark_utils.hpp b/projects/hipcub/benchmark/benchmark_utils.hpp index 7d4085a4c41..52dc87c18bd 100644 --- a/projects/hipcub/benchmark/benchmark_utils.hpp +++ b/projects/hipcub/benchmark/benchmark_utils.hpp @@ -357,8 +357,7 @@ inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = template inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) -> - typename std::enable_if::value - && !std::is_same::value, + typename std::enable_if::value && !std::is_same_v, std::vector>::type { diff --git a/projects/hipcub/benchmark/benchmark_warp_scan.cpp b/projects/hipcub/benchmark/benchmark_warp_scan.cpp index 
db3fe941f6c..4d05b604be8 100644 --- a/projects/hipcub/benchmark/benchmark_warp_scan.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_scan.cpp @@ -228,9 +228,8 @@ template auto add_benchmarks(std::vector& benchmarks, const std::string& method_name, hipStream_t stream, - size_t size) - -> std::enable_if_t::value - || std::is_same::value> + size_t size) -> std::enable_if_t + || std::is_same_v> { using custom_double2 = benchmark_utils::custom_type; using custom_int_double = benchmark_utils::custom_type; @@ -249,7 +248,7 @@ template auto add_benchmarks(std::vector& benchmarks, const std::string& method_name, hipStream_t stream, - size_t size) -> std::enable_if_t::value> + size_t size) -> std::enable_if_t> { using custom_double2 = benchmark_utils::custom_type; using custom_int_double = benchmark_utils::custom_type; diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp index e1422c0f2a6..a67a139eca3 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp @@ -67,18 +67,18 @@ HIPCUB_FORCEINLINE bool may_overflow(LevelT lower_level, template struct int_arithmetic_t { - using type = ::std::conditional_t< - sizeof(SampleT) + sizeof(CommonT) <= sizeof(uint32_t), - uint32_t, + using type + = ::std::conditional_t::value - || ::std::is_same::value), - CommonT, - uint64_t> + ::std::conditional_t<(::std::is_same_v + || ::std::is_same_v), + CommonT, + uint64_t> #else - uint64_t + uint64_t #endif - >; + >; }; // If potential overflow is detected, returns hipErrorInvalidValue, otherwise hipSuccess. 
diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp index d698db38dec..408df01e34d 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp @@ -353,8 +353,8 @@ struct DeviceSegmentedReduce using OffsetT = int; using T = typename std::iterator_traits::value_type; using O = typename std::iterator_traits::value_type; - using OutputTupleT = typename std:: - conditional::value, KeyValuePair, O>::type; + using OutputTupleT = + typename std::conditional, KeyValuePair, O>::type; using OutputValueT = typename OutputTupleT::Value; using IteratorT = ArgIndexInputIterator; @@ -465,8 +465,8 @@ struct DeviceSegmentedReduce using OffsetT = int; using T = typename std::iterator_traits::value_type; using O = typename std::iterator_traits::value_type; - using OutputTupleT = typename std:: - conditional::value, KeyValuePair, O>::type; + using OutputTupleT = + typename std::conditional, KeyValuePair, O>::type; using OutputValueT = typename OutputTupleT::Value; using IteratorT = ArgIndexInputIterator; diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp index df8a143ad4d..7b967cefac9 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp @@ -559,7 +559,7 @@ namespace detail // Non-void value type. template using non_void_value_t = - typename std::conditional::value, FallbackT, IteratorT>::type; + typename std::conditional, FallbackT, IteratorT>::type; /// Intermediate accumulator type. 
template diff --git a/projects/hipcub/test/hipcub/half.hpp b/projects/hipcub/test/hipcub/half.hpp index 7bae9b418f6..b0501dddc19 100644 --- a/projects/hipcub/test/hipcub/half.hpp +++ b/projects/hipcub/test/hipcub/half.hpp @@ -84,8 +84,8 @@ struct half_t /// Constructor from unsigned long long int template::value - && (!std::is_same::value)>::type> + std::is_same_v + && (!std::is_same_v)>::type> __host__ __device__ __forceinline__ half_t(T a) { *this = half_t(float(a)); diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp index b4c0e1a7f7d..3b0c204e7cd 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp @@ -254,7 +254,7 @@ TYPED_TEST(HipcubBlockAdjacentDifferenceSubtract, SubtractLeft) using output_type = typename TestFixture::params_subtract::output; - using stored_type = std::conditional_t::value, int, output_type>; + using stored_type = std::conditional_t, int, output_type>; constexpr size_t block_size = TestFixture::params_subtract::block_size; constexpr size_t items_per_thread = TestFixture::params_subtract::items_per_thread; @@ -356,7 +356,7 @@ TYPED_TEST(HipcubBlockAdjacentDifferenceSubtract, SubtractLeftPartialTile) using output_type = typename TestFixture::params_subtract::output; - using stored_type = std::conditional_t::value, int, output_type>; + using stored_type = std::conditional_t, int, output_type>; constexpr size_t block_size = TestFixture::params_subtract::block_size; constexpr size_t items_per_thread = TestFixture::params_subtract::items_per_thread; @@ -478,7 +478,7 @@ TYPED_TEST(HipcubBlockAdjacentDifferenceSubtract, SubtractRight) using output_type = typename TestFixture::params_subtract::output; - using stored_type = std::conditional_t::value, int, output_type>; + using stored_type = std::conditional_t, int, output_type>; constexpr size_t 
block_size = TestFixture::params_subtract::block_size; constexpr size_t items_per_thread = TestFixture::params_subtract::items_per_thread; @@ -580,7 +580,7 @@ TYPED_TEST(HipcubBlockAdjacentDifferenceSubtract, SubtractRightPartialTile) using output_type = typename TestFixture::params_subtract::output; - using stored_type = std::conditional_t::value, int, output_type>; + using stored_type = std::conditional_t, int, output_type>; constexpr size_t block_size = TestFixture::params_subtract::block_size; constexpr size_t items_per_thread = TestFixture::params_subtract::items_per_thread; @@ -683,7 +683,7 @@ TYPED_TEST(HipcubBlockAdjacentDifferenceSubtract, SubtractRightPartialTile) ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, is_add_op::value ? _HIPCUB_STD::max(test_utils::precision::value, test_utils::precision::value) - : std::is_same::value + : std::is_same_v ? 0 : test_utils::precision::value)); // clang-format on diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp index 0bed699bdef..28765be3a9a 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp @@ -146,10 +146,10 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeads) using type = typename TestFixture::params::type; // std::vector is a special case that will cause an error in hipMemcpy - using stored_flag_type = typename std::conditional< - std::is_same::value, - int, - typename TestFixture::params::flag_type>::type; + using stored_flag_type = + typename std::conditional, + int, + typename TestFixture::params::flag_type>::type; using flag_type = typename TestFixture::params::flag_type; using flag_op_type = typename TestFixture::params::flag_op_type; constexpr size_t block_size = TestFixture::params::block_size; @@ -284,10 +284,10 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagTails) using type = typename 
TestFixture::params::type; // std::vector is a special case that will cause an error in hipMemcpy - using stored_flag_type = typename std::conditional< - std::is_same::value, - int, - typename TestFixture::params::flag_type>::type; + using stored_flag_type = + typename std::conditional, + int, + typename TestFixture::params::flag_type>::type; using flag_type = typename TestFixture::params::flag_type; using flag_op_type = typename TestFixture::params::flag_op_type; constexpr size_t block_size = TestFixture::params::block_size; @@ -451,10 +451,10 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeadsAndTails) using type = typename TestFixture::params::type; // std::vector is a special case that will cause an error in hipMemcpy - using stored_flag_type = typename std::conditional< - std::is_same::value, - int, - typename TestFixture::params::flag_type>::type; + using stored_flag_type = + typename std::conditional, + int, + typename TestFixture::params::flag_type>::type; using flag_type = typename TestFixture::params::flag_type; using flag_op_type = typename TestFixture::params::flag_op_type; constexpr size_t block_size = TestFixture::params::block_size; diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.kernels.hpp b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.kernels.hpp index 4dc2282294d..a126893664c 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.kernels.hpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.kernels.hpp @@ -152,8 +152,8 @@ __launch_bounds__(BlockSize) __global__ // The output value type using OutputT = typename std::conditional< - (std::is_same::value_type, - void>::value), // OutputT = (if output iterator's value type is void) ? + (std::is_same_v::value_type, + void>), // OutputT = (if output iterator's value type is void) ? typename std::iterator_traits::value_type, // ... 
then the input iterator's // value type, typename std::iterator_traits::value_type>:: diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp index 9e5ca6ccb38..5840b0939f6 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp @@ -640,7 +640,7 @@ void test_radix_rank_with_prefix_sum_output() constexpr unsigned end_bit = start_bit + radix_bits; constexpr size_t items_per_block = block_size * items_per_thread; - if constexpr(std::is_same::value) + if constexpr(std::is_same_v) { // Given block size not supported diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp index 94fa2f8e59a..63a07db720b 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp @@ -1087,7 +1087,7 @@ struct offset_count_device_t HIPCUB_DEVICE void operator()(OffsetT i) { - static_assert(std::is_same::value, "T and OffsetT must be the same type"); + static_assert(std::is_same_v, "T and OffsetT must be the same type"); atomicAdd(d_count + i, 1); } }; diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp index 2866cdd76d0..f3d0cc31ab8 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp @@ -138,8 +138,8 @@ TYPED_TEST(HipcubDeviceReduceTests, ReduceSum) size_t temp_storage_size_bytes; void* d_temp_storage = nullptr; // Get size of d_temp_storage - if constexpr(std::is_same::value - || std::is_same::value) + if constexpr(std::is_same_v + || std::is_same_v) { HIP_CHECK(hipcub::DeviceReduce::Reduce(d_temp_storage, temp_storage_size_bytes, @@ -172,8 +172,8 @@ TYPED_TEST(HipcubDeviceReduceTests, ReduceSum) gHelper.startStreamCapture(stream); 
// Run - if constexpr(std::is_same::value - || std::is_same::value) + if constexpr(std::is_same_v + || std::is_same_v) { HIP_CHECK(hipcub::DeviceReduce::Reduce(d_temp_storage, temp_storage_size_bytes, diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp index e601b00602c..8d9ab74a6ba 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp @@ -79,9 +79,8 @@ using HipcubDeviceScanTestsParams template struct accum_type { - static constexpr bool is_low_precision - = std::is_same::value - || std::is_same::value; + static constexpr bool is_low_precision = std::is_same_v + || std::is_same_v; static constexpr bool is_add = test_utils::is_add_operator::value; using type = typename std::conditional_t; }; @@ -122,7 +121,7 @@ TYPED_TEST(HipcubDeviceScanTests, AccumulatorTypeTest) using T = hipcub::detail::accumulator_t; using U = typename TestFixture::input_type; - static_assert(std::is_same::value, "accumulator type mismatch"); + static_assert(std::is_same_v, "accumulator type mismatch"); ASSERT_TRUE(true); } @@ -141,7 +140,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScan) using is_add_op = test_utils::is_add_operator; using acc_type = typename accum_type::type; using IteratorType = rocprim::transform_iterator, acc_type>; - constexpr bool inplace = std::is_same::value && std::is_same::value; + constexpr bool inplace = std::is_same_v && std::is_same_v; // for non-associative operations in inclusive scan // intermediate results use the type of input iterator, then @@ -212,7 +211,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScan) auto call = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { - if HIPCUB_IF_CONSTEXPR(std::is_same::value) + if HIPCUB_IF_CONSTEXPR(std::is_same_v) { if HIPCUB_IF_CONSTEXPR(inplace) { @@ -333,7 +332,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScanInit) using is_add_op = 
test_utils::is_add_operator; using acc_type = typename accum_type::type; using IteratorType = rocprim::transform_iterator, acc_type>; - constexpr bool inplace = std::is_same::value && std::is_same::value; + constexpr bool inplace = std::is_same_v && std::is_same_v; // for non-associative operations in inclusive scan // intermediate results use the type of input iterator, then @@ -596,7 +595,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScanByKey) size_t temp_storage_size_bytes{}; void* d_temp_storage = nullptr; // Get size of d_temp_storage - if(std::is_same::value) + if(std::is_same_v) { HIP_CHECK(hipcub::DeviceScan::InclusiveSumByKey(d_temp_storage, temp_storage_size_bytes, @@ -632,7 +631,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScanByKey) gHelper.startStreamCapture(stream); // Run - if(std::is_same::value) + if(std::is_same_v) { HIP_CHECK(hipcub::DeviceScan::InclusiveSumByKey(d_temp_storage, temp_storage_size_bytes, @@ -702,7 +701,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScan) using is_add_op = test_utils::is_add_operator; using acc_type = typename accum_type::type; using IteratorType = rocprim::transform_iterator, acc_type>; - constexpr bool inplace = std::is_same::value && std::is_same::value; + constexpr bool inplace = std::is_same_v && std::is_same_v; // for non-associative operations in inclusive scan // intermediate results use the type of input iterator, then @@ -764,7 +763,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScan) // Calculate expected results on host std::vector expected(input.size()); const T initial_value - = std::is_same::value + = std::is_same_v ? 
test_utils::convert_to_device(0) : test_utils::get_random_value(test_utils::convert_to_device(1), test_utils::convert_to_device(100), @@ -783,7 +782,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScan) auto call = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { - if HIPCUB_IF_CONSTEXPR(std::is_same::value) + if HIPCUB_IF_CONSTEXPR(std::is_same_v) { if HIPCUB_IF_CONSTEXPR(inplace) { @@ -958,7 +957,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScanByKey) test_utils::convert_to_device(10), seed_value); T initial_value = initial_value_vector.front(); - if(std::is_same::value) + if(std::is_same_v) { initial_value = test_utils::convert_to_device(0); } @@ -998,7 +997,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScanByKey) size_t temp_storage_size_bytes; void* d_temp_storage = nullptr; // Get size of d_temp_storage - if(std::is_same::value) + if(std::is_same_v) { HIP_CHECK(hipcub::DeviceScan::ExclusiveSumByKey(d_temp_storage, temp_storage_size_bytes, @@ -1035,7 +1034,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScanByKey) gHelper.startStreamCapture(stream); // Run - if(std::is_same::value) + if(std::is_same_v) { HIP_CHECK(hipcub::DeviceScan::ExclusiveSumByKey(d_temp_storage, temp_storage_size_bytes, @@ -1337,12 +1336,11 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScanFuture) const auto future_initial_value = hipcub::FutureValue{d_initial_value}; // Check the provided aliases to be correct at compile-time - static_assert( - std::is_same::value, - "The futures value type is expected to be U"); + static_assert(std::is_same_v, + "The futures value type is expected to be U"); static_assert( - std::is_same::value, + std::is_same_v, "The futures iterator type is expected to be U*"); // temp storage diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp index 02dc3628002..6d6d75be103 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp +++ 
b/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp @@ -75,7 +75,7 @@ TYPED_TEST(HipcubDeviceSelectTests, Flagged) using U = typename TestFixture::output_type; using F = typename TestFixture::flag_type; - constexpr bool inplace = std::is_same::value; + constexpr bool inplace = std::is_same_v; hipStream_t stream = 0; // default if(TestFixture::use_graphs) @@ -337,7 +337,7 @@ TYPED_TEST(HipcubDeviceSelectTests, SelectOp) using T = typename TestFixture::input_type; using U = typename TestFixture::output_type; - constexpr bool inplace = std::is_same::value; + constexpr bool inplace = std::is_same_v; hipStream_t stream = 0; // default if(TestFixture::use_graphs) @@ -492,7 +492,7 @@ TYPED_TEST(HipcubDeviceSelectTests, FlaggedIf) using U = typename TestFixture::output_type; using F = typename TestFixture::flag_type; - constexpr bool inplace = std::is_same::value; + constexpr bool inplace = std::is_same_v; hipStream_t stream = 0; // default if(TestFixture::use_graphs) diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp index 30adcdbb573..6fc5171b525 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp @@ -222,7 +222,7 @@ std::enable_if_t> run_warp_exch input[i] = test_utils::convert_to_device(i); } std::vector expected; - if(std::is_same::value) + if(std::is_same_v) { expected = input; input = stripe_vector(input, warp_size, items_per_thread); diff --git a/projects/hipcub/test/hipcub/test_utils.hpp b/projects/hipcub/test/hipcub/test_utils.hpp index 88eac81521a..57ab07e6883 100644 --- a/projects/hipcub/test/hipcub/test_utils.hpp +++ b/projects/hipcub/test/hipcub/test_utils.hpp @@ -178,35 +178,33 @@ OutputIt host_inclusive_scan_init( return host_inclusive_scan_impl(first, last, d_first, op, init_value); } -template::value_type, - test_utils::bfloat16>::value - || std::is_same::value_type, - 
test_utils::half>::value - || std::is_same::value_type, float>::value, - bool> - = true> +template< + class InputIt, + class OutputIt, + class T, + std::enable_if_t< + std::is_same_v::value_type, test_utils::bfloat16> + || std::is_same_v::value_type, test_utils::half> + || std::is_same_v::value_type, float>, + bool> + = true> OutputIt host_inclusive_scan(InputIt first, InputIt last, OutputIt d_first, test_utils::plus) { using acc_type = double; return host_inclusive_scan_impl(first, last, d_first, test_utils::plus(), acc_type{}); } -template::value_type, - test_utils::bfloat16>::value - || std::is_same::value_type, - test_utils::half>::value - || std::is_same::value_type, float>::value, - bool> - = true> +template< + class InputIt, + class OutputIt, + class InitType, + class T, + std::enable_if_t< + std::is_same_v::value_type, test_utils::bfloat16> + || std::is_same_v::value_type, test_utils::half> + || std::is_same_v::value_type, float>, + bool> + = true> OutputIt host_inclusive_scan_init( InputIt first, InputIt last, OutputIt d_first, InitType init_value, test_utils::plus) { @@ -246,18 +244,17 @@ OutputIt host_exclusive_scan( return host_exclusive_scan_impl(first, last, initial_value, d_first, op, acc_type{}); } -template::value_type, - test_utils::bfloat16>::value - || std::is_same::value_type, - test_utils::half>::value - || std::is_same::value_type, float>::value, - bool> - = true> +template< + class InputIt, + class T, + class OutputIt, + class U, + std::enable_if_t< + std::is_same_v::value_type, test_utils::bfloat16> + || std::is_same_v::value_type, test_utils::half> + || std::is_same_v::value_type, float>, + bool> + = true> OutputIt host_exclusive_scan( InputIt first, InputIt last, T initial_value, OutputIt d_first, test_utils::plus) { @@ -332,20 +329,19 @@ OutputIt host_exclusive_scan_by_key(InputIt first, acc_type{}); } -template::value_type, - test_utils::bfloat16>::value - || std::is_same::value_type, - test_utils::half>::value - || 
std::is_same::value_type, float>::value, - bool> - = true> +template< + class InputIt, + class KeyIt, + class T, + class OutputIt, + class U, + class KeyCompare, + std::enable_if_t< + std::is_same_v::value_type, test_utils::bfloat16> + || std::is_same_v::value_type, test_utils::half> + || std::is_same_v::value_type, float>, + bool> + = true> OutputIt host_exclusive_scan_by_key(InputIt first, InputIt last, KeyIt k_first, @@ -417,19 +413,18 @@ OutputIt host_inclusive_scan_by_key(InputIt first, acc_type{}); } -template::value_type, - test_utils::bfloat16>::value - || std::is_same::value_type, - test_utils::half>::value - || std::is_same::value_type, float>::value, - bool> - = true> +template< + class InputIt, + class KeyIt, + class OutputIt, + class U, + class KeyCompare, + std::enable_if_t< + std::is_same_v::value_type, test_utils::bfloat16> + || std::is_same_v::value_type, test_utils::half> + || std::is_same_v::value_type, float>, + bool> + = true> OutputIt host_inclusive_scan_by_key(InputIt first, InputIt last, KeyIt k_first, diff --git a/projects/hipcub/test/hipcub/test_utils_assertions.hpp b/projects/hipcub/test/hipcub/test_utils_assertions.hpp index 7e6b5e31b6c..57574f5229a 100644 --- a/projects/hipcub/test/hipcub/test_utils_assertions.hpp +++ b/projects/hipcub/test/hipcub/test_utils_assertions.hpp @@ -137,9 +137,13 @@ inline auto assert_near(const std::vector& result, const std::vector& expe } } -template::value || - std::is_same::value, bool> = true> -inline void assert_near(const std::vector& result, const std::vector& expected, const float percent) +template< + class T, + std::enable_if_t || std::is_same_v, + bool> + = true> +inline void + assert_near(const std::vector& result, const std::vector& expected, const float percent) { ASSERT_EQ(result.size(), expected.size()); for(size_t i = 0; i < result.size(); i++) @@ -176,9 +180,14 @@ inline auto assert_near(const std::vector>& result, const st } } -template::value || - std::is_same::value, bool> = true> 
-inline void assert_near(const std::vector>& result, const std::vector>& expected, const float percent) +template< + class T, + std::enable_if_t || std::is_same_v, + bool> + = true> +inline void assert_near(const std::vector>& result, + const std::vector>& expected, + const float percent) { ASSERT_EQ(result.size(), expected.size()); for(size_t i = 0; i < result.size(); i++) @@ -209,8 +218,11 @@ inline auto assert_near(const T& result, const T& expected, const float) ASSERT_EQ(result, expected); } -template::value || - std::is_same::value, bool> = true> +template< + class T, + std::enable_if_t || std::is_same_v, + bool> + = true> inline void assert_near(const T& result, const T& expected, const float percent) { if(bit_equal(result, expected)) return; // Check to also regard equality of NaN's, -NaN, +inf, -inf as correct. diff --git a/projects/hipcub/test/hipcub/test_utils_data_generation.hpp b/projects/hipcub/test/hipcub/test_utils_data_generation.hpp index 8edfebb34d4..eb3af3ee54a 100644 --- a/projects/hipcub/test/hipcub/test_utils_data_generation.hpp +++ b/projects/hipcub/test/hipcub/test_utils_data_generation.hpp @@ -333,12 +333,12 @@ void add_special_values(std::vector& source, int seed_value) // Actually causes problems with signed/unsigned char on Windows using clang. 
template struct is_valid_for_int_distribution - : std::integral_constant< - bool, - std::is_same::value || std::is_same::value - || std::is_same::value || std::is_same::value - || std::is_same::value || std::is_same::value - || std::is_same::value || std::is_same::value> + : std::integral_constant || std::is_same_v + || std::is_same_v || std::is_same_v + || std::is_same_v || std::is_same_v + || std::is_same_v + || std::is_same_v> {}; template @@ -360,8 +360,8 @@ inline auto get_random_data(size_t size, T min, T max, int seed_value) -> template inline auto get_random_data(size_t size, S min, U max, int seed_value) -> typename std::enable_if::value && !is_custom_test_type::value - && !std::is_same::value - && !std::is_same::value, + && !std::is_same_v + && !std::is_same_v, std::vector>::type { std::default_random_engine gen(seed_value); @@ -376,7 +376,7 @@ inline auto get_random_data(size_t size, S min, U max, int seed_value) -> template inline auto get_random_data(size_t size, S min, U max, int seed_value) -> - typename std::enable_if::value, std::vector>::type + typename std::enable_if, std::vector>::type { std::default_random_engine gen(seed_value); std::uniform_int_distribution distribution(static_cast(min), @@ -390,7 +390,7 @@ inline auto get_random_data(size_t size, S min, U max, int seed_value) -> template inline auto get_random_data(size_t size, S min, U max, int seed_value) -> - typename std::enable_if::value, std::vector>::type + typename std::enable_if, std::vector>::type { std::default_random_engine gen(seed_value); std::uniform_int_distribution distribution(static_cast(min), diff --git a/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp b/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp index 8a85a5f4dd1..335565e2dd9 100644 --- a/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp +++ b/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp @@ -155,12 +155,13 @@ struct ArgMax { template::value - || std::is_same::value, + 
std::enable_if_t + || std::is_same_v, bool> = true> - HIPCUB_HOST_DEVICE __forceinline__ hipcub::KeyValuePair - operator()(const hipcub::KeyValuePair& a, + HIPCUB_HOST_DEVICE __forceinline__ + hipcub::KeyValuePair + operator()(const hipcub::KeyValuePair& a, const hipcub::KeyValuePair& b) const { const hipcub::KeyValuePair native_a(a.key, a.value); @@ -179,12 +180,13 @@ struct ArgMin { template::value - || std::is_same::value, + std::enable_if_t + || std::is_same_v, bool> = true> - HIPCUB_HOST_DEVICE __forceinline__ hipcub::KeyValuePair - operator()(const hipcub::KeyValuePair& a, + HIPCUB_HOST_DEVICE __forceinline__ + hipcub::KeyValuePair + operator()(const hipcub::KeyValuePair& a, const hipcub::KeyValuePair& b) const { const hipcub::KeyValuePair native_a(a.key, a.value); From ca88b0e0a085df6b94612438c4b7f9a949a65848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 13 Nov 2025 15:21:07 +0000 Subject: [PATCH 22/95] Replace ``HIPCUB_IF_CONSTEXPR`` --- projects/hipcub/examples/example_utils.hpp | 2 +- .../device/device_segmented_reduce.hpp | 2 +- .../hipcub/backend/rocprim/util_sync.hpp | 2 +- .../hipcub/hipcub/include/hipcub/config.hpp | 13 ---------- .../test/hipcub/test_hipcub_device_scan.cpp | 26 +++++++++---------- .../test/hipcub/test_hipcub_device_select.cpp | 18 ++++++------- .../hipcub/test_hipcub_warp_merge_sort.cpp | 8 +++--- 7 files changed, 29 insertions(+), 42 deletions(-) diff --git a/projects/hipcub/examples/example_utils.hpp b/projects/hipcub/examples/example_utils.hpp index 1248a854cab..f2ed2df11ef 100644 --- a/projects/hipcub/examples/example_utils.hpp +++ b/projects/hipcub/examples/example_utils.hpp @@ -591,7 +591,7 @@ void RandomBits( memcpy(&key, word_buff, sizeof(K)); K copy = key; - if HIPCUB_IF_CONSTEXPR(std::is_floating_point::value) + if constexpr(std::is_floating_point::value) #ifndef _WIN32 if(!std::isnan(copy)) #else diff --git 
a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp index 408df01e34d..d4e01126cc8 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp @@ -156,7 +156,7 @@ inline hipError_t segmented_arg_minmax(void* temporary_storage, std::chrono::high_resolution_clock::time_point start; - if HIPCUB_IF_CONSTEXPR(HIPCUB_DETAIL_DEBUG_SYNC_VALUE) + if constexpr(HIPCUB_DETAIL_DEBUG_SYNC_VALUE) { start = std::chrono::high_resolution_clock::now(); } diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_sync.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_sync.hpp index 0ea8de31641..22235ea2af8 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_sync.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_sync.hpp @@ -41,7 +41,7 @@ { \ return _error; \ } \ - if HIPCUB_IF_CONSTEXPR(HIPCUB_DETAIL_DEBUG_SYNC_VALUE) \ + if constexpr(HIPCUB_DETAIL_DEBUG_SYNC_VALUE) \ { \ std::cout << name << "(" << size << ")"; \ auto __error = hipStreamSynchronize(stream); \ diff --git a/projects/hipcub/hipcub/include/hipcub/config.hpp b/projects/hipcub/hipcub/include/hipcub/config.hpp index 9c9be1708df..97c8fe0cdee 100644 --- a/projects/hipcub/hipcub/include/hipcub/config.hpp +++ b/projects/hipcub/hipcub/include/hipcub/config.hpp @@ -208,19 +208,6 @@ END_HIPCUB_NAMESPACE #define HipcubLog(msg) ::hipcub::Log(msg, __FILE__, __LINE__) #endif -#if __cpp_if_constexpr - #define HIPCUB_IF_CONSTEXPR constexpr -#else - #if defined(_MSC_VER) && !defined(__clang__) - // MSVC (and not Clang pretending to be MSVC) unconditionally exposes if constexpr, - // moreover it triggers warning C4127 (conditional expression is constant) when not using it. 
nvcc will - // be calling cl.exe for host-side codegen. - #define HIPCUB_IF_CONSTEXPR constexpr - #else - #define HIPCUB_IF_CONSTEXPR - #endif -#endif - #ifdef DOXYGEN_SHOULD_SKIP_THIS // Documentation only /// \def HIPCUB_DEBUG_SYNC diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp index 8d9ab74a6ba..5fde1649b96 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp @@ -188,7 +188,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScan) T* d_input; U* d_output; HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, input.size() * sizeof(T))); - if HIPCUB_IF_CONSTEXPR(!inplace) + if constexpr(!inplace) { HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, output.size() * sizeof(U))); } @@ -211,9 +211,9 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScan) auto call = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { - if HIPCUB_IF_CONSTEXPR(std::is_same_v) + if constexpr(std::is_same_v) { - if HIPCUB_IF_CONSTEXPR(inplace) + if constexpr(inplace) { HIP_CHECK(hipcub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_size_bytes, @@ -233,7 +233,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScan) } else { - if HIPCUB_IF_CONSTEXPR(inplace) + if constexpr(inplace) { HIP_CHECK(hipcub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_size_bytes, @@ -281,7 +281,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScan) HIP_CHECK(hipDeviceSynchronize()); // Copy output to host - if HIPCUB_IF_CONSTEXPR(inplace) + if constexpr(inplace) { HIP_CHECK(hipMemcpy(output.data(), d_input, @@ -385,7 +385,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScanInit) T* d_input; U* d_output; HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, input.size() * sizeof(T))); - if HIPCUB_IF_CONSTEXPR(!inplace) + if constexpr(!inplace) { HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, output.size() * sizeof(U))); } @@ 
-412,7 +412,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScanInit) auto call = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { - if HIPCUB_IF_CONSTEXPR(inplace) + if constexpr(inplace) { HIP_CHECK(hipcub::DeviceScan::InclusiveScanInit(d_temp_storage, temp_storage_size_bytes, @@ -462,7 +462,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScanInit) HIP_CHECK(hipDeviceSynchronize()); // Copy output to host - if HIPCUB_IF_CONSTEXPR(inplace) + if constexpr(inplace) { HIP_CHECK(hipMemcpy(output.data(), d_input, @@ -749,7 +749,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScan) T* d_input; U* d_output; HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, input.size() * sizeof(T))); - if HIPCUB_IF_CONSTEXPR(!inplace) + if constexpr(!inplace) { HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, output.size() * sizeof(U))); } @@ -782,9 +782,9 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScan) auto call = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { - if HIPCUB_IF_CONSTEXPR(std::is_same_v) + if constexpr(std::is_same_v) { - if HIPCUB_IF_CONSTEXPR(inplace) + if constexpr(inplace) { HIP_CHECK(hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_size_bytes, @@ -804,7 +804,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScan) } else { - if HIPCUB_IF_CONSTEXPR(inplace) + if constexpr(inplace) { HIP_CHECK(hipcub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_size_bytes, @@ -854,7 +854,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScan) HIP_CHECK(hipDeviceSynchronize()); // Copy output to host - if HIPCUB_IF_CONSTEXPR(inplace) + if constexpr(inplace) { HIP_CHECK(hipMemcpy(output.data(), d_input, diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp index 6d6d75be103..d8d79895e67 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp @@ -111,7 +111,7 @@ 
TYPED_TEST(HipcubDeviceSelectTests, Flagged) unsigned int* d_selected_count_output; HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, input.size() * sizeof(T))); HIP_CHECK(test_common_utils::hipMallocHelper(&d_flags, flags.size() * sizeof(F))); - if HIPCUB_IF_CONSTEXPR(!inplace) + if constexpr(!inplace) { HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, input.size() * sizeof(U))); } @@ -135,7 +135,7 @@ TYPED_TEST(HipcubDeviceSelectTests, Flagged) auto call = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { - if HIPCUB_IF_CONSTEXPR(inplace) + if constexpr(inplace) { HIP_CHECK(hipcub::DeviceSelect::Flagged(d_temp_storage, temp_storage_size_bytes, @@ -192,7 +192,7 @@ TYPED_TEST(HipcubDeviceSelectTests, Flagged) // Check if output values are as expected std::vector output(input.size()); - if HIPCUB_IF_CONSTEXPR(inplace) + if constexpr(inplace) { HIP_CHECK(hipMemcpy(output.data(), d_input, @@ -369,7 +369,7 @@ TYPED_TEST(HipcubDeviceSelectTests, SelectOp) U* d_output; unsigned int* d_selected_count_output; HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, input.size() * sizeof(T))); - if HIPCUB_IF_CONSTEXPR(!inplace) + if constexpr(!inplace) { HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, input.size() * sizeof(U))); } @@ -391,7 +391,7 @@ TYPED_TEST(HipcubDeviceSelectTests, SelectOp) auto call = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { - if HIPCUB_IF_CONSTEXPR(inplace) + if constexpr(inplace) { HIP_CHECK(hipcub::DeviceSelect::If(d_temp_storage, temp_storage_size_bytes, @@ -448,7 +448,7 @@ TYPED_TEST(HipcubDeviceSelectTests, SelectOp) // Check if output values are as expected std::vector output(input.size()); - if HIPCUB_IF_CONSTEXPR(inplace) + if constexpr(inplace) { HIP_CHECK(hipMemcpy(output.data(), d_input, @@ -530,7 +530,7 @@ TYPED_TEST(HipcubDeviceSelectTests, FlaggedIf) unsigned int* d_selected_count_output; HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, input.size() * sizeof(T))); 
HIP_CHECK(test_common_utils::hipMallocHelper(&d_flags, flags.size() * sizeof(F))); - if HIPCUB_IF_CONSTEXPR(!inplace) + if constexpr(!inplace) { HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, input.size() * sizeof(U))); } @@ -554,7 +554,7 @@ TYPED_TEST(HipcubDeviceSelectTests, FlaggedIf) auto call = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { - if HIPCUB_IF_CONSTEXPR(inplace) + if constexpr(inplace) { HIP_CHECK(hipcub::DeviceSelect::FlaggedIf(d_temp_storage, temp_storage_size_bytes, @@ -613,7 +613,7 @@ TYPED_TEST(HipcubDeviceSelectTests, FlaggedIf) // Check if output values are as expected std::vector output(input.size()); - if HIPCUB_IF_CONSTEXPR(inplace) + if constexpr(inplace) { HIP_CHECK(hipMemcpy(output.data(), d_input, diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp index 9055e377558..be4fbb2e827 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp @@ -99,7 +99,7 @@ __device__ auto sort_keys_full_test(Key* keys, Compare compare_op) __shared__ typename warp_merge_sort::TempStorage storage[warps_per_block]; warp_merge_sort wsort{storage[warp_id]}; - if HIPCUB_IF_CONSTEXPR(Stable) + if constexpr(Stable) { wsort.StableSort(thread_keys, compare_op); } else @@ -157,7 +157,7 @@ __device__ auto sort_keys_values_full_test(Key* keys, Value* values, Compare com __shared__ typename warp_merge_sort::TempStorage storage[warps_per_block]; warp_merge_sort wsort{storage[warp_id]}; - if HIPCUB_IF_CONSTEXPR(Stable) + if constexpr(Stable) { wsort.StableSort(thread_keys, thread_values, compare_op); } else @@ -239,7 +239,7 @@ __device__ auto hipcub::LoadDirectBlocked(flat_tid, keys + warp_offset, thread_keys, segment_size); const Key oob_default = sort_last::value; - if HIPCUB_IF_CONSTEXPR(Stable) + if constexpr(Stable) { wsort.StableSort(thread_keys, compare, segment_size, 
oob_default); } else @@ -311,7 +311,7 @@ __device__ auto sort_keys_values_segmented_test(Key* keys, hipcub::LoadDirectBlocked(flat_tid, values + warp_offset, thread_values, segment_size); const Key oob_default = sort_last::value; - if HIPCUB_IF_CONSTEXPR(Stable) + if constexpr(Stable) { wsort.StableSort(thread_keys, thread_values, compare, segment_size, oob_default); } else From 8d096ad6e5acb53263a541d71b703df082ee3f19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 13 Nov 2025 15:34:55 +0000 Subject: [PATCH 23/95] Drop small deprecated entities --- .../backend/cub/util_temporary_storage.hpp | 35 ----- .../backend/rocprim/thread/thread_sort.hpp | 14 -- .../backend/rocprim/thread/thread_store.hpp | 3 - projects/hipcub/test/hipcub/CMakeLists.txt | 1 - .../test/hipcub/test_hipcub_util_device.cpp | 133 ------------------ 5 files changed, 186 deletions(-) delete mode 100644 projects/hipcub/test/hipcub/test_hipcub_util_device.cpp diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_temporary_storage.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_temporary_storage.hpp index fc67d645b14..3c2a84485c4 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_temporary_storage.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_temporary_storage.hpp @@ -34,39 +34,4 @@ #include // IWYU pragma: export -BEGIN_HIPCUB_NAMESPACE - -/// \brief Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). -/// \tparam ALLOCATIONS The number of allocations that are needed. -/// \param d_temp_storage [in] Device-accessible allocation of temporary storage. When nullptr, the required allocation size is written to \p temp_storage_bytes and no work is done. -/// \param temp_storage_bytes [in,out] Size in bytes of \t d_temp_storage allocation. -/// \param allocations [out] Pointers to device allocations needed. 
-/// \param allocation_sizes [in] Sizes in bytes of device allocations needed. -template -HIPCUB_DEPRECATED_BECAUSE("Internal-only implementation detail") -HIPCUB_HOST_DEVICE HIPCUB_FORCEINLINE hipError_t - AliasTemporaries(void* d_temp_storage, - size_t& temp_storage_bytes, - void* (&allocations)[ALLOCATIONS], - const size_t (&allocation_sizes)[ALLOCATIONS]) -{ - cudaError_t error = ::cub::detail::AliasTemporaries(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes); - - if(cudaSuccess == error) - { - return hipSuccess; - } - else if(cudaErrorInvalidValue == error) - { - return hipErrorInvalidValue; - } - - return hipErrorUnknown; -} - -END_HIPCUB_NAMESPACE - #endif // HIPCUB_CUB_UTIL_TEMPORARY_STORAGE_HPP_ diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp index 15c3211c575..eec88df700b 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp @@ -44,20 +44,6 @@ BEGIN_HIPCUB_NAMESPACE -// Should be deprecated once hip::std::swap is available in this scope. 
-template -#if defined(__HIP_PLATFORM_NVIDIA__) -HIPCUB_DEPRECATED_BECAUSE("Use cuda::std::swap") -#else -HIPCUB_DEPRECATED_BECAUSE("Use rocprim::swap") -#endif -HIPCUB_DEVICE HIPCUB_FORCEINLINE void Swap(T& lhs, T& rhs) -{ - T temp = lhs; - lhs = rhs; - rhs = temp; -} - /** * @brief Sorts data using odd-even sort method * diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp index 775848a122d..59ce95419ba 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp @@ -128,8 +128,5 @@ struct iterate_thread_store } // namespace detail -template -using IterateThreadStore HIPCUB_DEPRECATED = detail::iterate_thread_store; - END_HIPCUB_NAMESPACE #endif diff --git a/projects/hipcub/test/hipcub/CMakeLists.txt b/projects/hipcub/test/hipcub/CMakeLists.txt index 0bbe882c3b2..92b7eec2629 100644 --- a/projects/hipcub/test/hipcub/CMakeLists.txt +++ b/projects/hipcub/test/hipcub/CMakeLists.txt @@ -261,7 +261,6 @@ add_hipcub_test("hipcub.DeviceTransform" test_hipcub_device_transform.cpp) add_hipcub_test("hipcub.DevicePartition" test_hipcub_device_partition.cpp) add_hipcub_test("hipcub.Grid" test_hipcub_grid.cpp) add_hipcub_test("hipcub.UtilPtx" test_hipcub_util_ptx.cpp) -add_hipcub_test("hipcub.UtilDevice" test_hipcub_util_device.cpp) add_hipcub_test("hipcub.Vector" test_hipcub_vector.cpp) add_hipcub_test("hipcub.WarpExchange" test_hipcub_warp_exchange.cpp) add_hipcub_test("hipcub.WarpLoad" test_hipcub_warp_load.cpp) diff --git a/projects/hipcub/test/hipcub/test_hipcub_util_device.cpp b/projects/hipcub/test/hipcub/test_hipcub_util_device.cpp deleted file mode 100644 index 34277a5ecfc..00000000000 --- a/projects/hipcub/test/hipcub/test_hipcub_util_device.cpp +++ /dev/null @@ -1,133 +0,0 @@ -// MIT License -// -// Copyright (c) 2024 Advanced Micro 
Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
- -#include "common_test_header.hpp" - -// hipcub API -#include - -template -__global__ -void alias_temporaries_kernel(T* data, size_t* temp_storage_bytes) -{ - T* allocations[10]; - size_t allocation_sizes[10] = {1, 2, 3, 5, 8, 13, 21, 34, 55, 89}; - (void) - hipcub::detail::AliasTemporaries(data, *temp_storage_bytes, allocations, allocation_sizes); -} - -TEST(HipcubUtilDevice, AliasTemporariesDevice) -{ - int device_id = test_common_utils::obtain_device_from_ctest(); - SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); - HIP_CHECK(hipSetDevice(device_id)); - - void* data = nullptr; - size_t temp_storage_bytes_host = 0; // Temporary storage on the host - size_t* device_temp_storage_bytes; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_temp_storage_bytes, sizeof(size_t))); - - // First kernel call to determine required temp storage size - alias_temporaries_kernel<<<1, 1, 0, 0>>>(data, device_temp_storage_bytes); - HIP_CHECK(hipDeviceSynchronize()); - HIP_CHECK(hipGetLastError()); - - // Copy the device storage size to host - HIP_CHECK(hipMemcpy(&temp_storage_bytes_host, - device_temp_storage_bytes, - sizeof(size_t), - hipMemcpyDeviceToHost)); - ASSERT_GT(temp_storage_bytes_host, 0U); - - // Allocate the actual data buffer on the device - HIP_CHECK(test_common_utils::hipMallocHelper(&data, temp_storage_bytes_host)); - - // Second kernel call with allocated buffer - alias_temporaries_kernel<<<1, 1, 0, 0>>>(data, device_temp_storage_bytes); - HIP_CHECK(hipDeviceSynchronize()); - HIP_CHECK(hipGetLastError()); - - // Free device memory - HIP_CHECK(hipFree(device_temp_storage_bytes)); - HIP_CHECK(hipFree(data)); -} - -TEST(HipcubUtilDevice, AliasTemporariesHost) -{ - int device_id = test_common_utils::obtain_device_from_ctest(); - SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); - HIP_CHECK(hipSetDevice(device_id)); - - void* data = nullptr; - size_t temp_storage_bytes = 0; - void* allocations[10]; - size_t 
allocation_sizes[10] = {1, 789, 3, 5, 8, 13, 21, 257, 256, 890}; - - size_t min_size = 0; - for(unsigned int i = 0; i < 10; i++) - { - min_size += allocation_sizes[i]; - } - - // Determine storage size - HIP_CHECK( - hipcub::detail::AliasTemporaries(data, temp_storage_bytes, allocations, allocation_sizes)); - - // Should be larger or equal to the sum of all sizes. - ASSERT_GT(temp_storage_bytes, min_size - 1); - - // Allocate the actual data buffer on the device - HIP_CHECK(test_common_utils::hipMallocHelper(&data, temp_storage_bytes)); - - size_t zero_size = 0; - // Check for error if it does not fit - hipError_t error - = hipcub::detail::AliasTemporaries(data, zero_size, allocations, allocation_sizes); - test_utils::assert_eq(error, hipErrorInvalidValue); - - HIP_CHECK( - hipcub::detail::AliasTemporaries(data, temp_storage_bytes, allocations, allocation_sizes)); - - test_utils::assert_eq(data, allocations[0]); - - for(unsigned int i = 1; i < 10; i++) - { - // The allocations should be in increasing order. 
- ASSERT_GT(allocations[i], allocations[i - 1]); - size_t current_pointer = (size_t)allocations[i]; - size_t before_pointer = (size_t)allocations[i - 1]; - size_t distance = current_pointer - before_pointer; - - // Check if all pointer have enough space - ASSERT_GT(distance + 1, allocation_sizes[i - 1]); - } - - size_t last_pointer = (size_t)allocations[9]; - size_t start_pointer = (size_t)data; - size_t max_size = start_pointer + temp_storage_bytes; - size_t last_size = max_size - last_pointer; - - // Last size should be equal or larger then the last value in allocation_sizes - ASSERT_GT(last_size + 1, allocation_sizes[9]); - - HIP_CHECK(hipFree(data)); -} From 4d194624bb6496f2ad6803724ae74f0ce477e307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 13 Nov 2025 15:48:07 +0000 Subject: [PATCH 24/95] Remove deprecated iterators --- projects/hipcub/CHANGELOG.md | 1 + .../benchmark/benchmark_device_histogram.cpp | 1 - projects/hipcub/examples/example_utils.hpp | 3 +- .../backend/rocprim/device/device_for.hpp | 18 +- .../include/hipcub/backend/rocprim/hipcub.hpp | 4 - .../iterator/constant_input_iterator.hpp | 76 ------ .../iterator/counting_input_iterator.hpp | 77 ------ .../iterator/discard_output_iterator.hpp | 227 ------------------ .../iterator/transform_input_iterator.hpp | 87 ------- .../iterator/constant_input_iterator.hpp | 40 --- .../iterator/counting_input_iterator.hpp | 41 ---- .../iterator/discard_output_iterator.hpp | 40 --- .../iterator/transform_input_iterator.hpp | 40 --- .../hipcub/test_hipcub_block_load_store.cpp | 1 - .../hipcub/test_hipcub_block_load_store.hpp | 23 +- ...test_hipcub_device_adjacent_difference.cpp | 3 - .../test/hipcub/test_hipcub_device_for.cpp | 1 - .../hipcub/test_hipcub_device_histogram.cpp | 6 +- .../test/hipcub/test_hipcub_device_merge.cpp | 1 - .../test/hipcub/test_hipcub_device_reduce.cpp | 1 - .../test/hipcub/test_hipcub_device_scan.cpp | 3 - .../test_hipcub_device_segmented_reduce.cpp | 1 - 
.../test/hipcub/test_hipcub_device_select.cpp | 2 - .../test/hipcub/test_hipcub_iterators.cpp | 3 - .../hipcub/test_hipcub_thread_operators.cpp | 1 - 25 files changed, 22 insertions(+), 679 deletions(-) delete mode 100644 projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/constant_input_iterator.hpp delete mode 100644 projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/counting_input_iterator.hpp delete mode 100644 projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/discard_output_iterator.hpp delete mode 100644 projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/transform_input_iterator.hpp delete mode 100644 projects/hipcub/hipcub/include/hipcub/iterator/constant_input_iterator.hpp delete mode 100644 projects/hipcub/hipcub/include/hipcub/iterator/counting_input_iterator.hpp delete mode 100644 projects/hipcub/hipcub/include/hipcub/iterator/discard_output_iterator.hpp delete mode 100644 projects/hipcub/hipcub/include/hipcub/iterator/transform_input_iterator.hpp diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index c292f0d6a72..bdeec8d9302 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -20,6 +20,7 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project ### Removed +* Removed `ConstantInputIterator`, `CountingInputIterator`, `DiscardOutputIterator` and `TransformInputIterator` which were deprecated in hipCUB-4.1.0. * Removed `DeviceSpmv`, which was removed from CUB after CCCL's 2.8.0 release. Use `hipSPARSE` or `rocSPARSE` libraries instead. * Removed `GridBarrier`. * Removed `LEGACY_PTX_ARCH`. 
diff --git a/projects/hipcub/benchmark/benchmark_device_histogram.cpp b/projects/hipcub/benchmark/benchmark_device_histogram.cpp index 4970862efde..5c33113db2e 100644 --- a/projects/hipcub/benchmark/benchmark_device_histogram.cpp +++ b/projects/hipcub/benchmark/benchmark_device_histogram.cpp @@ -30,7 +30,6 @@ // HIP API #include -#include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; diff --git a/projects/hipcub/examples/example_utils.hpp b/projects/hipcub/examples/example_utils.hpp index f2ed2df11ef..07067d88373 100644 --- a/projects/hipcub/examples/example_utils.hpp +++ b/projects/hipcub/examples/example_utils.hpp @@ -34,9 +34,8 @@ #include #include -#include #include -#include +#include #include _HIPCUB_STD_INCLUDE(functional) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_for.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_for.hpp index 682eb673a38..9712599bd77 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_for.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_for.hpp @@ -31,12 +31,12 @@ #include "../../../config.hpp" -#include "../iterator/counting_input_iterator.hpp" -#include "../iterator/discard_output_iterator.hpp" #include "../thread/thread_operators.hpp" #include "../util_mdspan.hpp" #include // IWYU pragma: export +#include // IWYU pragma: export +#include // IWYU pragma: export #include @@ -122,10 +122,10 @@ struct DeviceFor hipError_t> { using T = typename std::iterator_traits::value_type; - using OutputIterator = typename rocprim::discard_iterator; + detail::bulk::OpWrapper wrapper_op = {op}; - OutputIterator output; + auto output = rocprim::make_discard_iterator(); return rocprim::transform(first, output, @@ -265,11 +265,12 @@ HIPCUB_RUNTIME_FUNCTION { static_assert(std::is_integral::value, "ShapeT must be an integral type"); using InputIterator = typename rocprim::counting_iterator; - using OutputIterator = 
typename rocprim::discard_iterator; + detail::bulk::OpWrapper wrapper_op = {op}; InputIterator input(ShapeT(0)); - OutputIterator output; + + auto output = rocprim::make_discard_iterator(); return rocprim::transform(input, output, @@ -350,14 +351,13 @@ HIPCUB_RUNTIME_FUNCTION // rocprim::counting_iterator only holds the index, not the data. using InputIterator = typename rocprim::counting_iterator; - // We don't actually need the output, so we use rocprim::discard_iterator here as a placeholder. - using OutputIterator = typename rocprim::discard_iterator; // How many times rocprim::transform will iterate. constexpr auto ext_size = ::hipcub::extents_size::value; InputIterator input(IndexType(0)); // Initialize the input iterator, starting from 0. - OutputIterator output; + + auto output = rocprim::make_discard_iterator(); // `ForEachInExtents` only iterates over the extents on device and does not guarantee ordering. // We only need to invoke `$op` `$ext_size` times. Therefore, `rocprim::transform` is suitable. 
diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp index 21ae7ff17c4..fdf3c4f24eb 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp @@ -78,11 +78,7 @@ #include "iterator/arg_index_input_iterator.hpp" #include "iterator/cache_modified_input_iterator.hpp" #include "iterator/cache_modified_output_iterator.hpp" -#include "iterator/constant_input_iterator.hpp" -#include "iterator/counting_input_iterator.hpp" -#include "iterator/discard_output_iterator.hpp" #include "iterator/tex_obj_input_iterator.hpp" -#include "iterator/transform_input_iterator.hpp" // Thread #include "thread/thread_load.hpp" diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/constant_input_iterator.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/constant_input_iterator.hpp deleted file mode 100644 index e135817dbb0..00000000000 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/constant_input_iterator.hpp +++ /dev/null @@ -1,76 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -#ifndef HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_ -#define HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_ - -#include "../../../config.hpp" -#include "../../../util_deprecated.hpp" - -#include "iterator_category.hpp" -#include "iterator_wrapper.hpp" - -#include // IWYU pragma: export - -#include - -BEGIN_HIPCUB_NAMESPACE - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -class HIPCUB_DEPRECATED_BECAUSE( - "Use rocprim::constant_iterator or rocthrust::constant_iterator instead") ConstantInputIterator - : public detail::IteratorWrapper, - ConstantInputIterator> -{ - using Iterator = rocprim::constant_iterator; - using Base = detail::IteratorWrapper>; - -public: - using iterator_category = typename detail::IteratorCategory::type; - using self_type = typename Iterator::self_type; - - __host__ __device__ __forceinline__ ConstantInputIterator( - const typename Iterator::value_type 
value, const size_t index = 0) - : Base(Iterator(value, index)) - {} - - // Cast from wrapped iterator to class itself - __host__ __device__ __forceinline__ explicit ConstantInputIterator(Iterator iterator) - : Base(iterator) - {} -}; - -#endif - -END_HIPCUB_NAMESPACE - -#endif // HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_ diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/counting_input_iterator.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/counting_input_iterator.hpp deleted file mode 100644 index 06802ffc8bc..00000000000 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/counting_input_iterator.hpp +++ /dev/null @@ -1,77 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -#ifndef HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_ -#define HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_ - -#include "../../../config.hpp" -#include "../../../util_deprecated.hpp" - -#include "iterator_category.hpp" -#include "iterator_wrapper.hpp" - -#include // IWYU pragma: export - -#include - -BEGIN_HIPCUB_NAMESPACE - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -class HIPCUB_DEPRECATED_BECAUSE( - "Use rocprim::counting_iterator or rocthrust::counting_iterator instead") CountingInputIterator - : public detail::IteratorWrapper, - CountingInputIterator> -{ - using Iterator = rocprim::counting_iterator; - using Base - = detail::IteratorWrapper>; - -public: - using iterator_category = typename detail::IteratorCategory::type; - using self_type = typename Iterator::self_type; - - __host__ __device__ __forceinline__ CountingInputIterator( - const typename Iterator::value_type value) - : Base(Iterator(value)) - {} - - // Cast from wrapped iterator to class itself - __host__ __device__ __forceinline__ explicit CountingInputIterator(Iterator iterator) - : Base(iterator) - {} -}; - -#endif - -END_HIPCUB_NAMESPACE - -#endif // HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_ diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/discard_output_iterator.hpp 
b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/discard_output_iterator.hpp deleted file mode 100644 index 4896530fb83..00000000000 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/discard_output_iterator.hpp +++ /dev/null @@ -1,227 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2020-2025, Advanced Micro Devices, Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -#ifndef HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_ -#define HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_ - -#include "../../../config.hpp" -#include "../../../util_deprecated.hpp" - -#include "iterator_category.hpp" - -#include // IWYU pragma: export - -#include -#include - -BEGIN_HIPCUB_NAMESPACE - -/** - * \addtogroup UtilIterator - * @{ - */ - - -/** - * \brief A discard iterator - */ -template -class HIPCUB_DEPRECATED_BECAUSE( - "Use rocprim::discard_iterator or rocthrust::discard_iterator instead") DiscardOutputIterator -{ -public: - // Required iterator traits - using self_type = DiscardOutputIterator; ///< My own type - using difference_type - = OffsetT; ///< Type to express the result of subtracting one iterator from another - using value_type = void; ///< The type of the element the iterator can point to - using pointer = void; ///< The type of a pointer to an element the iterator can point to - using reference = void; ///< The type of a reference to an element the iterator can point to - using iterator_category = - typename detail::IteratorCategory::type; ///< The iterator category - -private: - - OffsetT offset; - -public: - - /// Constructor - __host__ __device__ __forceinline__ DiscardOutputIterator( - OffsetT offset = 0) ///< Base offset - : - offset(offset) - {} - - /** - * @typedef self_type - * @brief Postfix increment - */ - 
__host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - offset++; - return retval; - } - - /** - * @typedef self_type - * @brief Postfix increment - */ - __host__ __device__ __forceinline__ self_type operator++() - { - offset++; - return *this; - } - - /** - * @typedef self_type - * @brief Indirection - */ - __host__ __device__ __forceinline__ self_type& operator*() - { - // return self reference, which can be assigned to anything - return *this; - } - - /** - * @typedef self_type - * @brief Addition - */ - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval(offset + n); - return retval; - } - - /** - * @typedef self_type - * @brief Addition assignment - */ - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - offset += n; - return *this; - } - - /** - * @typedef self_type - * @brief Subtraction assignment - */ - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval(offset - n); - return retval; - } - - /** - * @typedef self_type - * @brief Subtraction assignment - */ - template - __host__ __device__ __forceinline__ self_type& operator-=(Distance n) - { - offset -= n; - return *this; - } - - /** - * @typedef self_type - * @brief Distance - */ - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return offset - other.offset; - } - - /** - * @typedef self_type - * @brief Array subscript - */ - template - __host__ __device__ __forceinline__ self_type& operator[](Distance) - { - // return self reference, which can be assigned to anything - return *this; - } - - /// Structure dereference - __host__ __device__ __forceinline__ pointer operator->() - { - return; - } - - /// Assignment to anything else (no-op) - template - __host__ __device__ __forceinline__ void operator=(T const&) - {} - - /// Cast to void* operator - __host__ __device__ 
__forceinline__ operator void*() const - { - return nullptr; - } - - /** - * @typedef self_type - * @brief Equal to - */ - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const - { - return (offset == rhs.offset); - } - - /** - * @typedef self_type - * @brief Not equal to - */ - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const - { - return (offset != rhs.offset); - } - - /** - * @typedef self_type - * @brief ostream operator - */ - HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) - { - os << "[" << itr.offset << "]"; - return os; - } - HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP -}; - -END_HIPCUB_NAMESPACE - -#endif // HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_ diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/transform_input_iterator.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/transform_input_iterator.hpp deleted file mode 100644 index 2b1efba7bf2..00000000000 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/transform_input_iterator.hpp +++ /dev/null @@ -1,87 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -#ifndef HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_ -#define HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_ - -#include "../../../config.hpp" - -#include "iterator_category.hpp" -#include "iterator_wrapper.hpp" - -#include // IWYU pragma: export -#include // IWYU pragma: export - -#include -#include -#include - -BEGIN_HIPCUB_NAMESPACE - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -class HIPCUB_DEPRECATED_BECAUSE( - "Use rocprim::transform_iterator or rocthrust::transform_iterator instead") - TransformInputIterator - : public detail::IteratorWrapper< - rocprim::transform_iterator, - TransformInputIterator> -{ - using Iterator = rocprim::transform_iterator; - using Base = detail::IteratorWrapper< - Iterator, - TransformInputIterator>; - -public: - using iterator_category = typename detail::IteratorCategory::type; - using self_type = typename Iterator::self_type; - using 
unary_function = typename Iterator::unary_function; - - __host__ __device__ __forceinline__ TransformInputIterator(InputIteratorT iterator, - ConversionOp transform) - : Base(Iterator(iterator, transform)) - {} - - // Cast from wrapped iterator to class itself - __host__ __device__ __forceinline__ explicit TransformInputIterator(Iterator iterator) - : Base(iterator) - {} -}; - -#endif - -END_HIPCUB_NAMESPACE - -#endif // HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_ diff --git a/projects/hipcub/hipcub/include/hipcub/iterator/constant_input_iterator.hpp b/projects/hipcub/hipcub/include/hipcub/iterator/constant_input_iterator.hpp deleted file mode 100644 index 7fc28707274..00000000000 --- a/projects/hipcub/hipcub/include/hipcub/iterator/constant_input_iterator.hpp +++ /dev/null @@ -1,40 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021-2025, Advanced Micro Devices, Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -#ifndef HIPCUB_CONSTANT_INPUT_ITERATOR_HPP_ -#define HIPCUB_CONSTANT_INPUT_ITERATOR_HPP_ - -#ifdef __HIP_PLATFORM_AMD__ - #include "../backend/rocprim/iterator/constant_input_iterator.hpp" // IWYU pragma: export -#elif defined(__HIP_PLATFORM_NVIDIA__) - #include "../config.hpp" - #include // IWYU pragma: export -#endif - -#endif // HIPCUB_ITERATOR_DISCARD_OUTPUT__HPP_ diff --git a/projects/hipcub/hipcub/include/hipcub/iterator/counting_input_iterator.hpp b/projects/hipcub/hipcub/include/hipcub/iterator/counting_input_iterator.hpp deleted file mode 100644 index 723af464aae..00000000000 --- a/projects/hipcub/hipcub/include/hipcub/iterator/counting_input_iterator.hpp +++ /dev/null @@ -1,41 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021-2025, Advanced Micro Devices, Inc. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -#ifndef HIPCUB_COUNTING_INPUT_ITERATOR_HPP_ -#define HIPCUB_COUNTING_INPUT_ITERATOR_HPP_ - -#ifdef __HIP_PLATFORM_AMD__ - #include "../backend/rocprim/iterator/counting_input_iterator.hpp" // IWYU pragma: export -#elif defined(__HIP_PLATFORM_NVIDIA__) - #include "../config.hpp" - #include // IWYU pragma: export -#endif - -#endif // HIPCUB_ITERATOR_DISCARD_OUTPUT__HPP_ - diff --git a/projects/hipcub/hipcub/include/hipcub/iterator/discard_output_iterator.hpp b/projects/hipcub/hipcub/include/hipcub/iterator/discard_output_iterator.hpp deleted file mode 100644 index a2c8d6bedd6..00000000000 --- a/projects/hipcub/hipcub/include/hipcub/iterator/discard_output_iterator.hpp +++ /dev/null @@ -1,40 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2020-2025, Advanced Micro Devices, Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -#ifndef HIPCUB_ITERATOR_DISCARD_OUTPUT_HPP_ -#define HIPCUB_ITERATOR_DISCARD_OUTPUT_HPP_ - -#ifdef __HIP_PLATFORM_AMD__ - #include "../backend/rocprim/iterator/discard_output_iterator.hpp" // IWYU pragma: export -#elif defined(__HIP_PLATFORM_NVIDIA__) - #include "../config.hpp" - #include // IWYU pragma: export -#endif - -#endif // HIPCUB_ITERATOR_DISCARD_OUTPUT__HPP_ diff --git a/projects/hipcub/hipcub/include/hipcub/iterator/transform_input_iterator.hpp b/projects/hipcub/hipcub/include/hipcub/iterator/transform_input_iterator.hpp deleted file mode 100644 index b2ce4b9f20f..00000000000 --- a/projects/hipcub/hipcub/include/hipcub/iterator/transform_input_iterator.hpp +++ /dev/null @@ -1,40 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2020-2025, Advanced Micro Devices, Inc. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -#ifndef HIPCUB_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_ -#define HIPCUB_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_ - -#ifdef __HIP_PLATFORM_AMD__ - #include "../backend/rocprim/iterator/transform_input_iterator.hpp" // IWYU pragma: export -#elif defined(__HIP_PLATFORM_NVIDIA__) - #include "../config.hpp" - #include // IWYU pragma: export -#endif - -#endif // HIPCUB_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_ diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.cpp index b9ff9334b3f..2adfdc0e91b 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.cpp @@ -34,7 +34,6 @@ // kernel definitions #include "test_hipcub_block_load_store.kernels.hpp" -#include // Start stamping out tests struct HipcubBlockLoadStoreTests; diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.hpp b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.hpp index bf5fa0e2e1c..cbd5f203ed2 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.hpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.hpp @@ -20,6 +20,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
+#include + test_suite_type_def(suite_name, name_suffix) typed_test_suite_def(HipcubBlockLoadStoreTests, name_suffix, load_store_params); @@ -371,24 +373,19 @@ typed_test_def(HipcubBlockLoadStoreTests, name_suffix, LoadStoreDiscardIterator) input.size() * sizeof(typename decltype(input)::value_type), hipMemcpyHostToDevice)); - // Test with discard output iterator - // using OffsetT = typename std::iterator_traits::difference_type; - HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH - // TODO: Here in block load an store, it's not possible to use rocprim::discard_iterator - hipcub::DiscardOutputIterator discard_itr; + // Running kernel for discard case + Type* dummy; + HIP_CHECK(hipMalloc(&dummy, guarded_elements * sizeof(Type))); - // Running kernel - load_store_guarded_kernel, + load_store_guarded_kernel - <<>>(device_input, - discard_itr, - discard_itr, - guarded_elements); - HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP + <<>>(device_input, dummy, dummy, guarded_elements); + HIP_CHECK(hipFree(dummy)); + // Running kernel load_store_guarded_kernel -#include -#include -#include #include "test_utils.hpp" #include "test_utils_data_generation.hpp" diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp index 63a07db720b..d991a0dbdb9 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp @@ -24,7 +24,6 @@ // required hipcub headers #include -#include #include #include diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp index 7e4def5e4c9..a2cc5a6c761 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp @@ -30,10 +30,6 @@ // hipcub API #include -#include -#include - -#include // rows, columns, (row_stride - columns * Channels) std::vector> get_dims() @@ -394,7 +390,7 @@ 
TYPED_TEST(HipcubDeviceHistogramEvenOverflow, EvenOverflow) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - auto d_input = thrust::counting_iterator(0); + auto d_input = rocprim::counting_iterator(0); counter_type* d_histogram; HIP_CHECK(test_common_utils::hipMallocHelper(&d_histogram, bins * sizeof(counter_type))); diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp index 70ec83e9eec..efdee71e2a5 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp @@ -24,7 +24,6 @@ // hipcub API #include -#include #include "identity_iterator.hpp" #include "test_utils_data_generation.hpp" diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp index f3d0cc31ab8..38f94d2cb51 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp @@ -29,7 +29,6 @@ // hipcub API #include -#include // Params for tests template diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp index 5fde1649b96..b887dcbeda2 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp @@ -24,9 +24,6 @@ // hipcub API #include -#include -#include -#include #include #include "single_index_iterator.hpp" diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp index 7b7d0725fad..7a537bab73d 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp @@ -28,7 +28,6 @@ // hipcub API #include -#include template -#include -#include #include 
"single_index_iterator.hpp" #include "test_utils_bfloat16.hpp" diff --git a/projects/hipcub/test/hipcub/test_hipcub_iterators.cpp b/projects/hipcub/test/hipcub/test_hipcub_iterators.cpp index a25bbf04229..62d579a6334 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_iterators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_iterators.cpp @@ -27,10 +27,7 @@ #include #include #include -#include -#include #include -#include #include diff --git a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp index 7e2c48bba8a..66750ccb471 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp @@ -28,7 +28,6 @@ #include #include #include -#include #include #include From 3eee1cd8ce0de32deb185f5a2f34d72c2937f331 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 17 Nov 2025 14:50:42 +0000 Subject: [PATCH 25/95] Add `__int128`, `__half` and `hip_bfloat16` overloads to `to_bits` in sort comparator test --- .../hipcub/test_utils_sort_comparator.hpp | 59 +++++++++++++++++-- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp b/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp index cf0927de945..297fb313727 100644 --- a/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp +++ b/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp @@ -65,11 +65,13 @@ auto to_bits(const Key key) using unsigned_bits_type = typename hipcub::NumericTraits::UnsignedBits; unsigned_bits_type bit_key; - memcpy(&bit_key, &key, sizeof(Key)); + std::memcpy(&bit_key, &key, sizeof(unsigned_bits_type)); // Remove signed zero, this case is supposed to be treated the same as // unsigned zero in hipcub sorting algorithms. 
- constexpr unsigned_bits_type minus_zero = unsigned_bits_type{1} << (8 * sizeof(Key) - 1); + constexpr unsigned_bits_type minus_zero = unsigned_bits_type{1} + << (8 * sizeof(unsigned_bits_type) - 1); + // Positive and negative zero should compare the same. if(bit_key == minus_zero) { @@ -90,7 +92,7 @@ auto to_bits(const Key key) template::value, int> = 0> + std::enable_if_t>::value, int> = 0> auto to_bits(const Key& key) { using inner_t = typename inner_type::type; @@ -102,8 +104,8 @@ auto to_bits(const Key& key) uint32_t, std::conditional_t>>; - auto bit_key_upper = static_cast(to_bits<0, sizeof(key.x) * 8>(key.x)); - auto bit_key_lower = static_cast(to_bits<0, sizeof(key.y) * 8>(key.y)); + auto bit_key_upper = static_cast(to_bits<0, sizeof(inner_t) * 8>(key.x)); + auto bit_key_lower = static_cast(to_bits<0, sizeof(inner_t) * 8>(key.y)); // Flip sign bit to properly order signed types if(std::is_signed::value) @@ -122,6 +124,53 @@ auto to_bits(const Key& key) return to_bits(bit_key); } +template +auto to_bits(const hip_bfloat16& key) +{ + float f = static_cast(key); + return to_bits(f); +} + +template +auto to_bits(const __half& key) +{ + float f = static_cast(key); + return to_bits(f); +} + +template +auto to_bits(const __int128 key) +{ + using U = unsigned __int128; + U bits = static_cast(key); + bits ^= (U(1) << 127); + constexpr unsigned width = EndBit - StartBit; + if constexpr(width == 128) + { + return bits; + } + else + { + const U mask = (static_cast(1) << width) - 1; + return (bits >> StartBit) & mask; + } +} + +template +auto to_bits(const unsigned __int128 key) +{ + constexpr unsigned width = EndBit - StartBit; + if constexpr(width == 128) + { + return key; + } + else + { + const unsigned __int128 mask = (static_cast(1) << width) - 1; + return (key >> StartBit) & mask; + } +} + } // namespace detail template From 310bb5ea8697d2ca9bf99ad0f4865c60549393e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 24 Nov 2025 15:51:09 
+0000 Subject: [PATCH 26/95] Fix add implicit conversion for read access for `conditional_discard_value` in single index iterator --- projects/hipcub/test/hipcub/single_index_iterator.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/projects/hipcub/test/hipcub/single_index_iterator.hpp b/projects/hipcub/test/hipcub/single_index_iterator.hpp index 8ba95e600d2..eab3c83f005 100644 --- a/projects/hipcub/test/hipcub/single_index_iterator.hpp +++ b/projects/hipcub/test/hipcub/single_index_iterator.hpp @@ -50,6 +50,14 @@ class single_index_iterator return *this; } + // Implicit conversion for read access + HIPCUB_HOST_DEVICE + inline + operator T() const + { + return *value_; + } + private: T* const value_; const bool keep_; From 4fd0e1b38407b70084b82d0e95ff49081caa22db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 1 Dec 2025 16:34:26 +0000 Subject: [PATCH 27/95] Drop deprecated features from util_ptx.cuh --- .../hipcub/backend/rocprim/util_ptx.hpp | 134 ------------------ 1 file changed, 134 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp index 8582d4376b7..3d8c49dea07 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp @@ -44,12 +44,6 @@ BEGIN_HIPCUB_NAMESPACE // * ThreadExit - not supported // * LogicShiftLeft // * LogicShiftRight -// * ThreadTrap - not supported, deprecated in CUB -// * FFMA_RZ, FMUL_RZ - not supported, deprecated in CUB -// * SHFL_IDX_SYNC - not supported, deprecated in CUB -// * WARP_SYNC - deprecated, deprecated in CUB -// * CTA_SYNC_AND - not supported, deprecated in CUB -// * CTA_SYNC_OR - not supported, deprecated in CUB // * MatchAny - not in CUB public API // // Differences: @@ -69,18 +63,6 @@ HIPCUB_FORCEINLINE int RowMajorTid(int block_dim_x, int block_dim_y, int block_d + 
hipThreadIdx_x; } -HIPCUB_DEPRECATED_BECAUSE("use ::rocprim::lane_id() instead") -HIPCUB_DEVICE HIPCUB_FORCEINLINE unsigned int LaneId() -{ - return ::rocprim::lane_id(); -} - -HIPCUB_DEPRECATED_BECAUSE("use ::rocprim::warp_id instead") -HIPCUB_DEVICE HIPCUB_FORCEINLINE unsigned int WarpId() -{ - return ::rocprim::warp_id(); -} - template HIPCUB_DEVICE HIPCUB_FORCEINLINE uint64_t WarpMask(unsigned int warp_id) @@ -97,34 +79,6 @@ HIPCUB_FORCEINLINE uint64_t WarpMask(unsigned int warp_id) return member_mask; } -// Returns the warp lane mask of all lanes less than the calling thread -HIPCUB_DEPRECATED_BECAUSE("use ::rocprim::get_sreg_lanemask_lt instead") -HIPCUB_DEVICE HIPCUB_FORCEINLINE uint64_t LaneMaskLt() -{ - return (uint64_t(1) << ::rocprim::lane_id()) - 1; -} - -// Returns the warp lane mask of all lanes less than or equal to the calling thread -HIPCUB_DEPRECATED_BECAUSE("use ::rocprim::get_sreg_lanemask_le instead") -HIPCUB_DEVICE HIPCUB_FORCEINLINE uint64_t LaneMaskLe() -{ - return ((uint64_t(1) << ::rocprim::lane_id()) << 1) - 1; -} - -// Returns the warp lane mask of all lanes greater than the calling thread -HIPCUB_DEPRECATED_BECAUSE("use ::rocprim::get_sreg_lanemask_gt instead") -HIPCUB_DEVICE HIPCUB_FORCEINLINE uint64_t LaneMaskGt() -{ - return uint64_t(-1)^LaneMaskLe(); -} - -// Returns the warp lane mask of all lanes greater than or equal to the calling thread -HIPCUB_DEPRECATED_BECAUSE("use ::rocprim::get_sreg_lanemask_ge instead") -HIPCUB_DEVICE HIPCUB_FORCEINLINE uint64_t LaneMaskGe() -{ - return uint64_t(-1)^LaneMaskLt(); -} - // Shuffle funcs template @@ -167,22 +121,6 @@ HIPCUB_FORCEINLINE T ShuffleIndex(T input, int src_lane, unsigned int member_mas ); } -// Other - -HIPCUB_DEPRECATED_BECAUSE("will be removed in the next major release") -HIPCUB_DEVICE HIPCUB_FORCEINLINE - unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend) -{ - return (x >> shift) + addend; -} - -HIPCUB_DEPRECATED_BECAUSE("will be removed in the next 
major release") -HIPCUB_DEVICE HIPCUB_FORCEINLINE - unsigned int SHL_ADD(unsigned int x, unsigned int shift, unsigned int addend) -{ - return (x << shift) + addend; -} - namespace detail { template @@ -241,78 +179,6 @@ HIPCUB_FORCEINLINE unsigned int BFE(UnsignedBits source, } #endif -// Bitfield insert. -// Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. -HIPCUB_DEPRECATED_BECAUSE("will be removed in the next major release") -HIPCUB_DEVICE HIPCUB_FORCEINLINE void BFI(unsigned int& ret, - unsigned int x, - unsigned int y, - unsigned int bit_start, - unsigned int num_bits) -{ - #ifdef __HIP_PLATFORM_AMD__ - ret = __bitinsert_u32(x, y, bit_start, num_bits); - #else - x <<= bit_start; - unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start; - unsigned int MASK_Y = ~MASK_X; - ret = (y & MASK_Y) | (x & MASK_X); - #endif // __HIP_PLATFORM_AMD__ -} - -HIPCUB_DEPRECATED_BECAUSE("will be removed in the next major release") -HIPCUB_DEVICE HIPCUB_FORCEINLINE unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) -{ - return x + y + z; -} - -HIPCUB_DEPRECATED_BECAUSE("will be removed in the next major release") -HIPCUB_DEVICE HIPCUB_FORCEINLINE int PRMT(unsigned int a, unsigned int b, unsigned int index) -{ - return ::__byte_perm(a, b, index); -} - -HIPCUB_DEPRECATED_BECAUSE("will be removed in the next major release") -HIPCUB_DEVICE HIPCUB_FORCEINLINE void BAR(int count) -{ - (void) count; - __syncthreads(); -} - -HIPCUB_DEPRECATED_BECAUSE("use __syncthreads() instead") -HIPCUB_DEVICE HIPCUB_FORCEINLINE void CTA_SYNC() -{ - __syncthreads(); -} - -HIPCUB_DEPRECATED_BECAUSE("use ::rocprim::wave_barrier() instead") -HIPCUB_DEVICE HIPCUB_FORCEINLINE void WARP_SYNC(unsigned int member_mask) -{ - (void) member_mask; - ::rocprim::wave_barrier(); -} - -HIPCUB_DEPRECATED_BECAUSE("use ::__any(predicate) instead") -HIPCUB_DEVICE HIPCUB_FORCEINLINE int WARP_ANY(int predicate, uint64_t member_mask) -{ - (void) member_mask; - 
return ::__any(predicate); -} - -HIPCUB_DEPRECATED_BECAUSE("use ::__all(predicate) instead") -HIPCUB_DEVICE HIPCUB_FORCEINLINE int WARP_ALL(int predicate, uint64_t member_mask) -{ - (void) member_mask; - return ::__all(predicate); -} - -HIPCUB_DEPRECATED_BECAUSE("use ::__ballot(predicate) instead") -HIPCUB_DEVICE HIPCUB_FORCEINLINE int64_t WARP_BALLOT(int predicate, uint64_t member_mask) -{ - (void) member_mask; - return __ballot(predicate); -} - namespace detail { From b553307ba514e7b9b30ef8c1bc26899d0cf95494 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Tue, 2 Dec 2025 10:09:29 +0000 Subject: [PATCH 28/95] Fix typo in TEST_UTILS_INCLUDE_GUARD --- projects/hipcub/test/hipcub/common_test_header.hpp | 2 +- projects/hipcub/test/hipcub/test_utils.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/hipcub/test/hipcub/common_test_header.hpp b/projects/hipcub/test/hipcub/common_test_header.hpp index ec53dc785ea..ddef10bcada 100755 --- a/projects/hipcub/test/hipcub/common_test_header.hpp +++ b/projects/hipcub/test/hipcub/common_test_header.hpp @@ -47,7 +47,7 @@ // test_utils.hpp should only be included by this header. // The following definition is used as guard in test_utils.hpp // Including test_utils.hpp by itself will cause a compile error. -#define TEST_UTILS_INCLUDE_GAURD +#define TEST_UTILS_INCLUDE_GUARD #include "test_utils.hpp" #define HIP_CHECK(condition) \ diff --git a/projects/hipcub/test/hipcub/test_utils.hpp b/projects/hipcub/test/hipcub/test_utils.hpp index 57ab07e6883..adaf0bd9fdc 100644 --- a/projects/hipcub/test/hipcub/test_utils.hpp +++ b/projects/hipcub/test/hipcub/test_utils.hpp @@ -21,7 +21,7 @@ #ifndef HIPCUB_TEST_TEST_UTILS_HPP_ #define HIPCUB_TEST_TEST_UTILS_HPP_ -#ifndef TEST_UTILS_INCLUDE_GAURD +#ifndef TEST_UTILS_INCLUDE_GUARD #error test_utils.hpp must ONLY be included by common_test_header.hpp. Please include common_test_header.hpp instead. 
#endif From 45923055d37576232602ab41dc682881b520fffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 1 Dec 2025 17:18:36 +0000 Subject: [PATCH 29/95] Use hip::std limits --- .../benchmark/benchmark_device_histogram.cpp | 2 +- projects/hipcub/benchmark/benchmark_utils.hpp | 20 ++--- .../benchmark/benchmark_warp_merge_sort.cpp | 2 +- .../benchmark/common_benchmark_header.hpp | 2 +- .../rocprim/block/block_run_length_decode.hpp | 3 +- .../rocprim/device/device_histogram.hpp | 3 +- .../backend/rocprim/device/device_reduce.hpp | 11 +-- .../device/device_segmented_reduce.hpp | 11 +-- .../hipcub/backend/rocprim/util_type.hpp | 9 ++- .../hipcub/test/hipcub/common_test_header.hpp | 2 +- .../hipcub/test_hipcub_block_merge_sort.cpp | 59 +++++++------- .../hipcub/test_hipcub_block_radix_sort.cpp | 30 +++---- .../test_hipcub_block_run_length_decode.cpp | 13 +-- .../hipcub/test_hipcub_device_histogram.cpp | 14 ++-- .../hipcub/test_hipcub_device_radix_sort.hpp | 8 +- .../test/hipcub/test_hipcub_device_scan.cpp | 2 +- ...est_hipcub_device_segmented_radix_sort.hpp | 79 ++++++++++--------- .../test_hipcub_device_segmented_reduce.cpp | 4 +- .../test_hipcub_device_segmented_sort.hpp | 9 ++- .../test/hipcub/test_hipcub_warp_load.cpp | 16 ++-- .../hipcub/test_hipcub_warp_merge_sort.cpp | 29 ++++--- projects/hipcub/test/hipcub/test_utils.hpp | 12 +-- .../hipcub/test_utils_data_generation.hpp | 23 +++--- 23 files changed, 189 insertions(+), 174 deletions(-) diff --git a/projects/hipcub/benchmark/benchmark_device_histogram.cpp b/projects/hipcub/benchmark/benchmark_device_histogram.cpp index 5c33113db2e..f724db54389 100644 --- a/projects/hipcub/benchmark/benchmark_device_histogram.cpp +++ b/projects/hipcub/benchmark/benchmark_device_histogram.cpp @@ -463,7 +463,7 @@ struct num_limits { static constexpr T max() { - return std::numeric_limits::max(); + return _HIPCUB_STD::numeric_limits::max(); }; }; diff --git a/projects/hipcub/benchmark/benchmark_utils.hpp 
b/projects/hipcub/benchmark/benchmark_utils.hpp index 52dc87c18bd..e6b5af7e8d2 100644 --- a/projects/hipcub/benchmark/benchmark_utils.hpp +++ b/projects/hipcub/benchmark/benchmark_utils.hpp @@ -43,6 +43,8 @@ #define HIPCUB_WARP_THREADS_MACRO CUB_PTX_WARP_THREADS #endif +#include _HIPCUB_STD_INCLUDE(limits) + namespace benchmark_utils { const size_t default_max_random_size = 1024 * 1024; @@ -303,11 +305,11 @@ struct generate_limits::value>> { static inline T min() { - return std::numeric_limits::min(); + return _HIPCUB_STD::numeric_limits::min(); } static inline T max() { - return std::numeric_limits::max(); + return _HIPCUB_STD::numeric_limits::max(); } }; @@ -382,7 +384,7 @@ std::vector using key_distribution_type = std::conditional_t::value, std::uniform_int_distribution, std::uniform_real_distribution>; - key_distribution_type key_distribution(std::numeric_limits::max()); + key_distribution_type key_distribution(_HIPCUB_STD::numeric_limits::max()); std::vector keys(size); size_t keys_start_index = 0; @@ -492,17 +494,17 @@ class numeric_limits> public: static constexpr inline T min() { - return std::numeric_limits::min(); + return _HIPCUB_STD::numeric_limits::min(); } static constexpr inline T max() { - return std::numeric_limits::max(); + return _HIPCUB_STD::numeric_limits::max(); } static constexpr inline T lowest() { - return std::numeric_limits::lowest(); + return _HIPCUB_STD::numeric_limits::lowest(); } }; @@ -514,17 +516,17 @@ class numeric_limits> public: static constexpr inline T min() { - return std::numeric_limits::min(); + return _HIPCUB_STD::numeric_limits::min(); } static constexpr inline T max() { - return std::numeric_limits::max(); + return _HIPCUB_STD::numeric_limits::max(); } static constexpr inline T lowest() { - return std::numeric_limits::lowest(); + return _HIPCUB_STD::numeric_limits::lowest(); } }; } // namespace std diff --git a/projects/hipcub/benchmark/benchmark_warp_merge_sort.cpp 
b/projects/hipcub/benchmark/benchmark_warp_merge_sort.cpp index f6d91fe7f9a..cf823609a5f 100644 --- a/projects/hipcub/benchmark/benchmark_warp_merge_sort.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_merge_sort.cpp @@ -149,7 +149,7 @@ __global__ template struct max_value { - static constexpr T value = std::numeric_limits::max(); + static constexpr T value = _HIPCUB_STD::numeric_limits::max(); }; template #include #include -#include #include #include #include @@ -45,6 +44,7 @@ #include #include _HIPCUB_LIBCXX_INCLUDE(cmath) +#include _HIPCUB_STD_INCLUDE(limits) // benchmark_utils.hpp should only be included by this header. // The following definition is used as guard in benchmark_utils.hpp diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp index 8296c3a37a1..0fb000ab097 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp @@ -37,7 +37,8 @@ #include "../util_type.hpp" #include "block_scan.hpp" -#include +#include _HIPCUB_STD_INCLUDE(limits) + #include BEGIN_HIPCUB_NAMESPACE diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp index a67a139eca3..a0ce2b97ceb 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp @@ -61,7 +61,8 @@ HIPCUB_FORCEINLINE bool may_overflow(LevelT lower_level, ::std::true_type /* is_integral */) { return static_cast(upper_level - lower_level) - > (::std::numeric_limits::max() / static_cast(num_bins)); + > (_HIPCUB_STD::numeric_limits::max() + / static_cast(num_bins)); } template diff --git 
a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp index 8f21617f81e..c9f19bf0573 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp @@ -46,8 +46,9 @@ #include // hip_bfloat16 #include // __half +#include _HIPCUB_STD_INCLUDE(limits) + #include -#include BEGIN_HIPCUB_NAMESPACE namespace detail @@ -66,7 +67,7 @@ HIPCUB_HOST_DEVICE T set_half_bits(uint16_t value) template HIPCUB_HOST_DEVICE inline T get_lowest_value() { - return std::numeric_limits::lowest(); + return _HIPCUB_STD::numeric_limits::lowest(); } template<> @@ -86,7 +87,7 @@ HIPCUB_HOST_DEVICE inline hip_bfloat16 get_lowest_value() template HIPCUB_HOST_DEVICE inline T get_max_value() { - return std::numeric_limits::max(); + return _HIPCUB_STD::numeric_limits::max(); } template<> @@ -116,7 +117,7 @@ template inline auto get_lowest_special_value() -> typename std::enable_if_t::value, T> { - return -std::numeric_limits::infinity(); + return -_HIPCUB_STD::numeric_limits::infinity(); } template<> @@ -146,7 +147,7 @@ template inline auto get_max_special_value() -> typename std::enable_if_t::value, T> { - return std::numeric_limits::infinity(); + return _HIPCUB_STD::numeric_limits::infinity(); } template<> diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp index d4e01126cc8..f978bed008d 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp @@ -42,9 +42,10 @@ #include // IWYU pragma: export #include // IWYU pragma: export +#include _HIPCUB_STD_INCLUDE(limits) + #include #include -#include 
BEGIN_HIPCUB_NAMESPACE @@ -312,7 +313,7 @@ struct DeviceSegmentedReduce d_begin_offsets, d_end_offsets, ::hipcub::Min(), - std::numeric_limits::max(), + _HIPCUB_STD::numeric_limits::max(), stream); } @@ -362,7 +363,7 @@ struct DeviceSegmentedReduce IteratorT d_indexed_in(d_in); // true maximum value of the full range // key is ::max because ArgMin finds the lowest value that has the lowest key - const OutputTupleT init(std::numeric_limits::max(), + const OutputTupleT init(_HIPCUB_STD::numeric_limits::max(), detail::get_max_special_value()); // special value for empty segments const OutputTupleT empty_value(1, detail::get_max_value()); @@ -424,7 +425,7 @@ struct DeviceSegmentedReduce d_begin_offsets, d_end_offsets, ::hipcub::Max(), - std::numeric_limits::lowest(), + _HIPCUB_STD::numeric_limits::lowest(), stream); } @@ -474,7 +475,7 @@ struct DeviceSegmentedReduce IteratorT d_indexed_in(d_in); // true minimum value of the full range // key is ::max because ArgMax finds the highest value that has the lowest key - const OutputTupleT init(std::numeric_limits::max(), + const OutputTupleT init(_HIPCUB_STD::numeric_limits::max(), detail::get_lowest_special_value()); // special value for empty segments const OutputTupleT empty_value(1, detail::get_lowest_value()); diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp index 9d3df462afb..156d341374b 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp @@ -42,7 +42,8 @@ #include #include -#include +#include _HIPCUB_STD_INCLUDE(limits) + #include BEGIN_HIPCUB_NAMESPACE @@ -611,13 +612,13 @@ struct BaseTraits static HIPCUB_HOST_DEVICE __forceinline__ T Max() { - return std::numeric_limits::max(); + return _HIPCUB_STD::numeric_limits::max(); } static HIPCUB_HOST_DEVICE __forceinline__ T Lowest() { - return 
std::numeric_limits::lowest(); + return _HIPCUB_STD::numeric_limits::lowest(); } }; @@ -634,7 +635,7 @@ struct NumericTraits : BaseTraits template<> struct NumericTraits - : BaseTraits<(std::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, + : BaseTraits<(_HIPCUB_STD::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, unsigned char, char> {}; diff --git a/projects/hipcub/test/hipcub/common_test_header.hpp b/projects/hipcub/test/hipcub/common_test_header.hpp index ddef10bcada..fc04ec1fb4b 100755 --- a/projects/hipcub/test/hipcub/common_test_header.hpp +++ b/projects/hipcub/test/hipcub/common_test_header.hpp @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -43,6 +42,7 @@ #include #include _HIPCUB_LIBCXX_INCLUDE(cmath) +#include _HIPCUB_STD_INCLUDE(limits) // test_utils.hpp should only be included by this header. // The following definition is used as guard in test_utils.hpp diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp index 6eda70ee253..9aab4a6e56f 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp @@ -123,10 +123,11 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeys) // Generate data std::vector keys_output; - keys_output = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value); + keys_output + = test_utils::get_random_data(size, + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value); // Calculate expected results on host std::vector expected(keys_output); @@ -217,8 +218,8 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeysWithValidItems) constexpr size_t size = grid_size * items_per_block; // minus|plus two to prevent overflow weirdness - const T mini = std::numeric_limits::min() + static_cast(2); - const T maxi = std::numeric_limits::max() - 
static_cast(2); + const T mini = _HIPCUB_STD::numeric_limits::min() + static_cast(2); + const T maxi = _HIPCUB_STD::numeric_limits::max() - static_cast(2); const T default_val = static_cast(compare_op(mini, maxi) ? maxi : mini); const int valid_items_arr[8] = {items_per_block / 2, @@ -354,17 +355,18 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeysValues) // Generate data std::vector keys_output; - keys_output = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value); + keys_output + = test_utils::get_random_data(size, + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value); std::vector values_output; - values_output - = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value + seed_value_addition); + values_output = test_utils::get_random_data( + size, + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value + seed_value_addition); using key_value = std::pair; @@ -593,17 +595,18 @@ TYPED_TEST(HipcubBlockMergeSort, StableSortKeysValues) // Generate data std::vector keys_output; - keys_output = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value); + keys_output + = test_utils::get_random_data(size, + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value); std::vector values_output; - values_output - = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value + seed_value_addition); + values_output = test_utils::get_random_data( + size, + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value + seed_value_addition); // Set some keys to be the same, but have different values to test stability for(size_t i = 0; i < 10; i++) @@ -734,8 +737,8 @@ TYPED_TEST(HipcubBlockMergeSort, StableSortKeysWithValidItems) constexpr size_t size = grid_size * 
items_per_block; // minus|plus two to prevent overflow weirdness - const T mini = std::numeric_limits::min() + static_cast(2); - const T maxi = std::numeric_limits::max() - static_cast(2); + const T mini = _HIPCUB_STD::numeric_limits::min() + static_cast(2); + const T maxi = _HIPCUB_STD::numeric_limits::max() - static_cast(2); const custom_type default_val = {static_cast(compare_op(mini, maxi) ? maxi : mini), 0}; const int valid_items_arr[8] = {items_per_block / 2, @@ -882,8 +885,8 @@ TYPED_TEST(HipcubBlockMergeSort, StableSortKeysValuesWithValidItems) constexpr size_t size = grid_size * items_per_block; // minus|plus two to prevent overflow weirdness - const T mini = std::numeric_limits::min() + static_cast(2); - const T maxi = std::numeric_limits::max() - static_cast(2); + const T mini = _HIPCUB_STD::numeric_limits::min() + static_cast(2); + const T maxi = _HIPCUB_STD::numeric_limits::max() - static_cast(2); T default_val = static_cast(compare_op(mini, maxi) ? maxi : mini); const int valid_items_arr[8] = {items_per_block / 2, diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp index 0388c295793..56c8be53220 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp @@ -448,11 +448,11 @@ TYPED_TEST(HipcubBlockRadixSort, SortKeys) } else { - keys_output - = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value); + keys_output = test_utils::get_random_data( + size, + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value); } // Calculate expected results on host @@ -570,11 +570,11 @@ TYPED_TEST(HipcubBlockRadixSort, SortKeysValues) } else { - keys_output - = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value); + keys_output = test_utils::get_random_data( + size, + 
_HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value); } std::vector values_output; @@ -588,11 +588,11 @@ TYPED_TEST(HipcubBlockRadixSort, SortKeysValues) } else { - values_output - = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value + seed_value_addition); + values_output = test_utils::get_random_data( + size, + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value + seed_value_addition); } using key_value = std::pair; diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp index c0d9a00cb70..1f277c94c1b 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp @@ -145,10 +145,11 @@ TYPED_TEST(HipcubBlockRunLengthDecodeTest, TestDecode) static_cast(_HIPCUB_STD::numeric_limits::max()))); size_t num_runs = runs_per_thread * block_size; - auto run_items = test_utils::get_random_data(num_runs, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value); + auto run_items + = test_utils::get_random_data(num_runs, + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value); auto run_lengths = test_utils::get_random_data(num_runs, static_cast(1), max_run_length, @@ -161,8 +162,8 @@ TYPED_TEST(HipcubBlockRunLengthDecodeTest, TestDecode) const auto empty_run_items = test_utils::get_random_data(num_trailing_empty_runs, - std::numeric_limits::min(), - std::numeric_limits::max(), + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), seed_value); // Not strictly required, but fixes a spurious GCC warning and good practice anyways run_items.reserve(run_items.size() + empty_run_items.size()); diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp 
b/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp index a2cc5a6c761..0a8de808c08 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp @@ -374,7 +374,7 @@ TYPED_TEST(HipcubDeviceHistogramEvenOverflow, EvenOverflow) const native_level_type n_lower_level = 0; const native_level_type n_upper_level - = static_cast(std::numeric_limits::max()); + = static_cast(_HIPCUB_STD::numeric_limits::max()); level_type lower_level = test_utils::convert_to_device(n_lower_level); level_type upper_level = test_utils::convert_to_device(n_upper_level); @@ -798,9 +798,9 @@ TYPED_TEST(HipcubDeviceHistogramMultiEven, MultiEven) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); std::vector channel_seeds = test_utils::get_random_data( - std::max(size, static_cast(channels)), - std::numeric_limits::min(), - std::numeric_limits::max(), + _HIPCUB_STD::max(size, static_cast(channels)), + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), seed_value + seed_value_addition // Make sure that we do not use the same or shifted sequence ); @@ -1100,9 +1100,9 @@ TYPED_TEST(HipcubDeviceHistogramMultiRange, MultiRange) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); std::vector channel_seeds = test_utils::get_random_data( - std::max(size, static_cast(channels)), - std::numeric_limits::min(), - std::numeric_limits::max(), + _HIPCUB_STD::max(size, static_cast(channels)), + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), seed_value); // Generate data diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp index 70934d35940..de5189890c0 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp @@ -1303,8 +1303,8 @@ inline void sort_keys_over_4g() std::vector 
keys_input = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), seed_value); //generate histogram of the randomly generated values @@ -1358,7 +1358,7 @@ inline void sort_keys_over_4g() hipMemcpyDeviceToHost)); size_t counter = 0; - for(size_t i = 0; i <= std::numeric_limits::max(); ++i) + for(size_t i = 0; i <= _HIPCUB_STD::numeric_limits::max(); ++i) { for(size_t j = 0; j < histogram[i]; ++j) { @@ -1464,7 +1464,7 @@ inline void sort_keys_large_sizes() HIP_CHECK(hipFree(d_keys)); // Check if output values are as expected - const size_t unique_keys = size_t(std::numeric_limits::max()) + 1; + const size_t unique_keys = size_t(_HIPCUB_STD::numeric_limits::max()) + 1; const size_t segment_length = test_utils::ceiling_div(size, unique_keys); const size_t full_segments = size % unique_keys == 0 ? unique_keys : size % unique_keys; for(size_t i = 0; i < size; i += 4321) diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp index b887dcbeda2..cc23baf7270 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp @@ -94,7 +94,7 @@ std::vector std::default_random_engine prng(seed_value); std::uniform_int_distribution segment_length_distribution(max_segment_length); - std::uniform_int_distribution key_distribution(std::numeric_limits::max()); + std::uniform_int_distribution key_distribution(_HIPCUB_STD::numeric_limits::max()); std::vector keys(size); size_t keys_start_index = 0; diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_radix_sort.hpp b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_radix_sort.hpp index d32259dc421..7f6d28ac110 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_radix_sort.hpp +++ 
b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_radix_sort.hpp @@ -104,11 +104,11 @@ inline void sort_keys() } else { - keys_input - = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value + seed_value_addition); + keys_input = test_utils::get_random_data( + size, + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value + seed_value_addition); } std::vector offsets; @@ -255,11 +255,11 @@ inline void sort_keys_empty_data() } else { - keys_input - = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value + seed_value_addition); + keys_input = test_utils::get_random_data( + size, + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value + seed_value_addition); } std::vector offsets(2); @@ -382,10 +382,11 @@ inline void sort_keys_large_segments() } else { - keys_input = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value + seed_value_addition); + keys_input = test_utils::get_random_data( + size, + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value + seed_value_addition); } std::vector offsets(3); @@ -526,11 +527,11 @@ inline void sort_keys_unspecified_ranges() } else { - keys_input - = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value + seed_value_addition); + keys_input = test_utils::get_random_data( + size, + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value + seed_value_addition); } std::vector begin_offsets; @@ -709,11 +710,11 @@ inline void sort_pairs() } else { - keys_input - = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value + seed_value_addition); + keys_input = test_utils::get_random_data( + size, + _HIPCUB_STD::numeric_limits::min(), + 
_HIPCUB_STD::numeric_limits::max(), + seed_value + seed_value_addition); } std::vector offsets; @@ -906,11 +907,11 @@ inline void sort_pairs_unspecified_ranges() } else { - keys_input - = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value + seed_value_addition); + keys_input = test_utils::get_random_data( + size, + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value + seed_value_addition); } std::vector values_input(size); @@ -1132,11 +1133,11 @@ inline void sort_keys_double_buffer() } else { - keys_input - = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value + seed_value_addition); + keys_input = test_utils::get_random_data( + size, + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value + seed_value_addition); } std::vector offsets; @@ -1289,11 +1290,11 @@ inline void sort_pairs_double_buffer() } else { - keys_input - = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value + seed_value_addition); + keys_input = test_utils::get_random_data( + size, + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value + seed_value_addition); } std::vector offsets; diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp index 7a537bab73d..2cbc11d35bc 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp @@ -438,7 +438,7 @@ TYPED_TEST(HipcubDeviceSegmentedReduce, Min) using result_type = output_type; using offset_type = unsigned int; - constexpr input_type init = std::numeric_limits::max(); + constexpr input_type init = _HIPCUB_STD::numeric_limits::max(); reduce_op_type reduce_op; std::random_device rd; @@ -595,7 +595,7 @@ 
TYPED_TEST(HipcubDeviceSegmentedReduce, Max) using result_type = output_type; using offset_type = unsigned int; - constexpr input_type init = std::numeric_limits::lowest(); + constexpr input_type init = _HIPCUB_STD::numeric_limits::lowest(); reduce_op_type reduce_op; std::random_device rd; diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_sort.hpp b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_sort.hpp index b52f4290bdc..66d58468f64 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_sort.hpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_sort.hpp @@ -109,10 +109,11 @@ inline void generate_input_data(std::vector& keys_input, } else { - keys_input = test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value + seed_value_addition); + keys_input + = test_utils::get_random_data(size, + _HIPCUB_STD::numeric_limits::min(), + _HIPCUB_STD::numeric_limits::max(), + seed_value + seed_value_addition); } offsets.clear(); diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_load.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_load.cpp index 71cead9b29a..7a0c30a96ca 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_load.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_load.cpp @@ -232,14 +232,14 @@ TYPED_TEST(HipcubWarpLoadTest, WarpLoadGuarded) SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); HIP_CHECK(hipSetDevice(device_id)); - using T = typename TestFixture::params::type; - constexpr unsigned warp_size = TestFixture::params::warp_size; - constexpr ::hipcub::WarpLoadAlgorithm algorithm = TestFixture::params::algorithm; - constexpr unsigned items_per_thread = 4; - constexpr unsigned block_size = 1024; - constexpr unsigned items_count = items_per_thread * block_size; - constexpr int valid_items = warp_size / 4; - constexpr T oob_default = std::numeric_limits::max(); + using T = typename TestFixture::params::type; + constexpr 
unsigned warp_size = TestFixture::params::warp_size; + constexpr ::hipcub::WarpLoadAlgorithm algorithm = TestFixture::params::algorithm; + constexpr unsigned items_per_thread = 4; + constexpr unsigned block_size = 1024; + constexpr unsigned items_count = items_per_thread * block_size; + constexpr int valid_items = warp_size / 4; + constexpr T oob_default = _HIPCUB_STD::numeric_limits::max(); SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size); diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp index be4fbb2e827..24180e97f69 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp @@ -28,7 +28,6 @@ #include #include -#include #include #include @@ -202,12 +201,12 @@ struct sort_last; template struct sort_last { - static constexpr T value = std::numeric_limits::max(); + static constexpr T value = _HIPCUB_STD::numeric_limits::max(); }; template struct sort_last { - static constexpr T value = std::numeric_limits::lowest(); + static constexpr T value = _HIPCUB_STD::numeric_limits::lowest(); }; template( size, - std::numeric_limits::lowest(), - std::numeric_limits::max(), + _HIPCUB_STD::numeric_limits::lowest(), + _HIPCUB_STD::numeric_limits::max(), seed_value); const auto segment_sizes = test_utils::get_random_data( @@ -519,8 +518,8 @@ TYPED_TEST(HipcubWarpMergeSort, SortKeysValuesSegmented) seed_value) : test_utils::get_random_data( size, - std::numeric_limits::lowest(), - std::numeric_limits::max(), + _HIPCUB_STD::numeric_limits::lowest(), + _HIPCUB_STD::numeric_limits::max(), seed_value); using value_wrapped_type = typename test_utils::inner_type::type; @@ -533,8 +532,8 @@ TYPED_TEST(HipcubWarpMergeSort, SortKeysValuesSegmented) seed_value) : test_utils::get_random_data( size, - std::numeric_limits::lowest(), - std::numeric_limits::max(), + _HIPCUB_STD::numeric_limits::lowest(), + _HIPCUB_STD::numeric_limits::max(), 
seed_value ^ (seed_value >> 1ul)); const auto segment_sizes = test_utils::get_random_data( @@ -667,8 +666,8 @@ TYPED_TEST(HipcubWarpMergeSort, SortKeys) seed_value) : test_utils::get_random_data( size, - std::numeric_limits::lowest(), - std::numeric_limits::max(), + _HIPCUB_STD::numeric_limits::lowest(), + _HIPCUB_STD::numeric_limits::max(), seed_value); const auto compare = typename params::compare_function{}; @@ -766,8 +765,8 @@ TYPED_TEST(HipcubWarpMergeSort, SortKeysValues) seed_value) : test_utils::get_random_data( size, - std::numeric_limits::lowest(), - std::numeric_limits::max(), + _HIPCUB_STD::numeric_limits::lowest(), + _HIPCUB_STD::numeric_limits::max(), seed_value); using value_wrapped_type = typename test_utils::inner_type::type; @@ -780,8 +779,8 @@ TYPED_TEST(HipcubWarpMergeSort, SortKeysValues) seed_value) : test_utils::get_random_data( size, - std::numeric_limits::lowest(), - std::numeric_limits::max(), + _HIPCUB_STD::numeric_limits::lowest(), + _HIPCUB_STD::numeric_limits::max(), seed_value ^ (seed_value >> 1ul)); const auto compare = typename params::compare_function{}; diff --git a/projects/hipcub/test/hipcub/test_utils.hpp b/projects/hipcub/test/hipcub/test_utils.hpp index adaf0bd9fdc..88b4e2d7ac6 100644 --- a/projects/hipcub/test/hipcub/test_utils.hpp +++ b/projects/hipcub/test/hipcub/test_utils.hpp @@ -639,17 +639,17 @@ namespace std static constexpr inline T max() { - return std::numeric_limits::max(); + return _HIPCUB_STD::numeric_limits::max(); } static constexpr inline T min() { - return std::numeric_limits::min(); + return _HIPCUB_STD::numeric_limits::min(); } static constexpr inline T lowest() { - return std::numeric_limits::lowest(); + return _HIPCUB_STD::numeric_limits::lowest(); } }; @@ -662,17 +662,17 @@ namespace std static constexpr inline T max() { - return std::numeric_limits::max(); + return _HIPCUB_STD::numeric_limits::max(); } static constexpr inline T min() { - return std::numeric_limits::min(); + return 
_HIPCUB_STD::numeric_limits::min(); } static constexpr inline T lowest() { - return std::numeric_limits::lowest(); + return _HIPCUB_STD::numeric_limits::lowest(); } }; } diff --git a/projects/hipcub/test/hipcub/test_utils_data_generation.hpp b/projects/hipcub/test/hipcub/test_utils_data_generation.hpp index eb3af3ee54a..d5a0526ea65 100644 --- a/projects/hipcub/test/hipcub/test_utils_data_generation.hpp +++ b/projects/hipcub/test/hipcub/test_utils_data_generation.hpp @@ -48,14 +48,15 @@ T set_half_bits(uint16_t value) // Numeric limits which also supports custom_test_type classes template -struct numeric_limits : std::numeric_limits +struct numeric_limits : _HIPCUB_STD::numeric_limits {}; template<> -struct numeric_limits : public std::numeric_limits +struct numeric_limits : public _HIPCUB_STD::numeric_limits { public: using T = test_utils::half; + static constexpr bool is_specialized = true; static inline T min() { return T(0.00006104f); @@ -74,11 +75,11 @@ struct numeric_limits : public std::numeric_limits::quiet_NaN()); + return T(_HIPCUB_STD::numeric_limits::quiet_NaN()); }; static inline T signaling_NaN() { - return T(std::numeric_limits::signaling_NaN()); + return T(_HIPCUB_STD::numeric_limits::signaling_NaN()); }; static inline T infinity_neg() { @@ -87,17 +88,19 @@ struct numeric_limits : public std::numeric_limits -class numeric_limits : public std::numeric_limits +class numeric_limits + : public _HIPCUB_STD::numeric_limits { public: using T = test_utils::bfloat16; + static constexpr bool is_specialized = true; static inline T max() { return set_half_bits(0x7f7f); }; static inline T min() { - return T(std::numeric_limits::min()); + return T(_HIPCUB_STD::numeric_limits::min()); }; static inline T lowest() { @@ -109,11 +112,11 @@ class numeric_limits : public std::numeric_limits::quiet_NaN()); + return T(_HIPCUB_STD::numeric_limits::quiet_NaN()); }; static inline T signaling_NaN() { - return T(std::numeric_limits::signaling_NaN()); + return 
T(_HIPCUB_STD::numeric_limits::signaling_NaN()); }; static inline T infinity_neg() { @@ -122,12 +125,12 @@ class numeric_limits : public std::numeric_limits -class numeric_limits : public std::numeric_limits +class numeric_limits : public _HIPCUB_STD::numeric_limits { public: static inline float infinity_neg() { - return -std::numeric_limits::infinity(); + return -_HIPCUB_STD::numeric_limits::infinity(); }; }; // End of extended numeric_limits From 785bff85f2ba0851eac218a20360226fc74679d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Tue, 9 Dec 2025 19:48:46 +0000 Subject: [PATCH 30/95] Move iterator traits to hip::std::iterator_traits --- projects/hipcub/benchmark/benchmark_utils.hpp | 27 +++--- .../include/hipcub/backend/cub/util_type.hpp | 89 +++++++++++++++++++ .../backend/rocprim/device/device_for.hpp | 22 ++--- .../rocprim/device/device_histogram.hpp | 2 +- .../backend/rocprim/device/device_reduce.hpp | 23 +++-- .../backend/rocprim/device/device_scan.hpp | 7 +- .../device/device_segmented_reduce.hpp | 18 ++-- .../iterator/arg_index_input_iterator.hpp | 4 +- .../backend/rocprim/thread/thread_load.hpp | 11 ++- .../rocprim/thread/thread_operators.hpp | 16 ++-- .../hipcub/backend/rocprim/util_type.hpp | 25 ++++++ .../hipcub/include/hipcub/util_type.hpp | 4 +- .../test_hipcub_block_load_store.kernels.hpp | 8 +- projects/hipcub/test/hipcub/test_utils.hpp | 53 +++++------ 14 files changed, 208 insertions(+), 101 deletions(-) create mode 100644 projects/hipcub/hipcub/include/hipcub/backend/cub/util_type.hpp diff --git a/projects/hipcub/benchmark/benchmark_utils.hpp b/projects/hipcub/benchmark/benchmark_utils.hpp index e6b5af7e8d2..9abcc6d93ea 100644 --- a/projects/hipcub/benchmark/benchmark_utils.hpp +++ b/projects/hipcub/benchmark/benchmark_utils.hpp @@ -116,12 +116,12 @@ inline T get_random_value(T min, T max) // Can't use std::prefix_sum for inclusive/exclusive scan, because // it does not handle short[] -> int(int a, int b) { a + b; } 
-> int[] // they way we expect. That's because sum in std::prefix_sum's implementation -// is of type typename std::iterator_traits::value_type (short) +// is of type ::hipcub::detail::it_value_t (short) template OutputIt host_inclusive_scan(InputIt first, InputIt last, OutputIt d_first, BinaryOperation op) { - using input_type = typename std::iterator_traits::value_type; - using output_type = typename std::iterator_traits::value_type; + using input_type = ::hipcub::detail::it_value_t; + using output_type = ::hipcub::detail::it_value_t; using result_type = typename std::conditional::value, input_type, output_type>::type; @@ -143,8 +143,8 @@ template OutputIt host_exclusive_scan( InputIt first, InputIt last, T initial_value, OutputIt d_first, BinaryOperation op) { - using input_type = typename std::iterator_traits::value_type; - using output_type = typename std::iterator_traits::value_type; + using input_type = ::hipcub::detail::it_value_t; + using output_type = ::hipcub::detail::it_value_t; using result_type = typename std::conditional::value, input_type, output_type>::type; @@ -177,8 +177,8 @@ OutputIt host_exclusive_scan_by_key(InputIt first, BinaryOperation op, KeyCompare key_compare_op) { - using input_type = typename std::iterator_traits::value_type; - using output_type = typename std::iterator_traits::value_type; + using input_type = ::hipcub::detail::it_value_t; + using output_type = ::hipcub::detail::it_value_t; using result_type = typename std::conditional::value, input_type, output_type>::type; @@ -419,9 +419,6 @@ inline constexpr bool is_power_of_two(const T x) return (x > 0) && ((x & (x - 1)) == 0); } -template -using it_value_t = typename std::iterator_traits::value_type; - using engine_type = std::default_random_engine; // generate_random_data_n() generates only part of sequence and replicates it, @@ -429,9 +426,10 @@ using engine_type = std::default_random_engine; template inline auto generate_random_data_n( OutputIter it, size_t size, U min, V max, 
Generator& gen, size_t max_random_size = 1024 * 1024) - -> typename std::enable_if_t>::value, OutputIter> + -> typename std::enable_if_t>::value, + OutputIter> { - using T = it_value_t; + using T = ::hipcub::detail::it_value_t; using dis_type = typename std::conditional<(sizeof(T) == 1), short, T>::type; std::uniform_int_distribution distribution((T)min, (T)max); @@ -452,9 +450,10 @@ inline auto generate_random_data_n(OutputIterator it, V max, Generator& gen, size_t max_random_size = 1024 * 1024) - -> std::enable_if_t>::value, OutputIterator> + -> std::enable_if_t>::value, + OutputIterator> { - using T = typename std::iterator_traits::value_type; + using T = ::hipcub::detail::it_value_t; std::uniform_real_distribution distribution((T)min, (T)max); std::generate_n(it, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_type.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_type.hpp new file mode 100644 index 00000000000..f81eb00dcf4 --- /dev/null +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_type.hpp @@ -0,0 +1,89 @@ +/****************************************************************************** + * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#ifndef HIPCUB_CUB_UTIL_TYPE_HPP_ +#define HIPCUB_CUB_UTIL_TYPE_HPP_ + +#include "../../config.hpp" +#include "../../util_deprecated.hpp" + +#include _HIPCUB_STD_INCLUDE(iterator) + +#include // IWYU pragma: export + +BEGIN_HIPCUB_NAMESPACE + +namespace detail +{ +// the following iterator helpers are not named iter_value_t etc, like the C++20 facilities, because they are defined in +// terms of C++17 iterator_traits and not the new C++20 indirectly_readable trait etc. This allows them to detect nested +// value_type, difference_type and reference aliases, which the new C+20 traits do not consider (they only consider +// specializations of iterator_traits). Also, a value_type of void remains supported (needed by some output iterators). 
+ +template +struct it_traits +{ + using value_type = typename _HIPCUB_STD::iterator_traits::value_type; + using reference = typename _HIPCUB_STD::iterator_traits::reference; + using difference_type = typename _HIPCUB_STD::iterator_traits::difference_type; + using pointer = typename _HIPCUB_STD::iterator_traits::pointer; +}; +template +struct it_traits> +{ + using value_type = typename It::value_type; + using reference = typename It::reference; + using difference_type = typename It::difference_type; + using pointer = typename It::pointer; +}; +template +using it_value_t = typename it_traits::value_type; +template +using it_reference_t = typename it_traits::reference; +template +using it_difference_t = typename it_traits::difference_type; +template +using it_pointer_t = typename it_traits::pointer; + +// use this whenever you need to lazily evaluate a trait. E.g., as an alternative in replace_if_use_default. +template typename Trait, typename... Args> +struct lazy_trait +{ + using type = Trait; +}; + +} // namespace detail + +END_HIPCUB_NAMESPACE + +#endif // HIPCUB_CUB_UTIL_TYPE_HPP_ diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_for.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_for.hpp index 9712599bd77..ec156316c8f 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_for.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_for.hpp @@ -117,11 +117,10 @@ struct DeviceFor OpT op, hipStream_t stream = 0) -> std::enable_if_t()), - typename std::iterator_traits< - RandomAccessIteratorT>::value_type>::value, + detail::it_value_t>::value, hipError_t> { - using T = typename std::iterator_traits::value_type; + using T = detail::it_value_t; detail::bulk::OpWrapper wrapper_op = {op}; @@ -137,14 +136,15 @@ struct DeviceFor template HIPCUB_RUNTIME_FUNCTION - static auto - ForEachN(RandomAccessIteratorT first, OffsetT num_items, OpT op, hipStream_t stream = 0) - -> 
std::enable_if_t()), - typename std::iterator_traits< - RandomAccessIteratorT>::value_type>::value, - hipError_t> + static auto ForEachN(RandomAccessIteratorT first, + OffsetT num_items, + OpT op, + hipStream_t stream = 0) + -> std::enable_if_t()), + detail::it_value_t>::value, + hipError_t> { - using T = typename std::iterator_traits::value_type; + using T = detail::it_value_t; detail::bulk::OpWrapper wrapper_op = {op}; @@ -207,7 +207,7 @@ HIPCUB_RUNTIME_FUNCTION OpT op, hipStream_t stream = 0) { - using offset_t = typename std::iterator_traits::difference_type; + using offset_t = detail::it_difference_t; const offset_t num_items = static_cast(std::distance(first, last)); return ForEachN(first, num_items, op, stream); diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp index a0ce2b97ceb..8f9ef71e757 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp @@ -87,7 +87,7 @@ template HIPCUB_HOST_DEVICE HIPCUB_FORCEINLINE hipError_t check_overflow(LevelT lower_level, LevelT upper_level, int num_levels) { - using sample_type = typename std::iterator_traits::value_type; + using sample_type = it_value_t; using common_type = typename hipcub::common_type::type; static_assert(std::is_convertible::value, "The common type of `LevelT` and `SampleT` must be " diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp index c9f19bf0573..a9ef8480e54 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp @@ -230,9 +230,9 @@ class DeviceReduce NumItemsT num_items, hipStream_t stream = 0) { - using 
InputT = typename std::iterator_traits::value_type; - using OutputT = typename std::iterator_traits::value_type; - using InitT = hipcub::detail::non_void_value_t; + using InputT = detail::it_value_t; + using OutputT = detail::it_value_t; + using InitT = detail::non_void_value_t; return Reduce(d_temp_storage, temp_storage_bytes, d_in, @@ -265,7 +265,7 @@ class DeviceReduce NumItemsT num_items, hipStream_t stream = 0) { - using T = typename std::iterator_traits::value_type; + using T = detail::it_value_t; return Reduce(d_temp_storage, temp_storage_bytes, d_in, @@ -303,8 +303,8 @@ class DeviceReduce hipStream_t stream = 0) { using OffsetT = NumItemsT; - using T = typename std::iterator_traits::value_type; - using O = typename std::iterator_traits::value_type; + using T = detail::it_value_t; + using O = detail::it_value_t; using OutputTupleT = hipcub::detail::non_void_value_t>; using OutputValueT = typename OutputTupleT::Value; @@ -382,7 +382,7 @@ class DeviceReduce NumItemsT num_items, hipStream_t stream = 0) { - using T = typename std::iterator_traits::value_type; + using T = detail::it_value_t; return Reduce(d_temp_storage, temp_storage_bytes, d_in, @@ -420,9 +420,9 @@ class DeviceReduce hipStream_t stream = 0) { using OffsetT = NumItemsT; - using T = typename std::iterator_traits::value_type; - using O = typename std::iterator_traits::value_type; - using OutputTupleT = hipcub::detail::non_void_value_t>; + using T = detail::it_value_t; + using O = detail::it_value_t; + using OutputTupleT = detail::non_void_value_t>; using OutputValueT = typename OutputTupleT::Value; using IteratorT = ArgIndexInputIterator; @@ -547,8 +547,7 @@ class DeviceReduce NumItemsT num_items, hipStream_t stream = 0) { - using key_compare_op - = ::rocprim::equal_to::value_type>; + using key_compare_op = ::rocprim::equal_to>; return ::rocprim::reduce_by_key(d_temp_storage, temp_storage_bytes, d_keys_in, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp 
b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp index 2e8d70581a3..0a627af77a8 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp @@ -230,7 +230,7 @@ class DeviceScan NumItemsT num_items, hipStream_t stream = 0) { - using T = typename std::iterator_traits::value_type; + using T = detail::it_value_t; return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, @@ -504,7 +504,7 @@ class DeviceScan EqualityOpT equality_op = EqualityOpT(), hipStream_t stream = 0) { - using in_value_type = typename std::iterator_traits::value_type; + using in_value_type = detail::it_value_t; return ExclusiveScanByKey(d_temp_storage, temp_storage_bytes, @@ -689,8 +689,7 @@ class DeviceScan EqualityOpT equality_op = EqualityOpT(), hipStream_t stream = 0) { - using acc_t = ::rocprim:: - accumulator_t::value_type>; + using acc_t = ::rocprim::accumulator_t>; return ::rocprim::inclusive_scan_by_key<::rocprim::default_config, KeysInputIteratorT, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp index f978bed008d..d0d5c029dd5 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp @@ -87,7 +87,7 @@ inline hipError_t launch_segmented_arg_minmax(::rocprim::detail::target current_ const unsigned int segment_id = ::rocprim::detail::block_id<0>(); // Large indices need bigger offset type than unsigned int - using offset_type = typename std::iterator_traits::value_type; + using offset_type = it_value_t; const offset_type begin_offset = begin_offsets[segment_id]; const offset_type end_offset = end_offsets[segment_id]; @@ -134,7 +134,7 @@ inline hipError_t 
segmented_arg_minmax(void* temporary_storage, InitValueType empty_value, hipStream_t stream) { - using input_type = typename std::iterator_traits::value_type; + using input_type = detail::it_value_t; using result_type = ::rocprim::accumulator_t; using selector = ::rocprim::detail::segmented_reduce_config_selector; @@ -255,7 +255,7 @@ struct DeviceSegmentedReduce OffsetIteratorT d_end_offsets, hipStream_t stream = 0) { - using input_type = typename std::iterator_traits::value_type; + using input_type = detail::it_value_t; return Reduce(d_temp_storage, temp_storage_bytes, @@ -303,7 +303,7 @@ struct DeviceSegmentedReduce OffsetIteratorT d_end_offsets, hipStream_t stream = 0) { - using input_type = typename std::iterator_traits::value_type; + using input_type = detail::it_value_t; return Reduce(d_temp_storage, temp_storage_bytes, @@ -352,8 +352,8 @@ struct DeviceSegmentedReduce hipStream_t stream = 0) { using OffsetT = int; - using T = typename std::iterator_traits::value_type; - using O = typename std::iterator_traits::value_type; + using T = hipcub::detail::it_value_t; + using O = hipcub::detail::it_value_t; using OutputTupleT = typename std::conditional, KeyValuePair, O>::type; @@ -415,7 +415,7 @@ struct DeviceSegmentedReduce OffsetIteratorT d_end_offsets, hipStream_t stream = 0) { - using input_type = typename std::iterator_traits::value_type; + using input_type = detail::it_value_t; return Reduce(d_temp_storage, temp_storage_bytes, @@ -464,8 +464,8 @@ struct DeviceSegmentedReduce hipStream_t stream = 0) { using OffsetT = int; - using T = typename std::iterator_traits::value_type; - using O = typename std::iterator_traits::value_type; + using T = hipcub::detail::it_value_t; + using O = hipcub::detail::it_value_t; using OutputTupleT = typename std::conditional, KeyValuePair, O>::type; diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/arg_index_input_iterator.hpp 
b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/arg_index_input_iterator.hpp index 1acdb507278..199aadc78fa 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/arg_index_input_iterator.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/arg_index_input_iterator.hpp @@ -32,6 +32,8 @@ #include "../../../config.hpp" +#include "../util_type.hpp" + #include "iterator_category.hpp" #include "iterator_wrapper.hpp" @@ -45,7 +47,7 @@ BEGIN_HIPCUB_NAMESPACE template::value_type> + class InputValueType = detail::it_value_t> class ArgIndexInputIterator : public detail::IteratorWrapper< rocprim::arg_index_iterator, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_load.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_load.hpp index 655814697d4..1f26e85ed56 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_load.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_load.hpp @@ -77,10 +77,10 @@ HIPCUB_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Fundamental /* is_fundame template HIPCUB_DEVICE -HIPCUB_FORCEINLINE typename std::iterator_traits::value_type - ThreadLoad(InputIteratorT itr, - detail::int_constant_t /*modifier*/, - ::std::false_type /*is_pointer*/) +HIPCUB_FORCEINLINE detail::it_value_t + ThreadLoad(InputIteratorT itr, + detail::int_constant_t /*modifier*/, + ::std::false_type /*is_pointer*/) { return rocprim::thread_load(itr); } @@ -96,8 +96,7 @@ HIPCUB_FORCEINLINE T ThreadLoad(T* ptr, template HIPCUB_DEVICE -HIPCUB_FORCEINLINE - typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr) +HIPCUB_FORCEINLINE detail::it_value_t ThreadLoad(InputIteratorT itr) { return ThreadLoad(itr, detail::int_constant_t(), diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp 
index 7b967cefac9..0e5068e1e82 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp @@ -570,16 +570,16 @@ using accumulator_t = ::rocprim::accumulator_t; // // /// The output value type // using OutputT = -// typename If<(Equals::value_type, +// typename If<(Equals, // void>::VALUE), // OutputT = (if output iterator's value type is void) ? -// typename std::iterator_traits< -// InputIteratorT>::value_type, // ... then the input iterator's value type, -// typename std::iterator_traits::value_type>:: +// it_value_t< +// InputIteratorT>, // ... then the input iterator's value type, +// it_value_t>:: // Type; // ... else the output iterator's value type // // rocPRIM (as well as Thrust) uses result type of BinaryFunction instead (if not void): // -// using input_type = typename std::iterator_traits::value_type; +// using input_type = detail::it_value_t; // using result_type = ::rocprim::accumulator_t; // // For short -> float using Sum() @@ -594,8 +594,8 @@ template< > struct convert_result_type_wrapper { - using input_type = typename std::iterator_traits::value_type; - using output_type = typename std::iterator_traits::value_type; + using input_type = detail::it_value_t; + using output_type = it_value_t; using result_type = non_void_value_t; convert_result_type_wrapper(BinaryFunction op) : op(op) {} @@ -636,7 +636,7 @@ convert_result_type(BinaryFunction op) template struct convert_binary_result_type_wrapper { - using input_type = typename std::iterator_traits::value_type; + using input_type = detail::it_value_t; using init_type = InitT; using accum_type = accumulator_t; diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp index 156d341374b..6fb669648a9 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp +++ 
b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp @@ -43,6 +43,7 @@ #include #include _HIPCUB_STD_INCLUDE(limits) +#include _HIPCUB_STD_INCLUDE(iterator) #include @@ -94,6 +95,30 @@ struct PowerOfTwo namespace detail { +// the following iterator helpers are not named iter_value_t etc, like the C++20 facilities, because they are defined in +// terms of C++17 iterator_traits and not the new C++20 indirectly_readable trait etc. This allows them to detect nested +// value_type, difference_type and reference aliases, which the new C+20 traits do not consider (they only consider +// specializations of iterator_traits). Also, a value_type of void remains supported (needed by some output iterators). + +template +using it_value_t = typename _HIPCUB_STD::iterator_traits::value_type; + +template +using it_reference_t = typename _HIPCUB_STD::iterator_traits::reference; + +template +using it_difference_t = typename _HIPCUB_STD::iterator_traits::difference_type; + +template +using it_pointer_t = typename _HIPCUB_STD::iterator_traits::pointer; + +// use this whenever you need to lazily evaluate a trait. E.g., as an alternative in replace_if_use_default. +template typename Trait, typename... Args> +struct lazy_trait +{ + using type = Trait; +}; + template struct Log2Impl { diff --git a/projects/hipcub/hipcub/include/hipcub/util_type.hpp b/projects/hipcub/hipcub/include/hipcub/util_type.hpp index 9f755ce45d6..6dd78870928 100644 --- a/projects/hipcub/hipcub/include/hipcub/util_type.hpp +++ b/projects/hipcub/hipcub/include/hipcub/util_type.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2020-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2020-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,7 +33,7 @@ #ifdef __HIP_PLATFORM_AMD__ #include "backend/rocprim/util_type.hpp" // IWYU pragma: export #elif defined(__HIP_PLATFORM_NVIDIA__) - #include // IWYU pragma: export + #include "backend/cub/util_type.hpp" // IWYU pragma: export #endif diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.kernels.hpp b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.kernels.hpp index a126893664c..94cef4f8504 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.kernels.hpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.kernels.hpp @@ -148,15 +148,15 @@ __launch_bounds__(BlockSize) __global__ }; // The input value type - using InputT = typename std::iterator_traits::value_type; + using InputT = hipcub::detail::it_value_t; // The output value type using OutputT = typename std::conditional< - (std::is_same_v::value_type, + (std::is_same_v, void>), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's + hipcub::detail::it_value_t, // ... then the input iterator's // value type, - typename std::iterator_traits::value_type>:: + hipcub::detail::it_value_t>:: type; // ... 
else the output iterator's value type // Threadblock load/store abstraction types diff --git a/projects/hipcub/test/hipcub/test_utils.hpp b/projects/hipcub/test/hipcub/test_utils.hpp index 88b4e2d7ac6..961178fba18 100644 --- a/projects/hipcub/test/hipcub/test_utils.hpp +++ b/projects/hipcub/test/hipcub/test_utils.hpp @@ -167,7 +167,7 @@ OutputIt host_inclusive_scan_impl( template OutputIt host_inclusive_scan(InputIt first, InputIt last, OutputIt d_first, BinaryOperation op) { - using acc_type = typename std::iterator_traits::value_type; + using acc_type = ::hipcub::detail::it_value_t; return host_inclusive_scan_impl(first, last, d_first, op, acc_type{}); } @@ -182,11 +182,10 @@ template< class InputIt, class OutputIt, class T, - std::enable_if_t< - std::is_same_v::value_type, test_utils::bfloat16> - || std::is_same_v::value_type, test_utils::half> - || std::is_same_v::value_type, float>, - bool> + std::enable_if_t, test_utils::bfloat16> + || std::is_same_v, test_utils::half> + || std::is_same_v, float>, + bool> = true> OutputIt host_inclusive_scan(InputIt first, InputIt last, OutputIt d_first, test_utils::plus) { @@ -199,11 +198,10 @@ template< class OutputIt, class InitType, class T, - std::enable_if_t< - std::is_same_v::value_type, test_utils::bfloat16> - || std::is_same_v::value_type, test_utils::half> - || std::is_same_v::value_type, float>, - bool> + std::enable_if_t, test_utils::bfloat16> + || std::is_same_v<::hipcub::detail::it_value_t, test_utils::half> + || std::is_same_v<::hipcub::detail::it_value_t, float>, + bool> = true> OutputIt host_inclusive_scan_init( InputIt first, InputIt last, OutputIt d_first, InitType init_value, test_utils::plus) @@ -240,7 +238,7 @@ template OutputIt host_exclusive_scan( InputIt first, InputIt last, T initial_value, OutputIt d_first, BinaryOperation op) { - using acc_type = typename std::iterator_traits::value_type; + using acc_type = ::hipcub::detail::it_value_t; return host_exclusive_scan_impl(first, last, initial_value, 
d_first, op, acc_type{}); } @@ -249,11 +247,10 @@ template< class T, class OutputIt, class U, - std::enable_if_t< - std::is_same_v::value_type, test_utils::bfloat16> - || std::is_same_v::value_type, test_utils::half> - || std::is_same_v::value_type, float>, - bool> + std::enable_if_t, test_utils::bfloat16> + || std::is_same_v<::hipcub::detail::it_value_t, test_utils::half> + || std::is_same_v<::hipcub::detail::it_value_t, float>, + bool> = true> OutputIt host_exclusive_scan( InputIt first, InputIt last, T initial_value, OutputIt d_first, test_utils::plus) @@ -318,7 +315,7 @@ OutputIt host_exclusive_scan_by_key(InputIt first, BinaryOperation op, KeyCompare key_compare_op) { - using acc_type = typename std::iterator_traits::value_type; + using acc_type = ::hipcub::detail::it_value_t; return host_exclusive_scan_by_key_impl(first, last, k_first, @@ -336,11 +333,10 @@ template< class OutputIt, class U, class KeyCompare, - std::enable_if_t< - std::is_same_v::value_type, test_utils::bfloat16> - || std::is_same_v::value_type, test_utils::half> - || std::is_same_v::value_type, float>, - bool> + std::enable_if_t, test_utils::bfloat16> + || std::is_same_v<::hipcub::detail::it_value_t, test_utils::half> + || std::is_same_v<::hipcub::detail::it_value_t, float>, + bool> = true> OutputIt host_exclusive_scan_by_key(InputIt first, InputIt last, @@ -403,7 +399,7 @@ OutputIt host_inclusive_scan_by_key(InputIt first, BinaryOperation op, KeyCompare key_compare_op) { - using acc_type = typename std::iterator_traits::value_type; + using acc_type = ::hipcub::detail::it_value_t; return host_inclusive_scan_by_key_impl(first, last, k_first, @@ -419,11 +415,10 @@ template< class OutputIt, class U, class KeyCompare, - std::enable_if_t< - std::is_same_v::value_type, test_utils::bfloat16> - || std::is_same_v::value_type, test_utils::half> - || std::is_same_v::value_type, float>, - bool> + std::enable_if_t, test_utils::bfloat16> + || std::is_same_v<::hipcub::detail::it_value_t, test_utils::half> 
+ || std::is_same_v<::hipcub::detail::it_value_t, float>, + bool> = true> OutputIt host_inclusive_scan_by_key(InputIt first, InputIt last, From a5e3050c1f80f3d1218e29310602b8119a4b67e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 3 Dec 2025 09:50:48 +0000 Subject: [PATCH 31/95] Avoid int overflow during multiplication --- .../include/hipcub/backend/rocprim/grid/grid_even_share.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp index a3432cd8e57..9485b7e6338 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp @@ -127,8 +127,8 @@ struct GridEvenShare int avg_tiles_per_block = total_tiles / grid_size; // leftover grains go to big blocks: this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); - this->normal_share_items = avg_tiles_per_block * tile_items; - this->normal_base_offset = big_shares * tile_items; + this->normal_share_items = static_cast(avg_tiles_per_block) * tile_items; + this->normal_base_offset = static_cast(big_shares) * tile_items; this->big_share_items = normal_share_items + tile_items; } From 90f8bb6fd1efc3aed641fbd82085ef6121ce94bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 3 Dec 2025 10:00:09 +0000 Subject: [PATCH 32/95] Remove `HIPCUB_MIN` and `HIPCUB_MAX` --- projects/hipcub/CHANGELOG.md | 1 + .../include/hipcub/backend/cub/util_macro.hpp | 12 ------------ .../rocprim/thread/thread_operators.hpp | 18 ++++++++++-------- .../backend/rocprim/thread/thread_search.hpp | 8 +++++--- .../hipcub/backend/rocprim/util_macro.hpp | 12 ------------ 5 files changed, 16 insertions(+), 35 deletions(-) diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index 
bdeec8d9302..bfcca7e37a8 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -23,6 +23,7 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project * Removed `ConstantInputIterator`, `CountingInputIterator`, `DiscardOutputIterator` and `TransformInputIterator` which were deprecated in hipCUB-4.1.0. * Removed `DeviceSpmv`, which was removed from CUB after CCCL's 2.8.0 release. Use `hipSPARSE` or `rocSPARSE` libraries instead. * Removed `GridBarrier`. +* Removed `HIPCUB_MIN` and `HIPCUB_MAX`. * Removed `LEGACY_PTX_ARCH`. * Removed `hipcub:max` and `hipcub:min`, which were deprecated. Use `hip::std::max` and `hip::std::min` instead. * Deprecated `hipcub::Swap`, use `rocprim::swap` instead. diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_macro.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_macro.hpp index 794d527aade..e974d6946f4 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_macro.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_macro.hpp @@ -36,18 +36,6 @@ BEGIN_HIPCUB_NAMESPACE -/// Deprecated since rocm [7.1] -#ifndef HIPCUB_MAX - /// Select maximum(a, b) - #define HIPCUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) -#endif - -/// Deprecated since rocm [7.1] -#ifndef HIPCUB_MIN - /// Select minimum(a, b) - #define HIPCUB_MIN(a, b) (((b) < (a)) ? 
(b) : (a)) -#endif - /// Deprecated since rocm [7.1] #ifndef HIPCUB_QUOTIENT_FLOOR /// Quotient of x/y rounded down to nearest integer diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp index 0e5068e1e82..27d900de974 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp @@ -38,6 +38,8 @@ #include // IWYU pragma: export #include +#include _HIPCUB_STD_INCLUDE(functional) + #include BEGIN_HIPCUB_NAMESPACE @@ -276,7 +278,7 @@ struct [[deprecated( constexpr uint32_t operator()(int32_t t, int32_t u) const { - return HIPCUB_MIN(t, u); + return _HIPCUB_STD::min(t, u); } }; @@ -290,7 +292,7 @@ struct [[deprecated( constexpr uint32_t operator()(uint32_t t, uint32_t u) const { - return HIPCUB_MIN(t, u); + return _HIPCUB_STD::min(t, u); } }; @@ -306,7 +308,7 @@ struct [[deprecated( __half2 operator()(__half2 t, __half2 u) const { - return HIPCUB_MIN(t, u); + return _HIPCUB_STD::min(t, u); } }; #endif // !defined(__HIP_NO_HALF_OPERATORS__) @@ -322,7 +324,7 @@ struct [[deprecated("SIMD intrinsics are currently not supported on HIP, use Min __hip_bfloat162 operator()(__hip_bfloat162 t, __hip_bfloat162 u) const { - return HIPCUB_MIN(t, u); + return _HIPCUB_STD::min(t, u); } }; @@ -343,7 +345,7 @@ struct [[deprecated( constexpr uint32_t operator()(int32_t t, int32_t u) const { - return HIPCUB_MAX(t, u); + return _HIPCUB_STD::max(t, u); } }; @@ -357,7 +359,7 @@ struct [[deprecated( constexpr uint32_t operator()(uint32_t t, uint32_t u) const { - return HIPCUB_MAX(t, u); + return _HIPCUB_STD::max(t, u); } }; @@ -372,7 +374,7 @@ struct [[deprecated( __half2 operator()(__half2 t, __half2 u) const { - return HIPCUB_MAX(t, u); + return _HIPCUB_STD::max(t, u); } }; #endif // !defined(__HIP_NO_HALF_OPERATORS__) @@ -387,7 +389,7 @@ struct 
[[deprecated("SIMD intrinsics are currently not supported on HIP, use Max __hip_bfloat162 operator()(__hip_bfloat162 t, __hip_bfloat162 u) const { - return HIPCUB_MAX(t, u); + return _HIPCUB_STD::max(t, u); } }; diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_search.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_search.hpp index f383d0b4386..b77c8f85de4 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_search.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_search.hpp @@ -32,6 +32,8 @@ #include "../../../config.hpp" +#include _HIPCUB_STD_INCLUDE(functional) + #include BEGIN_HIPCUB_NAMESPACE @@ -55,8 +57,8 @@ __host__ __device__ __forceinline__ void MergePathSearch( OffsetT b_len, CoordinateT& path_coordinate) { - OffsetT split_min = CUB_MAX(diagonal - b_len, 0); - OffsetT split_max = CUB_MIN(diagonal, a_len); + OffsetT split_min = _HIPCUB_STD::max(diagonal - b_len, 0); + OffsetT split_max = _HIPCUB_STD::min(diagonal, a_len); while (split_min < split_max) { @@ -73,7 +75,7 @@ __host__ __device__ __forceinline__ void MergePathSearch( } } - path_coordinate.x = CUB_MIN(split_min, a_len); + path_coordinate.x = _HIPCUB_STD::min(split_min, a_len); path_coordinate.y = diagonal - split_min; } diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp index d0d2203b2b9..4af5dd21e7c 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp @@ -41,18 +41,6 @@ BEGIN_HIPCUB_NAMESPACE * @{ */ -/// Deprecated since rocm [7.1] -#ifndef HIPCUB_MAX - /// Select maximum(a, b) - #define HIPCUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) -#endif - -/// Deprecated since rocm [7.1] -#ifndef HIPCUB_MIN - /// Select minimum(a, b) - #define HIPCUB_MIN(a, b) (((b) < (a)) ? 
(b) : (a)) -#endif - /// Deprecated since rocm [7.1] #ifndef HIPCUB_QUOTIENT_FLOOR /// Quotient of x/y rounded down to nearest integer From 917f1e1f69e36d8afca8e00a3375d7fde49a498e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 8 Dec 2025 16:33:47 +0000 Subject: [PATCH 33/95] Minimize usage of `hipcub::Traits` --- .../hipcub/backend/rocprim/util_type.hpp | 160 +++++++++++++----- projects/hipcub/test/hipcub/bfloat16.hpp | 29 +--- projects/hipcub/test/hipcub/half.hpp | 14 +- .../hipcub/test_utils_sort_comparator.hpp | 6 +- 4 files changed, 132 insertions(+), 77 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp index 6fb669648a9..50b19c05814 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp @@ -524,18 +524,21 @@ enum Category FLOATING_POINT }; -/** - * \brief Basic type traits - */ -template +namespace detail +{ +struct is_primitive_impl; + +template struct BaseTraits -{}; +{ +private: + friend struct is_primitive_impl; + + static constexpr bool is_primitive = _PRIMITIVE; +}; -/** - * Basic type traits (unsigned primitive specialization) - */ template -struct BaseTraits +struct BaseTraits { using UnsignedBits = _UnsignedBits; @@ -554,6 +557,8 @@ struct BaseTraits return key; } + //! deprecated [Since 5.0] + HIPCUB_DEPRECATED_BECAUSE("Use hip::std::numeric_limits::max()") static HIPCUB_HOST_DEVICE __forceinline__ T Max() { UnsignedBits retval_bits = MAX_KEY; @@ -562,6 +567,8 @@ struct BaseTraits return retval; } + //! 
deprecated [Since 5.0] + HIPCUB_DEPRECATED_BECAUSE("Use hip::std::numeric_limits::lowest()") static HIPCUB_HOST_DEVICE __forceinline__ T Lowest() { UnsignedBits retval_bits = LOWEST_KEY; @@ -569,13 +576,15 @@ struct BaseTraits memcpy(&retval, &retval_bits, sizeof(T)); return retval; } + +private: + friend struct is_primitive_impl; + + static constexpr bool is_primitive = true; }; -/** - * Basic type traits (signed primitive specialization) - */ template -struct BaseTraits +struct BaseTraits { using UnsignedBits = _UnsignedBits; @@ -595,24 +604,30 @@ struct BaseTraits return key ^ HIGH_BIT; }; + //! deprecated [Since 5.0] + HIPCUB_DEPRECATED_BECAUSE("Use hip::std::numeric_limits::max()") static HIPCUB_HOST_DEVICE __forceinline__ T Max() { UnsignedBits retval = MAX_KEY; return reinterpret_cast(retval); } + //! deprecated [Since 5.0] + HIPCUB_DEPRECATED_BECAUSE("Use hip::std::numeric_limits::lowest()") static HIPCUB_HOST_DEVICE __forceinline__ T Lowest() { UnsignedBits retval = LOWEST_KEY; return reinterpret_cast(retval); } + +private: + friend struct is_primitive_impl; + + static constexpr bool is_primitive = true; }; -/** - * Basic type traits (fp primitive specialization) - */ template -struct BaseTraits +struct BaseTraits { using UnsignedBits = _UnsignedBits; @@ -634,71 +649,91 @@ struct BaseTraits return key ^ mask; }; + //! deprecated [Since 5.0] + HIPCUB_DEPRECATED_BECAUSE("Use hip::std::numeric_limits::max()") static HIPCUB_HOST_DEVICE __forceinline__ T Max() { return _HIPCUB_STD::numeric_limits::max(); } + //! deprecated [Since 5.0] + HIPCUB_DEPRECATED_BECAUSE("Use hip::std::numeric_limits::lowest()") static HIPCUB_HOST_DEVICE __forceinline__ T Lowest() { return _HIPCUB_STD::numeric_limits::lowest(); } + +private: + friend struct is_primitive_impl; + + static constexpr bool is_primitive = true; }; +} // namespace detail + +//! Use this class as base when specializing \ref NumericTraits for primitive signed/unsigned integers or floating-point +//! types. 
+template +using BaseTraits = detail::BaseTraits<_CATEGORY, _PRIMITIVE, _UnsignedBits, T>; + +//! Numeric type traits for radix sort key operations, decoupled lookback and tuning. You can specialize this template +//! for your own types if: +//! * There is an unsigned integral type of equal size +//! * The size of the type is smaller than 64bits +//! * The arithmetic throughput of the type is similar to other built-in types of the same size +//! For other types, if you want to use them with radix sort, please use the decomposer interface of the radix sort. -/** - * \brief Numeric type traits - */ template -struct NumericTraits : BaseTraits +struct NumericTraits : BaseTraits {}; template<> -struct NumericTraits : BaseTraits +struct NumericTraits : BaseTraits {}; template<> struct NumericTraits : BaseTraits<(_HIPCUB_STD::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, + true, unsigned char, char> {}; template<> -struct NumericTraits : BaseTraits +struct NumericTraits : BaseTraits {}; template<> -struct NumericTraits : BaseTraits +struct NumericTraits : BaseTraits {}; template<> -struct NumericTraits : BaseTraits +struct NumericTraits : BaseTraits {}; template<> -struct NumericTraits : BaseTraits +struct NumericTraits : BaseTraits {}; template<> -struct NumericTraits : BaseTraits +struct NumericTraits : BaseTraits {}; template<> struct NumericTraits - : BaseTraits + : BaseTraits {}; template<> struct NumericTraits - : BaseTraits + : BaseTraits {}; template<> struct NumericTraits - : BaseTraits + : BaseTraits {}; template<> struct NumericTraits - : BaseTraits + : BaseTraits {}; template<> struct NumericTraits - : BaseTraits + : BaseTraits {}; #if _CCCL_HAS_INT128() @@ -723,11 +758,15 @@ struct NumericTraits<__uint128_t> return key; } + //! deprecated [Since 5.0] + HIPCUB_DEPRECATED_BECAUSE("Use hip::std::numeric_limits::max()") static __host__ __device__ __forceinline__ T Max() { return MAX_KEY; } + //! 
deprecated [Since 5.0] + HIPCUB_DEPRECATED_BECAUSE("Use hip::std::numeric_limits::lowest()") static __host__ __device__ __forceinline__ T Lowest() { return LOWEST_KEY; @@ -756,53 +795,86 @@ struct NumericTraits<__int128_t> return key ^ HIGH_BIT; }; + //! deprecated [Since 5.0] + HIPCUB_DEPRECATED_BECAUSE("Use hip::std::numeric_limits::max()") static __host__ __device__ __forceinline__ T Max() { UnsignedBits retval = MAX_KEY; return reinterpret_cast(retval); } + //! deprecated [Since 5.0] + HIPCUB_DEPRECATED_BECAUSE("Use hip::std::numeric_limits::lowest()") static __host__ __device__ __forceinline__ T Lowest() { UnsignedBits retval = LOWEST_KEY; return reinterpret_cast(retval); } + +private: + friend struct detail::is_primitive_impl; + + static constexpr bool is_primitive = false; }; #endif template<> -struct NumericTraits : BaseTraits +struct NumericTraits : BaseTraits {}; template<> -struct NumericTraits : BaseTraits +struct NumericTraits : BaseTraits {}; template<> -struct NumericTraits<__half> : BaseTraits -{}; +struct NumericTraits<__half> : BaseTraits +{ + using UnsignedBits = unsigned short; +}; template<> struct NumericTraits - : BaseTraits -{}; + : BaseTraits +{ + using UnsignedBits = unsigned short; +}; template<> struct NumericTraits - : BaseTraits::VolatileWord, bool> + : BaseTraits::VolatileWord, bool> {}; -/** - * \brief Type traits - */ +namespace detail +{ template struct Traits : NumericTraits::type> {}; +} // namespace detail + +//! \brief Query type traits for radix sort key operations, decoupled lookback and tunings. To add support for your own +//! primitive types please specialize \ref NumericTraits. 
+template +using Traits = detail::Traits; + namespace detail { -// __uint128_t and __int128_t are not primitive +// we cannot befriend is_primitive on GCC < 11, since it's a template (bug) +struct is_primitive_impl +{ + // must be a struct instead of an alias, so the access of Traits::is_primitive happens in the context of this class + template + struct is_primitive : _HIPCUB_STD::bool_constant::is_primitive> + {}; +}; +// This trait serves two purposes: +// 1. It is used for tunings to detect whether we have a build-in arithmetic type for which we can expect certain +// arithmetic throughput. E.g.: we expect all primitive types of the same size to show roughly similar performance. +// 2. Decoupled lookback uses this trait to determine whether there is a machine word twice the size of T which can be +// loaded/stored with a single instruction. +// TODO(bgruber): for 2. we should probably just check whether sizeof(T) * 2 <= sizeof(int128) (or 256-bit on SM100) +// Users must be able to hook into both scenarios with their custom types, so this trait must depend on cub::Traits HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH template -struct is_primitive : ::std::bool_constant::PRIMITIVE> +struct is_primitive : is_primitive_impl::is_primitive {}; template diff --git a/projects/hipcub/test/hipcub/bfloat16.hpp b/projects/hipcub/test/hipcub/bfloat16.hpp index 06e034d6b9d..0e4976ed1ad 100644 --- a/projects/hipcub/test/hipcub/bfloat16.hpp +++ b/projects/hipcub/test/hipcub/bfloat16.hpp @@ -39,7 +39,9 @@ #include #if defined(__HIP_PLATFORM_NVIDIA__) -#include + #include +#else + #include #endif #ifdef __GNUC__ @@ -263,35 +265,22 @@ inline std::ostream& operator<<(std::ostream &out, const bfloat16_t &x) #if defined(__HIP_PLATFORM_NVIDIA__) - /// Insert formatted \p __nv_bfloat16 into the output stream - inline std::ostream& operator<<(std::ostream &out, const __nv_bfloat16 &x) - { - return out << bfloat16_t(x); - } +/// Insert formatted \p __nv_bfloat16 into the output stream +inline 
std::ostream& operator<<(std::ostream& out, const __nv_bfloat16& x) +{ + return out << bfloat16_t(x); +} #endif - - - /****************************************************************************** * Traits overloads ******************************************************************************/ -#if defined(__HIP_PLATFORM_NVIDIA__) -_CCCL_SUPPRESS_DEPRECATED_PUSH -#else -HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH -#endif template<> struct hipcub::NumericTraits - : hipcub::BaseTraits + : hipcub::BaseTraits {}; -#if defined(__HIP_PLATFORM_NVIDIA__) -_CCCL_SUPPRESS_DEPRECATED_POP -#else -HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP -#endif #ifdef __GNUC__ #pragma GCC diagnostic pop diff --git a/projects/hipcub/test/hipcub/half.hpp b/projects/hipcub/test/hipcub/half.hpp index b0501dddc19..7c1bdd1d653 100644 --- a/projects/hipcub/test/hipcub/half.hpp +++ b/projects/hipcub/test/hipcub/half.hpp @@ -39,6 +39,8 @@ #if defined(__HIP_PLATFORM_NVIDIA__) #include +#else + #include #endif #include @@ -331,20 +333,10 @@ inline std::ostream& operator<<(std::ostream &out, const half_t &x) * Traits overloads ******************************************************************************/ -#if defined(__HIP_PLATFORM_NVIDIA__) -_CCCL_SUPPRESS_DEPRECATED_PUSH -#else -HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH -#endif template<> struct hipcub::NumericTraits - : hipcub::BaseTraits + : hipcub::BaseTraits {}; -#if defined(__HIP_PLATFORM_NVIDIA__) -_CCCL_SUPPRESS_DEPRECATED_POP -#else -HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP -#endif #ifdef __GNUC__ #pragma GCC diagnostic pop diff --git a/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp b/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp index 297fb313727..03f80829f13 100644 --- a/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp +++ b/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp @@ -48,8 +48,10 @@ template::value, int> = 0> Key to_bits(const Key key) { - static constexpr Key radix_mask_upper - = EndBit == 8 * 
sizeof(Key) ? ~Key(0) : static_cast((Key(1) << EndBit) - 1); + using Bits = typename hipcub::Traits::UnsignedBits; + static constexpr Key radix_mask_upper = EndBit == 8 * sizeof(Key) + ? static_cast(~Bits(0)) + : static_cast((Bits(1) << EndBit) - 1); static constexpr Key radix_mask_bottom = static_cast((Key(1) << StartBit) - 1); static constexpr Key radix_mask = radix_mask_upper ^ radix_mask_bottom; From 9ad012395c14a0bbd06b0ba20a502a857ae7566b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 3 Dec 2025 10:05:57 +0000 Subject: [PATCH 34/95] Drop deprecated CUB macros --- projects/hipcub/CHANGELOG.md | 2 +- .../include/hipcub/backend/cub/util_macro.hpp | 28 --------------- .../hipcub/backend/rocprim/util_macro.hpp | 35 ------------------- 3 files changed, 1 insertion(+), 64 deletions(-) diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index bfcca7e37a8..8c3269a99a0 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -23,7 +23,7 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project * Removed `ConstantInputIterator`, `CountingInputIterator`, `DiscardOutputIterator` and `TransformInputIterator` which were deprecated in hipCUB-4.1.0. * Removed `DeviceSpmv`, which was removed from CUB after CCCL's 2.8.0 release. Use `hipSPARSE` or `rocSPARSE` libraries instead. * Removed `GridBarrier`. -* Removed `HIPCUB_MIN` and `HIPCUB_MAX`. +* Removed `HIPCUB_MIN`, `HIPCUB_MAX`, `HIPCUB_QUOTIENT_FLOOR`, `HIPCUB_QUOTIENT_CEILING`, `HIPCUB_ROUND_UP_NEAREST` and `HIPCUB_ROUND_DOWN_NEAREST` which were deprecated in hipCUB-4.1.0. * Removed `LEGACY_PTX_ARCH`. * Removed `hipcub:max` and `hipcub:min`, which were deprecated. Use `hip::std::max` and `hip::std::min` instead. * Deprecated `hipcub::Swap`, use `rocprim::swap` instead. 
diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_macro.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_macro.hpp index e974d6946f4..ef7f734121c 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_macro.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_macro.hpp @@ -34,32 +34,4 @@ #include // IWYU pragma: export -BEGIN_HIPCUB_NAMESPACE - -/// Deprecated since rocm [7.1] -#ifndef HIPCUB_QUOTIENT_FLOOR - /// Quotient of x/y rounded down to nearest integer - #define HIPCUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) -#endif - -/// Deprecated since rocm [7.1] -#ifndef HIPCUB_QUOTIENT_CEILING - /// Quotient of x/y rounded up to nearest integer - #define HIPCUB_QUOTIENT_CEILING(x, y) (((x) + (y)-1) / (y)) -#endif - -/// Deprecated since rocm [7.1] -#ifndef HIPCUB_ROUND_UP_NEAREST - /// x rounded up to the nearest multiple of y - #define HIPCUB_ROUND_UP_NEAREST(x, y) (HIPCUB_QUOTIENT_CEILING(x, y) * y) -#endif - -/// Deprecated since rocm [7.1] -#ifndef HIPCUB_ROUND_DOWN_NEAREST - /// x rounded down to the nearest multiple of y - #define HIPCUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) -#endif - -END_HIPCUB_NAMESPACE - #endif // HIPCUB_CUB_MACRO_HPP_ diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp index 4af5dd21e7c..de5f4c919c2 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp @@ -34,39 +34,4 @@ #include -BEGIN_HIPCUB_NAMESPACE - -/** - * \addtogroup UtilModule - * @{ - */ - -/// Deprecated since rocm [7.1] -#ifndef HIPCUB_QUOTIENT_FLOOR - /// Quotient of x/y rounded down to nearest integer - #define HIPCUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) -#endif - -/// Deprecated since rocm [7.1] -#ifndef HIPCUB_QUOTIENT_CEILING - /// Quotient of x/y rounded up to nearest integer - #define HIPCUB_QUOTIENT_CEILING(x, y) (((x) + 
(y)-1) / (y)) -#endif - -/// Deprecated since rocm [7.1] -#ifndef HIPCUB_ROUND_UP_NEAREST - /// x rounded up to the nearest multiple of y - #define HIPCUB_ROUND_UP_NEAREST(x, y) (HIPCUB_QUOTIENT_CEILING(x, y) * y) -#endif - -/// Deprecated since rocm [7.1] -#ifndef HIPCUB_ROUND_DOWN_NEAREST - /// x rounded down to the nearest multiple of y - #define HIPCUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) -#endif - -/** @} */ // end group UtilModule - -END_HIPCUB_NAMESPACE - #endif // HIPCUB_ROCPRIM_MACRO_HPP_ From c0b017b9e6254b2d28d5b630d24b4287a9db32c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 3 Dec 2025 10:19:22 +0000 Subject: [PATCH 35/95] Update mdspan support --- .../include/hipcub/backend/cub/device/device_for.hpp | 8 ++------ projects/hipcub/test/hipcub/test_hipcub_device_for.cpp | 4 ++-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_for.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_for.hpp index 0f22c405179..cbc159f97b2 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_for.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_for.hpp @@ -33,13 +33,11 @@ #include // IWYU pragma: export -#if __cccl_lib_mdspan - #include +#include BEGIN_HIPCUB_NAMESPACE template using extents = ::cuda::std::extents; END_HIPCUB_NAMESPACE -#endif // __cccl_lib_mdspan BEGIN_HIPCUB_NAMESPACE @@ -164,8 +162,7 @@ HIPCUB_RUNTIME_FUNCTION cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, shape, op, stream)); } -// ForEachInExtents only enables when the cccl mdspan extension is enabled -#ifdef __cccl_lib_mdspan + // ForEachInExtents only enables when the cccl mdspan extension is enabled template HIPCUB_RUNTIME_FUNCTION static hipError_t ForEachInExtents(void* d_temp_storage, @@ -189,7 +186,6 @@ HIPCUB_RUNTIME_FUNCTION { return hipCUDAErrorTohipError(cub::DeviceFor::ForEachInExtents(extents, op, 
stream)); } -#endif // __cccl_lib_mdspan }; END_HIPCUB_NAMESPACE diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp index d991a0dbdb9..408a88dd751 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp @@ -853,7 +853,7 @@ TEST(HipcubDeviceForTests, ForEachCopyNTempStore) } // ForEachInExtents only enables when the cccl mdspan extension is enabled -#if(defined(__HIP_PLATFORM_NVIDIA__) && defined(__cccl_lib_mdspan)) || defined(__HIP_PLATFORM_AMD__) +#if defined(__HIP_PLATFORM_NVIDIA__) || defined(__HIP_PLATFORM_AMD__) template struct HipcubTestParamsMerge @@ -1064,7 +1064,7 @@ TYPED_TEST(HipcubDeviceForEachInExtentsTests, ForEachInExtentsStatic) HIP_CHECK(hipFree(d_input)); } -#endif // (defined(__HIP_PLATFORM_NVIDIA__) && defined(__cccl_lib_mdspan)) || defined(__HIP_PLATFORM_AMD__) +#endif // defined(__HIP_PLATFORM_NVIDIA__) || defined(__HIP_PLATFORM_AMD__) template class HipcubDeviceForBulkTests : public HipcubDeviceForTests From cb35f139c6203fa8ba43fe73ccc5e81a02b8771d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 3 Dec 2025 11:00:29 +0000 Subject: [PATCH 36/95] Replace util_arch.cuh macros with inline constexpr variables --- projects/hipcub/benchmark/benchmark_utils.hpp | 2 +- projects/hipcub/hipcub/include/hipcub/config.hpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/hipcub/benchmark/benchmark_utils.hpp b/projects/hipcub/benchmark/benchmark_utils.hpp index 9abcc6d93ea..b7a1cd0e941 100644 --- a/projects/hipcub/benchmark/benchmark_utils.hpp +++ b/projects/hipcub/benchmark/benchmark_utils.hpp @@ -40,7 +40,7 @@ #ifndef HIPCUB_CUB_API #define HIPCUB_WARP_THREADS_MACRO warpSize #else - #define HIPCUB_WARP_THREADS_MACRO CUB_PTX_WARP_THREADS + #define HIPCUB_WARP_THREADS_MACRO warp_threads #endif #include _HIPCUB_STD_INCLUDE(limits) diff --git 
a/projects/hipcub/hipcub/include/hipcub/config.hpp b/projects/hipcub/hipcub/include/hipcub/config.hpp index 97c8fe0cdee..1c0327d6bd4 100644 --- a/projects/hipcub/hipcub/include/hipcub/config.hpp +++ b/projects/hipcub/hipcub/include/hipcub/config.hpp @@ -120,9 +120,9 @@ END_HIPCUB_NAMESPACE #define HIPCUB_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION #include - #define HIPCUB_WARP_THREADS CUB_PTX_WARP_THREADS - #define HIPCUB_DEVICE_WARP_THREADS CUB_PTX_WARP_THREADS - #define HIPCUB_HOST_WARP_THREADS CUB_PTX_WARP_THREADS + #define HIPCUB_WARP_THREADS warp_threads + #define HIPCUB_DEVICE_WARP_THREADS warp_threads + #define HIPCUB_HOST_WARP_THREADS warp_threads #define HIPCUB_ARCH CUB_PTX_ARCH BEGIN_HIPCUB_NAMESPACE using namespace cub; From 6cc205071b40364423c4a7471e10d53c56732855 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 3 Dec 2025 12:09:21 +0000 Subject: [PATCH 37/95] Use `_CCCL_PRAGMA_UNROLL_FULL()` and `_CCCL_PRAGMA_NOUNROLL()` --- projects/hipcub/CHANGELOG.md | 1 + .../benchmark_block_adjacent_difference.cpp | 8 ++--- .../benchmark_block_discontinuity.cpp | 6 ++-- .../benchmark/benchmark_block_exchange.cpp | 12 ++++---- .../benchmark/benchmark_block_histogram.cpp | 4 +-- .../benchmark/benchmark_block_merge_sort.cpp | 4 +-- .../benchmark/benchmark_block_radix_rank.cpp | 4 +-- .../benchmark/benchmark_block_radix_sort.cpp | 4 +-- .../benchmark/benchmark_block_reduce.cpp | 2 +- .../benchmark_block_run_length_decode.cpp | 2 +- .../hipcub/benchmark/benchmark_block_scan.cpp | 4 +-- .../benchmark/benchmark_block_shuffle.cpp | 8 ++--- .../benchmark/benchmark_device_memory.cpp | 10 +++---- .../benchmark/benchmark_warp_exchange.cpp | 8 ++--- .../hipcub/benchmark/benchmark_warp_load.cpp | 2 +- .../benchmark/benchmark_warp_reduce.cpp | 4 +-- .../hipcub/benchmark/benchmark_warp_scan.cpp | 6 ++-- .../hipcub/benchmark/benchmark_warp_store.cpp | 2 +- projects/hipcub/cmake/SetupNVCC.cmake | 2 +- .../rocprim/block/block_merge_sort.hpp | 12 ++++---- 
.../rocprim/block/block_run_length_decode.hpp | 8 ++--- .../backend/rocprim/thread/thread_reduce.hpp | 12 ++++---- .../backend/rocprim/thread/thread_scan.hpp | 4 +-- .../backend/rocprim/thread/thread_sort.hpp | 30 +++++++++---------- .../hipcub/hipcub/include/hipcub/config.hpp | 9 ++++++ .../hipcub/test_hipcub_block_histogram.cpp | 2 +- .../test_hipcub_block_load_store.kernels.hpp | 2 +- .../hipcub/test_hipcub_block_radix_rank.cpp | 4 +-- ...test_hipcub_single_pass_scan_operators.cpp | 2 +- 29 files changed, 94 insertions(+), 84 deletions(-) diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index 8c3269a99a0..c721c71c859 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -34,6 +34,7 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project * Updated the documentation on how to run hipCUB tests on multiple GPUs in parallel. * Changed `CCCL_MINIMUM_VERSION` to `3.0.0` to align with CUB. * Add support for large num_items `DeviceMerge` and `DeviceSegmentedSort`. +* Replace `#pragma unroll` by `_CCCL_PRAGMA_UNROLL_FULL()` and `_CCCL_PRAGMA_NOUNROLL()` by `_CCCL_PRAGMA_NOUNROLL()`. 
### Removed diff --git a/projects/hipcub/benchmark/benchmark_block_adjacent_difference.cpp b/projects/hipcub/benchmark/benchmark_block_adjacent_difference.cpp index 3679076910a..079977409fe 100644 --- a/projects/hipcub/benchmark/benchmark_block_adjacent_difference.cpp +++ b/projects/hipcub/benchmark/benchmark_block_adjacent_difference.cpp @@ -63,7 +63,7 @@ struct subtract_left hipcub::BlockAdjacentDifference adjacent_difference; -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; @@ -106,7 +106,7 @@ struct subtract_left_partial_tile // Try to evenly distribute the length of tile_sizes between all the trials const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; @@ -150,7 +150,7 @@ struct subtract_right hipcub::BlockAdjacentDifference adjacent_difference; -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; @@ -193,7 +193,7 @@ struct subtract_right_partial_tile // Try to evenly distribute the length of tile_sizes between all the trials const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; diff --git a/projects/hipcub/benchmark/benchmark_block_discontinuity.cpp b/projects/hipcub/benchmark/benchmark_block_discontinuity.cpp index 5e36160c140..f205430d2dc 100644 --- a/projects/hipcub/benchmark/benchmark_block_discontinuity.cpp +++ b/projects/hipcub/benchmark/benchmark_block_discontinuity.cpp @@ -68,7 +68,7 @@ struct flag_heads T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockDiscontinuity 
bdiscontinuity; @@ -106,7 +106,7 @@ struct flag_tails T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockDiscontinuity bdiscontinuity; @@ -144,7 +144,7 @@ struct flag_heads_and_tails T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockDiscontinuity bdiscontinuity; diff --git a/projects/hipcub/benchmark/benchmark_block_exchange.cpp b/projects/hipcub/benchmark/benchmark_block_exchange.cpp index 000cd41be6a..2b3e96784c8 100644 --- a/projects/hipcub/benchmark/benchmark_block_exchange.cpp +++ b/projects/hipcub/benchmark/benchmark_block_exchange.cpp @@ -54,7 +54,7 @@ struct blocked_to_striped T input[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, d_input + block_offset, input); -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; @@ -78,7 +78,7 @@ struct striped_to_blocked T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; @@ -102,7 +102,7 @@ struct blocked_to_warp_striped T input[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, d_input + block_offset, input); -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; @@ -126,7 +126,7 @@ struct warp_striped_to_blocked T input[ItemsPerThread]; hipcub::LoadDirectWarpStriped(lid, d_input + block_offset, input); -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; @@ -152,7 +152,7 @@ struct scatter_to_blocked hipcub::LoadDirectStriped(lid, 
d_input + block_offset, input); hipcub::LoadDirectStriped(lid, d_ranks + block_offset, ranks); -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; @@ -178,7 +178,7 @@ struct scatter_to_striped hipcub::LoadDirectStriped(lid, d_input + block_offset, input); hipcub::LoadDirectStriped(lid, d_ranks + block_offset, ranks); -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; diff --git a/projects/hipcub/benchmark/benchmark_block_histogram.cpp b/projects/hipcub/benchmark/benchmark_block_histogram.cpp index 1206e042353..c825e75c2bd 100644 --- a/projects/hipcub/benchmark/benchmark_block_histogram.cpp +++ b/projects/hipcub/benchmark/benchmark_block_histogram.cpp @@ -64,13 +64,13 @@ struct histogram __shared__ T histogram[BinSize]; __shared__ typename bhistogram_t::TempStorage storage; -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { bhistogram_t(storage).Histogram(values, histogram); } -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(unsigned int offset = 0; offset < BinSize; offset += BlockSize) { if(offset + hipThreadIdx_x < BinSize) diff --git a/projects/hipcub/benchmark/benchmark_block_merge_sort.cpp b/projects/hipcub/benchmark/benchmark_block_merge_sort.cpp index c8c7402b136..f1ac69b768b 100644 --- a/projects/hipcub/benchmark/benchmark_block_merge_sort.cpp +++ b/projects/hipcub/benchmark/benchmark_block_merge_sort.cpp @@ -52,7 +52,7 @@ void sort_keys_kernel(const T* input, T* output, CompareOp compare_op) T keys[ItemsPerThread]; hipcub::LoadDirectStriped(lid, input + block_offset, keys); -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockMergeSort sort; @@ -82,7 +82,7 @@ void sort_pairs_kernel(const T* input, T* output, CompareOp compare_op) values[i] = keys[i] + T(1); } -#pragma nounroll + 
_CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockMergeSort sort; diff --git a/projects/hipcub/benchmark/benchmark_block_radix_rank.cpp b/projects/hipcub/benchmark/benchmark_block_radix_rank.cpp index ffcd1d77505..cba7f813e27 100644 --- a/projects/hipcub/benchmark/benchmark_block_radix_rank.cpp +++ b/projects/hipcub/benchmark/benchmark_block_radix_rank.cpp @@ -70,7 +70,7 @@ __global__ __launch_bounds__(BlockSize) void rank_kernel(const T* keys_input, in Descending, BenchmarkKind == RadixRankAlgorithm::RADIX_RANK_MEMOIZE>>; -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(unsigned int key = 0; key < ItemsPerThread; key++) { unsigned_keys[key] = KeyTraits::TwiddleIn(unsigned_keys[key]); @@ -78,7 +78,7 @@ __global__ __launch_bounds__(BlockSize) void rank_kernel(const T* keys_input, in int ranks[ItemsPerThread]; -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { __shared__ typename RankType::TempStorage storage; diff --git a/projects/hipcub/benchmark/benchmark_block_radix_sort.cpp b/projects/hipcub/benchmark/benchmark_block_radix_sort.cpp index 4b75c26910a..e1fc336c933 100644 --- a/projects/hipcub/benchmark/benchmark_block_radix_sort.cpp +++ b/projects/hipcub/benchmark/benchmark_block_radix_sort.cpp @@ -134,7 +134,7 @@ __global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T* input, T* T keys[ItemsPerThread]; Helper::template load(lid, input + block_offset, keys); -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { Helper::template sort(keys); @@ -162,7 +162,7 @@ __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T* input, T values[i] = keys[i] + T(1); } -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { Helper::template sort(keys, values); diff --git a/projects/hipcub/benchmark/benchmark_block_reduce.cpp 
b/projects/hipcub/benchmark/benchmark_block_reduce.cpp index fe4b815d50c..82c11e08e9b 100644 --- a/projects/hipcub/benchmark/benchmark_block_reduce.cpp +++ b/projects/hipcub/benchmark/benchmark_block_reduce.cpp @@ -58,7 +58,7 @@ struct reduce using breduce_t = hipcub::BlockReduce; __shared__ typename breduce_t::TempStorage storage; -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { reduced_value = breduce_t(storage).Reduce(values, hipcub::Sum()); diff --git a/projects/hipcub/benchmark/benchmark_block_run_length_decode.cpp b/projects/hipcub/benchmark/benchmark_block_run_length_decode.cpp index a42d3c48065..9f061370c62 100644 --- a/projects/hipcub/benchmark/benchmark_block_run_length_decode.cpp +++ b/projects/hipcub/benchmark/benchmark_block_run_length_decode.cpp @@ -58,7 +58,7 @@ __global__ = d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread] - d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread]; -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned i = 0; i < Trials; ++i) { OffsetT decoded_window_offset = 0; diff --git a/projects/hipcub/benchmark/benchmark_block_scan.cpp b/projects/hipcub/benchmark/benchmark_block_scan.cpp index 51bf6c63fac..b4e79f44b5c 100644 --- a/projects/hipcub/benchmark/benchmark_block_scan.cpp +++ b/projects/hipcub/benchmark/benchmark_block_scan.cpp @@ -57,7 +57,7 @@ struct inclusive_scan using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage storage; -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t(storage).InclusiveScan(values, values, hipcub::Sum()); @@ -87,7 +87,7 @@ struct exclusive_scan using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage storage; -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t(storage).ExclusiveScan(values, values, init, hipcub::Sum()); diff --git 
a/projects/hipcub/benchmark/benchmark_block_shuffle.cpp b/projects/hipcub/benchmark/benchmark_block_shuffle.cpp index 697d381c24d..7e9eeed2059 100644 --- a/projects/hipcub/benchmark/benchmark_block_shuffle.cpp +++ b/projects/hipcub/benchmark/benchmark_block_shuffle.cpp @@ -53,7 +53,7 @@ struct offset using bshuffle_t = hipcub::BlockShuffle; __shared__ typename bshuffle_t::TempStorage storage; -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { bshuffle_t(storage).Offset(value, value, 1); @@ -84,7 +84,7 @@ struct rotate using bshuffle_t = hipcub::BlockShuffle; __shared__ typename bshuffle_t::TempStorage storage; -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { bshuffle_t(storage).Rotate(value, value, 1); @@ -116,7 +116,7 @@ struct up using bshuffle_t = hipcub::BlockShuffle; __shared__ typename bshuffle_t::TempStorage storage; -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { bshuffle_t(storage).Up(values, values); @@ -151,7 +151,7 @@ struct down using bshuffle_t = hipcub::BlockShuffle; __shared__ typename bshuffle_t::TempStorage storage; -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { bshuffle_t(storage).Down(values, values); diff --git a/projects/hipcub/benchmark/benchmark_device_memory.cpp b/projects/hipcub/benchmark/benchmark_device_memory.cpp index 1e62167a7a1..4b71b4de749 100644 --- a/projects/hipcub/benchmark/benchmark_device_memory.cpp +++ b/projects/hipcub/benchmark/benchmark_device_memory.cpp @@ -81,12 +81,12 @@ struct operation (void)storage; (void)global_mem_output; -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] = input[i] + 666; constexpr unsigned int repeats = 30; -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(unsigned int j = 0; j < repeats; j++) { input[i] = input[i] * (input[j % ItemsPerThread]); 
@@ -134,7 +134,7 @@ struct operation const unsigned int index = threadIdx.x * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread; -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); @@ -159,7 +159,7 @@ struct operation const unsigned int index = (threadIdx.x % warpSize) * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread; -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); @@ -183,7 +183,7 @@ struct operation (void)input; const unsigned int index = threadIdx.x * ItemsPerThread; -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); diff --git a/projects/hipcub/benchmark/benchmark_warp_exchange.cpp b/projects/hipcub/benchmark/benchmark_warp_exchange.cpp index 0c41be0588a..3b40b17884b 100644 --- a/projects/hipcub/benchmark/benchmark_warp_exchange.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_exchange.cpp @@ -41,7 +41,7 @@ __device__ auto warp_exchange_benchmark(T* d_output) -> std::enable_if_t> { T thread_data[ItemsPerThread]; -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(unsigned i = 0; i < ItemsPerThread; ++i) { thread_data[i] = static_cast(i); @@ -59,7 +59,7 @@ __device__ auto warp_exchange_benchmark(T* d_output) WarpExchangeT warp_exchange(temp_storage[warp_id]); Op{}(warp_exchange, thread_data); -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(unsigned i = 0; i < ItemsPerThread; ++i) { const unsigned global_idx = (BlockSize * blockIdx.x + threadIdx.x) * ItemsPerThread + i; @@ -99,7 +99,7 @@ __device__ auto warp_exchange_scatter_to_striped_benchmark(T* d_output) const unsigned warp_id = threadIdx.x / LogicalWarpSize; T thread_data[ItemsPerThread]; OffsetT thread_ranks[ItemsPerThread]; -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(unsigned i = 0; i < 
ItemsPerThread; ++i) { thread_data[i] = static_cast(i); @@ -112,7 +112,7 @@ __device__ auto warp_exchange_scatter_to_striped_benchmark(T* d_output) WarpExchangeT(temp_storage[warp_id]).ScatterToStriped(thread_data, thread_ranks); -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(unsigned i = 0; i < ItemsPerThread; ++i) { const unsigned striped_global_idx diff --git a/projects/hipcub/benchmark/benchmark_warp_load.cpp b/projects/hipcub/benchmark/benchmark_warp_load.cpp index 2c74609dfc9..4b05bab8ec5 100644 --- a/projects/hipcub/benchmark/benchmark_warp_load.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_load.cpp @@ -50,7 +50,7 @@ __device__ auto warp_load_benchmark(T* d_input, T* d_output) WarpLoadT(temp_storage[warp_id]).Load(d_input + global_warp_id * tile_size, thread_data); -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(unsigned i = 0; i < ItemsPerThread; ++i) { const unsigned striped_global_idx diff --git a/projects/hipcub/benchmark/benchmark_warp_reduce.cpp b/projects/hipcub/benchmark/benchmark_warp_reduce.cpp index 66e80917ab7..e18743243f7 100644 --- a/projects/hipcub/benchmark/benchmark_warp_reduce.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_reduce.cpp @@ -40,7 +40,7 @@ __device__ auto warp_reduce_benchmark(const T* d_input, T* d_output) using wreduce_t = hipcub::WarpReduce; __shared__ typename wreduce_t::TempStorage storage; auto reduce_op = hipcub::Sum(); -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { value = wreduce_t(storage).Reduce(value, reduce_op); @@ -72,7 +72,7 @@ __device__ auto segmented_warp_reduce_benchmark(const T* d_input, Flag* d_flags, using wreduce_t = hipcub::WarpReduce; __shared__ typename wreduce_t::TempStorage storage; -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { value = wreduce_t(storage).HeadSegmentedSum(value, flag); diff --git a/projects/hipcub/benchmark/benchmark_warp_scan.cpp 
b/projects/hipcub/benchmark/benchmark_warp_scan.cpp index 4d05b604be8..d1d9b778aa9 100644 --- a/projects/hipcub/benchmark/benchmark_warp_scan.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_scan.cpp @@ -57,7 +57,7 @@ struct inclusive_scan using wscan_t = hipcub::WarpScan; __shared__ typename wscan_t::TempStorage storage; auto scan_op = hipcub::Sum(); -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { wscan_t(storage).InclusiveScan(value, value, scan_op); @@ -86,7 +86,7 @@ struct exclusive_scan using wscan_t = hipcub::WarpScan; __shared__ typename wscan_t::TempStorage storage; auto scan_op = hipcub::Sum(); -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { wscan_t(storage).ExclusiveScan(value, value, init, scan_op); @@ -118,7 +118,7 @@ struct broadcast using wscan_t = hipcub::WarpScan; __shared__ typename wscan_t::TempStorage storage; -#pragma nounroll + _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { value = wscan_t(storage).Broadcast(value, src_lane); diff --git a/projects/hipcub/benchmark/benchmark_warp_store.cpp b/projects/hipcub/benchmark/benchmark_warp_store.cpp index 6632faf178b..8c63ec5489a 100644 --- a/projects/hipcub/benchmark/benchmark_warp_store.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_store.cpp @@ -40,7 +40,7 @@ __device__ auto warp_store_benchmark(T* d_output) -> std::enable_if_t> { T thread_data[ItemsPerThread]; -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(unsigned i = 0; i < ItemsPerThread; ++i) { thread_data[i] = static_cast(i); diff --git a/projects/hipcub/cmake/SetupNVCC.cmake b/projects/hipcub/cmake/SetupNVCC.cmake index 46b88d27833..88f520f20a7 100644 --- a/projects/hipcub/cmake/SetupNVCC.cmake +++ b/projects/hipcub/cmake/SetupNVCC.cmake @@ -123,6 +123,6 @@ if (NOT _HIPCUB_HIP_NVCC_FLAGS_SET) set(_HIPCUB_HIP_NVCC_FLAGS_SET ON CACHE INTERNAL "") endif() -# Ignore warnings about #pragma unroll +# Ignore 
warnings about _CCCL_PRAGMA_UNROLL_FULL() # and about deprecated CUDA function(s) used in hip/nvcc_detail/hip_runtime_api.h # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_CPP_CONFIG_FLAGS_STRIP} -Wno-unknown-pragmas -Wno-deprecated-declarations" CACHE STRING "compile flags" FORCE) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_merge_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_merge_sort.hpp index d306c834e3a..b9bcf5b53d2 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_merge_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_merge_sort.hpp @@ -95,7 +95,7 @@ HIPCUB_DEVICE __forceinline__ void SerialMerge(KeyT *keys_shared, KeyT key1 = keys_shared[keys1_beg]; KeyT key2 = keys_shared[keys2_beg]; -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for (int item = 0; item < ITEMS_PER_THREAD; ++item) { bool p = (keys2_beg < keys2_end) && @@ -387,7 +387,7 @@ class BlockMergeSortStrategy // KeyT max_key = oob_default; - #pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for (int item = WARP_SORT ? 
1 : 0; item < ITEMS_PER_THREAD; ++item) { if (ITEMS_PER_THREAD * static_cast(linear_tid) + item < valid_items) @@ -411,7 +411,7 @@ class BlockMergeSortStrategy // each thread has sorted keys // merge sort keys in shared memory // - #pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for (int target_merged_threads_number = 2; target_merged_threads_number <= NUM_THREADS; target_merged_threads_number *= 2) @@ -423,7 +423,7 @@ class BlockMergeSortStrategy // store keys in shmem // - #pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for (int item = 0; item < ITEMS_PER_THREAD; ++item) { int idx = ITEMS_PER_THREAD * linear_tid + item; @@ -482,7 +482,7 @@ class BlockMergeSortStrategy // store keys in shmem // - #pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for (int item = 0; item < ITEMS_PER_THREAD; ++item) { int idx = ITEMS_PER_THREAD * linear_tid + item; @@ -493,7 +493,7 @@ class BlockMergeSortStrategy // gather items from shmem // - #pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for (int item = 0; item < ITEMS_PER_THREAD; ++item) { items[item] = temp_storage.items_shared[indices[item]]; diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp index 0fb000ab097..a2ca04740bc 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp @@ -248,7 +248,7 @@ class BlockRunLengthDecode { OffsetT lower_bound = 0; OffsetT upper_bound = num_items; - #pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for (int i = 0; i <= Log2::VALUE; i++) { OffsetT mid = hipcub::MidPoint(lower_bound, upper_bound); @@ -273,7 +273,7 @@ class BlockRunLengthDecode { // Keep the runs' items and the offsets of each run's beginning in the temporary storage RunOffsetT thread_dst_offset = static_cast(linear_tid) * static_cast(RUNS_PER_THREAD); - #pragma unroll + 
_CCCL_PRAGMA_UNROLL_FULL() for (int i = 0; i < RUNS_PER_THREAD; i++) { temp_storage.runs.run_values[thread_dst_offset] = run_values[i]; @@ -292,7 +292,7 @@ class BlockRunLengthDecode { // Compute the offset for the beginning of each run DecodedOffsetT run_offsets[RUNS_PER_THREAD]; - #pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for (int i = 0; i < RUNS_PER_THREAD; i++) { run_offsets[i] = static_cast(run_lengths[i]); @@ -348,7 +348,7 @@ class BlockRunLengthDecode ItemT val = temp_storage.runs.run_values[assigned_run]; - #pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for (DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; i++) { decoded_items[i] = val; diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_reduce.hpp index 7547974fc00..dadb0e8dd38 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_reduce.hpp @@ -45,7 +45,7 @@ AccumType { AccumType retval = static_cast(prefix); constexpr int length = ::hipcub::detail::static_size_v(); -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(int i = 0; i < length; ++i) { retval = reduction_op(retval, input[i]); @@ -60,7 +60,7 @@ AccumType ThreadReduceSequential(const InputType& input, ReductionOp reduction_o { AccumType retval = input[0]; constexpr int length = ::hipcub::detail::static_size_v(); -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(int i = 1; i < length; ++i) { retval = reduction_op(retval, input[i]); @@ -76,10 +76,10 @@ __device__ __forceinline__ AccumType ThreadReduceBinaryTree(const InputType& input, ReductionOp reduction_op) { constexpr auto length = ::hipcub::detail::static_size_v(); -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(int i = 1; i < length; i *= 2) { -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(int j = 0; j + i < length; j += i * 2) { input[j] = reduction_op(input[j], input[j + 
i]); @@ -95,10 +95,10 @@ __device__ __forceinline__ AccumType ThreadReduceTernaryTree(const InputType& input, ReductionOp reduction_op) { constexpr auto length = ::hipcub::detail::static_size_v(); -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(int i = 1; i < length; i *= 3) { -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(int j = 0; j + i < length; j += i * 3) { auto value = reduction_op(input[j], input[j + i]); diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_scan.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_scan.hpp index 4995d72382e..526743c8724 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_scan.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_scan.hpp @@ -61,7 +61,7 @@ HIPCUB_FORCEINLINE ScanOp scan_op, ///< [in] Binary scan operator detail::int_constant_t /*length*/) { -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(int i = 0; i < LENGTH; ++i) { inclusive = scan_op(exclusive, input[i]); @@ -144,7 +144,7 @@ HIPCUB_FORCEINLINE ScanOp scan_op, ///< [in] Binary scan operator detail::int_constant_t /*length*/) { -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(int i = 0; i < LENGTH; ++i) { inclusive = scan_op(inclusive, input[i]); diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp index eec88df700b..87b20e2c31c 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp @@ -84,28 +84,28 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], { constexpr bool KEYS_ONLY = ::rocprim::Equals::VALUE; - #pragma unroll - for (int i = 0; i < ITEMS_PER_THREAD; ++i) + _CCCL_PRAGMA_UNROLL_FULL() + for(int i = 0; i < ITEMS_PER_THREAD; ++i) { - #pragma unroll - for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) - { - if 
(compare_op(keys[j + 1], keys[j])) + _CCCL_PRAGMA_UNROLL_FULL() + for(int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) { + if(compare_op(keys[j + 1], keys[j])) + { #if defined(__HIP_PLATFORM_NVIDIA__) - using ::cuda::std::swap; + using ::cuda::std::swap; #else - using ::rocprim::swap; + using ::rocprim::swap; #endif - swap(keys[j], keys[j + 1]); - if(!KEYS_ONLY) - { - swap(items[j], items[j + 1]); + swap(keys[j], keys[j + 1]); + if(!KEYS_ONLY) + { + swap(items[j], items[j + 1]); + } } - } - } // inner loop - } // outer loop + } // inner loop + } // outer loop } diff --git a/projects/hipcub/hipcub/include/hipcub/config.hpp b/projects/hipcub/hipcub/include/hipcub/config.hpp index 1c0327d6bd4..610b145078c 100644 --- a/projects/hipcub/hipcub/include/hipcub/config.hpp +++ b/projects/hipcub/hipcub/include/hipcub/config.hpp @@ -237,4 +237,13 @@ END_HIPCUB_NAMESPACE #endif #endif // HIPCUB_ROCPRIM_API +// This API needs to be deprecated once libhipcxx is available. +#if !defined(_CCCL_PRAGMA_UNROLL_FULL) + #define _CCCL_PRAGMA_UNROLL_FULL() _Pragma("unroll") +#endif // !defined(_CCCL_PRAGMA_UNROLL_FULL) + +#if !defined(_CCCL_PRAGMA_NOUNROLL) + #define _CCCL_PRAGMA_NOUNROLL() _Pragma("nounroll") +#endif // !defined(_CCCL_PRAGMA_NOUNROLL) + #endif // HIPCUB_CONFIG_HPP_ diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_histogram.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_histogram.cpp index 0691aabde32..dcef6ab3857 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_histogram.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_histogram.cpp @@ -117,7 +117,7 @@ void histogram_kernel(T* device_output, T* device_output_bin) bhistogram_t(temp_storage).Histogram(in_out, hist); __syncthreads(); - #pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for (unsigned int offset = 0; offset < BinSize; offset += BlockSize) { if(offset + hipThreadIdx_x < BinSize) diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.kernels.hpp 
b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.kernels.hpp index 94cef4f8504..5e6192857a0 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.kernels.hpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.kernels.hpp @@ -191,7 +191,7 @@ __launch_bounds__(BlockSize) __global__ __syncthreads(); // reset data -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(unsigned int item = 0; item < ItemsPerThread; ++item) data[item] = OutputT(); diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp index 5840b0939f6..25823d5b1a4 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp @@ -341,7 +341,7 @@ void rank_kernel(const KeyType* keys_input, UnsignedBits(&unsigned_keys)[ItemsPerThread] = reinterpret_cast(keys); -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(unsigned int key = 0; key < ItemsPerThread; key++) { unsigned_keys[key] = KeyTraits::TwiddleIn(unsigned_keys[key]); @@ -571,7 +571,7 @@ void rank_with_prefix_sum_kernel(const KeyType* keys_input, UnsignedBits(&unsigned_keys)[ItemsPerThread] = reinterpret_cast(keys); -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(unsigned int key = 0; key < ItemsPerThread; key++) { unsigned_keys[key] = KeyTraits::TwiddleIn(unsigned_keys[key]); diff --git a/projects/hipcub/test/hipcub/test_hipcub_single_pass_scan_operators.cpp b/projects/hipcub/test/hipcub/test_hipcub_single_pass_scan_operators.cpp index f6f21f26c5c..f440e7f6f88 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_single_pass_scan_operators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_single_pass_scan_operators.cpp @@ -272,7 +272,7 @@ static void RunningPrefixKernel(T* d_input, T* d_output) prefix_type prefix(T(), ScanOp{}); -#pragma unroll + _CCCL_PRAGMA_UNROLL_FULL() for(int i = 0; i < num_items; ++i) { T value = d_input[i]; From 
a803a6835db6de5778963bedb23b92f74c42bb6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 3 Dec 2025 17:07:28 +0000 Subject: [PATCH 38/95] Allow rapids to avoid unrolling some loops in sort --- projects/hipcub/CHANGELOG.md | 1 + .../backend/rocprim/block/block_merge_sort.hpp | 5 +++-- .../hipcub/backend/rocprim/thread/thread_sort.hpp | 5 +++-- .../include/hipcub/backend/rocprim/util_macro.hpp | 13 ++++++++++++- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index c721c71c859..5111892ad6f 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -35,6 +35,7 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project * Changed `CCCL_MINIMUM_VERSION` to `3.0.0` to align with CUB. * Add support for large num_items `DeviceMerge` and `DeviceSegmentedSort`. * Replace `#pragma unroll` by `_CCCL_PRAGMA_UNROLL_FULL()` and `_CCCL_PRAGMA_NOUNROLL()` by `_CCCL_PRAGMA_NOUNROLL()`. +* Add `_CCCL_SORT_MAYBE_UNROLL()` in block merge sort and thread sort. 
### Removed diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_merge_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_merge_sort.hpp index b9bcf5b53d2..ae9b89c1aeb 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_merge_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_merge_sort.hpp @@ -32,6 +32,7 @@ #include "../../../config.hpp" #include "../thread/thread_sort.hpp" +#include "../util_macro.hpp" #include "../util_math.hpp" #include "../util_type.hpp" @@ -95,7 +96,7 @@ HIPCUB_DEVICE __forceinline__ void SerialMerge(KeyT *keys_shared, KeyT key1 = keys_shared[keys1_beg]; KeyT key2 = keys_shared[keys2_beg]; - _CCCL_PRAGMA_UNROLL_FULL() + _CCCL_SORT_MAYBE_UNROLL() for (int item = 0; item < ITEMS_PER_THREAD; ++item) { bool p = (keys2_beg < keys2_end) && @@ -387,7 +388,7 @@ class BlockMergeSortStrategy // KeyT max_key = oob_default; - _CCCL_PRAGMA_UNROLL_FULL() + _CCCL_SORT_MAYBE_UNROLL() for (int item = WARP_SORT ? 
1 : 0; item < ITEMS_PER_THREAD; ++item) { if (ITEMS_PER_THREAD * static_cast(linear_tid) + item < valid_items) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp index 87b20e2c31c..7d8aaf5253e 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp @@ -31,6 +31,7 @@ #include "../../../config.hpp" +#include "../util_macro.hpp" #include "../util_ptx.hpp" #include "../util_type.hpp" @@ -84,10 +85,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], { constexpr bool KEYS_ONLY = ::rocprim::Equals::VALUE; - _CCCL_PRAGMA_UNROLL_FULL() + _CCCL_SORT_MAYBE_UNROLL() for(int i = 0; i < ITEMS_PER_THREAD; ++i) { - _CCCL_PRAGMA_UNROLL_FULL() + _CCCL_SORT_MAYBE_UNROLL() for(int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) { if(compare_op(keys[j + 1], keys[j])) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp index de5f4c919c2..97b36a7654d 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -34,4 +34,15 @@ #include +BEGIN_HIPCUB_NAMESPACE + +// RAPIDS cuDF needs to avoid unrolling some loops in sort to prevent compile time issues +#if defined(CCCL_AVOID_SORT_UNROLL) + #define _CCCL_SORT_MAYBE_UNROLL() _CCCL_PRAGMA_NOUNROLL() +#else // ^^^ CCCL_AVOID_SORT_UNROLL ^^^ / vvv !CCCL_AVOID_SORT_UNROLL vvv + #define _CCCL_SORT_MAYBE_UNROLL() _CCCL_PRAGMA_UNROLL_FULL() +#endif // !CCCL_AVOID_SORT_UNROLL + +END_HIPCUB_NAMESPACE + #endif // HIPCUB_ROCPRIM_MACRO_HPP_ From 1ae1c0c240dac1019979d532f7a3f37f92a93a00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 3 Dec 2025 16:40:37 +0000 Subject: [PATCH 39/95] Adds support for large num items to DeviceMerge --- .../backend/rocprim/device/device_merge.hpp | 42 +++++++++---------- .../test/hipcub/test_hipcub_device_merge.cpp | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge.hpp index 014695e5ba0..66ae98b6ea1 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge.hpp @@ -43,15 +43,15 @@ struct DeviceMerge typename KeyIteratorOut, typename CompareOp = ::rocprim::less<>> HIPCUB_RUNTIME_FUNCTION - static hipError_t MergeKeys(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorIn1 keys_in1, - int num_keys1, - KeyIteratorIn2 keys_in2, - int num_keys2, - KeyIteratorOut keys_out, - CompareOp compare_op = {}, - hipStream_t stream = 0) + static hipError_t MergeKeys(void* d_temp_storage, + std::size_t& temp_storage_bytes, + KeyIteratorIn1 keys_in1, + _HIPCUB_STD::int64_t num_keys1, + KeyIteratorIn2 keys_in2, + _HIPCUB_STD::int64_t num_keys2, + KeyIteratorOut keys_out, + CompareOp compare_op = {}, + hipStream_t stream = 0) 
{ return ::rocprim::merge(d_temp_storage, @@ -74,18 +74,18 @@ struct DeviceMerge typename ValueIteratorOut, typename CompareOp = ::rocprim::less<>> HIPCUB_RUNTIME_FUNCTION - static hipError_t MergePairs(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorIn1 keys_in1, - ValueIteratorIn1 values_in1, - int num_keys1, - KeyIteratorIn2 keys_in2, - ValueIteratorIn2 values_in2, - int num_keys2, - KeyIteratorOut keys_out, - ValueIteratorOut values_out, - CompareOp compare_op = {}, - hipStream_t stream = 0) + static hipError_t MergePairs(void* d_temp_storage, + std::size_t& temp_storage_bytes, + KeyIteratorIn1 keys_in1, + ValueIteratorIn1 values_in1, + _HIPCUB_STD::int64_t num_keys1, + KeyIteratorIn2 keys_in2, + ValueIteratorIn2 values_in2, + _HIPCUB_STD::int64_t num_keys2, + KeyIteratorOut keys_out, + ValueIteratorOut values_out, + CompareOp compare_op = {}, + hipStream_t stream = 0) { return ::rocprim::merge(d_temp_storage, diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp index efdee71e2a5..8594d2ac5f6 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp @@ -472,7 +472,7 @@ TEST(HipcubDeviceMerge, MergeLargeSizeIterators) SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); HIP_CHECK(hipSetDevice(device_id)); - using key_type = int; + using key_type = _HIPCUB_STD::int64_t; using compare_function = test_utils::less; hipStream_t stream = 0; // default From 50500b7154871e4da886d1102a06564045ee0469 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 3 Dec 2025 16:40:49 +0000 Subject: [PATCH 40/95] Adds support for large number of buffers to `DeviceCpy::Batched` and `DeviceMemcpy::Batched` --- .../hipcub/backend/cub/device/device_copy.hpp | 14 +++++++------- .../hipcub/backend/cub/device/device_memcpy.hpp | 14 +++++++------- 
.../hipcub/backend/rocprim/device/device_copy.hpp | 14 +++++++------- .../backend/rocprim/device/device_memcpy.hpp | 14 +++++++------- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_copy.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_copy.hpp index a6315c51724..b7f9d56fec2 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_copy.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_copy.hpp @@ -40,13 +40,13 @@ BEGIN_HIPCUB_NAMESPACE struct DeviceCopy { template - static hipError_t Batched(void* d_temp_storage, - size_t& temp_storage_bytes, - InputBufferIt input_buffer_it, - OutputBufferIt output_buffer_it, - BufferSizeIteratorT buffer_sizes, - uint32_t num_buffers, - hipStream_t stream = 0) + static hipError_t Batched(void* d_temp_storage, + size_t& temp_storage_bytes, + InputBufferIt input_buffer_it, + OutputBufferIt output_buffer_it, + BufferSizeIteratorT buffer_sizes, + _HIPCUB_STD::int64_t num_buffers, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp index df43a8f0648..e5455be55e7 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp @@ -40,13 +40,13 @@ BEGIN_HIPCUB_NAMESPACE struct DeviceMemcpy { template - static hipError_t Batched(void* d_temp_storage, - size_t& temp_storage_bytes, - InputBufferIt input_buffer_it, - OutputBufferIt output_buffer_it, - BufferSizeIteratorT buffer_sizes, - uint32_t num_buffers, - hipStream_t stream = 0) + static hipError_t Batched(void* d_temp_storage, + size_t& temp_storage_bytes, + InputBufferIt input_buffer_it, + OutputBufferIt 
output_buffer_it, + BufferSizeIteratorT buffer_sizes, + _HIPCUB_STD::int64_t num_buffers, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_copy.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_copy.hpp index 8e41fa0fd33..7c4b30b6117 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_copy.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_copy.hpp @@ -40,13 +40,13 @@ BEGIN_HIPCUB_NAMESPACE struct DeviceCopy { template - static hipError_t Batched(void* d_temp_storage, - size_t& temp_storage_bytes, - InputBufferIt input_buffer_it, - OutputBufferIt output_buffer_it, - BufferSizeIteratorT buffer_sizes, - uint32_t num_buffers, - hipStream_t stream = 0) + static hipError_t Batched(void* d_temp_storage, + size_t& temp_storage_bytes, + InputBufferIt input_buffer_it, + OutputBufferIt output_buffer_it, + BufferSizeIteratorT buffer_sizes, + _HIPCUB_STD::int64_t num_buffers, + hipStream_t stream = 0) { return rocprim::batch_copy(d_temp_storage, temp_storage_bytes, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp index a5828b7ad4d..fc7b01cadcd 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp @@ -40,13 +40,13 @@ BEGIN_HIPCUB_NAMESPACE struct DeviceMemcpy { template - static hipError_t Batched(void* d_temp_storage, - size_t& temp_storage_bytes, - InputBufferIt input_buffer_it, - OutputBufferIt output_buffer_it, - BufferSizeIteratorT buffer_sizes, - uint32_t num_buffers, - hipStream_t stream = 0) + static hipError_t Batched(void* d_temp_storage, + size_t& temp_storage_bytes, + InputBufferIt 
input_buffer_it, + OutputBufferIt output_buffer_it, + BufferSizeIteratorT buffer_sizes, + _HIPCUB_STD::int64_t num_buffers, + hipStream_t stream = 0) { return rocprim::batch_memcpy(d_temp_storage, temp_storage_bytes, From af803c0a05c90d2526eeb1c607e8377776028c1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 3 Dec 2025 16:57:12 +0000 Subject: [PATCH 41/95] Fix hip std namespace --- .../benchmark_device_adjacent_difference.cpp | 16 +- projects/hipcub/benchmark/benchmark_utils.hpp | 2 +- projects/hipcub/examples/example_utils.hpp | 51 ++-- .../cub/device/device_adjacent_difference.hpp | 64 ++--- .../backend/cub/device/device_merge.hpp | 4 +- .../backend/cub/device/device_merge_sort.hpp | 123 +++++----- .../backend/cub/device/device_partition.hpp | 2 +- .../backend/cub/device/device_reduce.hpp | 4 +- .../hipcub/backend/cub/device/device_scan.hpp | 10 +- .../include/hipcub/backend/cub/util_type.hpp | 1 + .../rocprim/block/block_radix_sort.hpp | 2 +- .../device/device_adjacent_difference.hpp | 140 +++++------ .../backend/rocprim/device/device_merge.hpp | 4 +- .../rocprim/device/device_merge_sort.hpp | 231 +++++++++--------- .../rocprim/device/device_partition.hpp | 4 +- .../backend/rocprim/device/device_scan.hpp | 16 +- .../backend/rocprim/thread/thread_load.hpp | 2 +- .../backend/rocprim/thread/thread_store.hpp | 2 +- .../rocprim/util_temporary_storage.hpp | 4 +- .../hipcub/backend/rocprim/util_type.hpp | 10 +- projects/hipcub/test/hipcub/bfloat16.hpp | 10 +- projects/hipcub/test/hipcub/half.hpp | 12 +- ...test_hipcub_device_adjacent_difference.cpp | 6 +- .../test/hipcub/test_hipcub_device_for.cpp | 18 +- .../test/hipcub/test_hipcub_device_reduce.cpp | 2 +- .../hipcub/test_utils_data_generation.hpp | 13 +- 26 files changed, 383 insertions(+), 370 deletions(-) diff --git a/projects/hipcub/benchmark/benchmark_device_adjacent_difference.cpp b/projects/hipcub/benchmark/benchmark_device_adjacent_difference.cpp index 335144c0248..6bb0e0af290 
100644 --- a/projects/hipcub/benchmark/benchmark_device_adjacent_difference.cpp +++ b/projects/hipcub/benchmark/benchmark_device_adjacent_difference.cpp @@ -43,7 +43,7 @@ namespace { #ifndef DEFAULT_N -constexpr std::size_t DEFAULT_N = 1024 * 1024 * 128; +constexpr size_t DEFAULT_N = 1024 * 1024 * 128; #endif constexpr unsigned int batch_size = 10; @@ -53,7 +53,7 @@ template auto dispatch_adjacent_difference(std::true_type /*left*/, std::true_type /*copy*/, void* const temporary_storage, - std::size_t& storage_size, + size_t& storage_size, const InputIt input, const OutputIt output, Args&&... args) @@ -69,7 +69,7 @@ template auto dispatch_adjacent_difference(std::false_type /*left*/, std::true_type /*copy*/, void* const temporary_storage, - std::size_t& storage_size, + size_t& storage_size, const InputIt input, const OutputIt output, Args&&... args) @@ -85,7 +85,7 @@ template auto dispatch_adjacent_difference(std::true_type /*left*/, std::false_type /*copy*/, void* const temporary_storage, - std::size_t& storage_size, + size_t& storage_size, const InputIt input, const OutputIt /*output*/, Args&&... args) @@ -100,7 +100,7 @@ template auto dispatch_adjacent_difference(std::false_type /*left*/, std::false_type /*copy*/, void* const temporary_storage, - std::size_t& storage_size, + size_t& storage_size, const InputIt input, const OutputIt /*output*/, Args&&... 
args) @@ -112,7 +112,7 @@ auto dispatch_adjacent_difference(std::false_type /*left*/, } template -void run_benchmark(benchmark::State& state, const std::size_t size, const hipStream_t stream) +void run_benchmark(benchmark::State& state, const size_t size, const hipStream_t stream) { using output_type = T; @@ -134,7 +134,7 @@ void run_benchmark(benchmark::State& state, const std::size_t size, const hipStr static constexpr std::integral_constant copy_tag; // Allocate temporary storage - std::size_t temp_storage_size{}; + size_t temp_storage_size{}; void* d_temp_storage = nullptr; const auto launch = [&] @@ -237,7 +237,7 @@ int main(int argc, char* argv[]) // Add benchmarks const std::vector benchmarks = { CREATE_BENCHMARKS(int), - CREATE_BENCHMARKS(std::int64_t), + CREATE_BENCHMARKS(_HIPCUB_STD::int64_t), CREATE_BENCHMARKS(uint8_t), diff --git a/projects/hipcub/benchmark/benchmark_utils.hpp b/projects/hipcub/benchmark/benchmark_utils.hpp index b7a1cd0e941..2902dd08672 100644 --- a/projects/hipcub/benchmark/benchmark_utils.hpp +++ b/projects/hipcub/benchmark/benchmark_utils.hpp @@ -466,7 +466,7 @@ inline auto generate_random_data_n(OutputIterator it, return it + size; } -template +template struct alignas(Alignment) custom_aligned_type { unsigned char data[Size]; diff --git a/projects/hipcub/examples/example_utils.hpp b/projects/hipcub/examples/example_utils.hpp index 07067d88373..9c0cb8e06d5 100644 --- a/projects/hipcub/examples/example_utils.hpp +++ b/projects/hipcub/examples/example_utils.hpp @@ -76,8 +76,8 @@ struct CommandLineArgs std::vector args; hipDeviceProp_t deviceProp; float device_giga_bandwidth; - std::size_t device_free_physmem; - std::size_t device_total_physmem; + size_t device_free_physmem; + size_t device_total_physmem; /** * Constructor @@ -125,7 +125,7 @@ struct CommandLineArgs { using namespace std; - for (std::size_t i = 0; i < keys.size(); ++i) + for(size_t i = 0; i < keys.size(); ++i) { if (keys[i] == string(arg_name)) return true; @@ -147,8 
+147,8 @@ struct CommandLineArgs /** * Returns the commandline parameter for a given index (not including flags) */ - template - void GetCmdLineArgument(std::size_t index, T &val) + template + void GetCmdLineArgument(size_t index, T& val) { using namespace std; if (index < args.size()) { @@ -165,7 +165,7 @@ struct CommandLineArgs { using namespace std; - for (std::size_t i = 0; i < keys.size(); ++i) + for(size_t i = 0; i < keys.size(); ++i) { if (keys[i] == string(arg_name)) { @@ -190,7 +190,7 @@ struct CommandLineArgs vals.clear(); // Recover from multi-value string - for (std::size_t i = 0; i < keys.size(); ++i) + for(size_t i = 0; i < keys.size(); ++i) { if (keys[i] == string(arg_name)) { @@ -385,7 +385,6 @@ int CompareResults(double* computed, double* reference, OffsetT len, bool verbos return 0; } - // /** // * Verify the contents of a device array match those // * of a host array @@ -393,7 +392,7 @@ int CompareResults(double* computed, double* reference, OffsetT len, bool verbos // int CompareDeviceResults( // hipcub::NullType */* h_reference */, // hipcub::NullType */* d_data */, -// std::size_t /* num_items */, +// size_t /* num_items */, // bool /* verbose */ = true, // bool /* display_data */ = false) // { @@ -408,7 +407,7 @@ int CompareResults(double* computed, double* reference, OffsetT len, bool verbos // int CompareDeviceResults( // S *h_reference, // rocprim::discard_iterator d_data, -// std::size_t num_items, +// size_t num_items, // bool verbose = true, // bool display_data = false) // { @@ -419,13 +418,9 @@ int CompareResults(double* computed, double* reference, OffsetT len, bool verbos * Verify the contents of a device array match those * of a host array */ -template +template int CompareDeviceResults( - S *h_reference, - T *d_data, - std::size_t num_items, - bool verbose = true, - bool display_data = false) + S* h_reference, T* d_data, size_t num_items, bool verbose = true, bool display_data = false) { // Allocate array on host T *h_data = (T*) 
malloc(num_items * sizeof(T)); @@ -437,12 +432,12 @@ int CompareDeviceResults( if (display_data) { printf("Reference:\n"); - for (std::size_t i = 0; i < num_items; i++) + for(size_t i = 0; i < num_items; i++) { std::cout << CoutCast(h_reference[i]) << ", "; } printf("\n\nComputed:\n"); - for (std::size_t i = 0; i < num_items; i++) + for(size_t i = 0; i < num_items; i++) { std::cout << CoutCast(h_data[i]) << ", "; } @@ -463,13 +458,9 @@ int CompareDeviceResults( * Verify the contents of a device array match those * of a device array */ -template +template int CompareDeviceDeviceResults( - T *d_reference, - T *d_data, - std::size_t num_items, - bool verbose = true, - bool display_data = false) + T* d_reference, T* d_data, size_t num_items, bool verbose = true, bool display_data = false) { // Allocate array on host T *h_reference = (T*) malloc(num_items * sizeof(T)); @@ -482,12 +473,12 @@ int CompareDeviceDeviceResults( // Display data if (display_data) { printf("Reference:\n"); - for (std::size_t i = 0; i < num_items; i++) + for(size_t i = 0; i < num_items; i++) { std::cout << CoutCast(h_reference[i]) << ", "; } printf("\n\nComputed:\n"); - for (std::size_t i = 0; i < num_items; i++) + for(size_t i = 0; i < num_items; i++) { std::cout << CoutCast(h_data[i]) << ", "; } @@ -507,13 +498,11 @@ int CompareDeviceDeviceResults( /** * Print the contents of a host array */ -template -void DisplayResults( - InputIteratorT h_data, - std::size_t num_items) +template +void DisplayResults(InputIteratorT h_data, size_t num_items) { // Display data - for (std::size_t i = 0; i < num_items; i++) + for(size_t i = 0; i < num_items; i++) { std::cout << CoutCast(h_data[i]) << ", "; } diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp index 7c8f96dd569..6c864f406df 100644 --- 
a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp @@ -41,14 +41,15 @@ struct DeviceAdjacentDifference template - static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractLeftCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - InputIteratorT d_input, - OutputIteratorT d_output, - NumItemsT num_items, - DifferenceOpT difference_op = {}, - hipStream_t stream = 0) + typename NumItemsT = uint32_t> + static HIPCUB_RUNTIME_FUNCTION + hipError_t SubtractLeftCopy(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_input, + OutputIteratorT d_output, + NumItemsT num_items, + DifferenceOpT difference_op = {}, + hipStream_t stream = 0) { return hipCUDAErrorTohipError( ::cub::DeviceAdjacentDifference::SubtractLeftCopy(d_temp_storage, @@ -62,13 +63,14 @@ struct DeviceAdjacentDifference template - static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractLeft(void* d_temp_storage, - std::size_t& temp_storage_bytes, - RandomAccessIteratorT d_input, - NumItemsT num_items, - DifferenceOpT difference_op = {}, - hipStream_t stream = 0) + typename NumItemsT = uint32_t> + static HIPCUB_RUNTIME_FUNCTION + hipError_t SubtractLeft(void* d_temp_storage, + size_t& temp_storage_bytes, + RandomAccessIteratorT d_input, + NumItemsT num_items, + DifferenceOpT difference_op = {}, + hipStream_t stream = 0) { return hipCUDAErrorTohipError( ::cub::DeviceAdjacentDifference::SubtractLeft(d_temp_storage, @@ -82,14 +84,15 @@ struct DeviceAdjacentDifference template - static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractRightCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - InputIteratorT d_input, - OutputIteratorT d_output, - NumItemsT num_items, - DifferenceOpT difference_op = {}, - hipStream_t stream = 0) + typename NumItemsT = uint32_t> + static HIPCUB_RUNTIME_FUNCTION + hipError_t SubtractRightCopy(void* d_temp_storage, + size_t& 
temp_storage_bytes, + InputIteratorT d_input, + OutputIteratorT d_output, + NumItemsT num_items, + DifferenceOpT difference_op = {}, + hipStream_t stream = 0) { return hipCUDAErrorTohipError( ::cub::DeviceAdjacentDifference::SubtractRightCopy(d_temp_storage, @@ -103,13 +106,14 @@ struct DeviceAdjacentDifference template - static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractRight(void* d_temp_storage, - std::size_t& temp_storage_bytes, - RandomAccessIteratorT d_input, - NumItemsT num_items, - DifferenceOpT difference_op = {}, - hipStream_t stream = 0) + typename NumItemsT = uint32_t> + static HIPCUB_RUNTIME_FUNCTION + hipError_t SubtractRight(void* d_temp_storage, + size_t& temp_storage_bytes, + RandomAccessIteratorT d_input, + NumItemsT num_items, + DifferenceOpT difference_op = {}, + hipStream_t stream = 0) { return hipCUDAErrorTohipError( ::cub::DeviceAdjacentDifference::SubtractRight(d_temp_storage, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge.hpp index 1489266704a..6bd57bef5e8 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge.hpp @@ -47,7 +47,7 @@ struct DeviceMerge typename CompareOp = ::cuda::std::less<>> HIPCUB_RUNTIME_FUNCTION static hipError_t MergeKeys(void* d_temp_storage, - std::size_t& temp_storage_bytes, + size_t& temp_storage_bytes, KeyIteratorIn1 keys_in1, int64_t num_keys1, KeyIteratorIn2 keys_in2, @@ -77,7 +77,7 @@ struct DeviceMerge typename CompareOp = ::cuda::std::less<>> HIPCUB_RUNTIME_FUNCTION static hipError_t MergePairs(void* d_temp_storage, - std::size_t& temp_storage_bytes, + size_t& temp_storage_bytes, KeyIteratorIn1 keys_in1, ValueIteratorIn1 values_in1, int64_t num_keys1, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge_sort.hpp 
b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge_sort.hpp index 61bc4165b7a..eabc2a64194 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge_sort.hpp @@ -40,13 +40,14 @@ BEGIN_HIPCUB_NAMESPACE struct DeviceMergeSort { template - HIPCUB_RUNTIME_FUNCTION static hipError_t SortPairs(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - ValueIteratorT d_items, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyIteratorT d_keys, + ValueIteratorT d_items, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceMergeSort::SortPairs(d_temp_storage, temp_storage_bytes, @@ -63,15 +64,16 @@ struct DeviceMergeSort typename ValueIteratorT, typename OffsetT, typename CompareOpT> - HIPCUB_RUNTIME_FUNCTION static hipError_t SortPairsCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyInputIteratorT d_input_keys, - ValueInputIteratorT d_input_items, - KeyIteratorT d_output_keys, - ValueIteratorT d_output_items, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairsCopy(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_input_keys, + ValueInputIteratorT d_input_items, + KeyIteratorT d_output_keys, + ValueIteratorT d_output_items, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceMergeSort::SortPairsCopy(d_temp_storage, temp_storage_bytes, @@ -85,12 +87,13 @@ struct DeviceMergeSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeys(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - OffsetT num_items, - 
CompareOpT compare_op, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyIteratorT d_keys, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceMergeSort::SortKeys(d_temp_storage, temp_storage_bytes, @@ -104,13 +107,14 @@ struct DeviceMergeSort typename KeyIteratorT, typename OffsetT, typename CompareOpT> - HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeysCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyInputIteratorT d_input_keys, - KeyIteratorT d_output_keys, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeysCopy(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_input_keys, + KeyIteratorT d_output_keys, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceMergeSort::SortKeysCopy(d_temp_storage, @@ -123,13 +127,14 @@ struct DeviceMergeSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortPairs(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - ValueIteratorT d_items, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyIteratorT d_keys, + ValueIteratorT d_items, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceMergeSort::StableSortPairs(d_temp_storage, temp_storage_bytes, @@ -141,12 +146,13 @@ struct DeviceMergeSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortKeys(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t 
StableSortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyIteratorT d_keys, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceMergeSort::StableSortKeys(d_temp_storage, temp_storage_bytes, @@ -160,13 +166,14 @@ struct DeviceMergeSort typename KeyIteratorT, typename OffsetT, typename CompareOpT> - HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortKeysCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyInputIteratorT d_input_keys, - KeyIteratorT d_output_keys, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeysCopy(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_input_keys, + KeyIteratorT d_output_keys, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceMergeSort::StableSortKeysCopy(d_temp_storage, temp_storage_bytes, @@ -181,15 +188,15 @@ struct DeviceMergeSort typename KeyIteratorT, typename OffsetT, typename CompareOpT> - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortKeysCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyInputIteratorT d_input_keys, - KeyIteratorT d_output_keys, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeysCopy(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_input_keys, + KeyIteratorT d_output_keys, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return StableSortKeysCopy(d_temp_storage, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_partition.hpp 
b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_partition.hpp index 1ccbaf88b1b..a347cd09b82 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_partition.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_partition.hpp @@ -99,7 +99,7 @@ struct DevicePartition typename NumItemsT> HIPCUB_RUNTIME_FUNCTION static hipError_t If(void* d_temp_storage, - std::size_t& temp_storage_bytes, + size_t& temp_storage_bytes, InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp index 7bb0d0df244..6dd0b93ebb2 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp @@ -103,7 +103,7 @@ class DeviceReduce InputIteratorT d_in, ExtremumOutIteratorT d_min_out, IndexOutIteratorT d_index_out, - ::std::int64_t num_items, + std::int64_t num_items, hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceReduce::ArgMin(d_temp_storage, @@ -163,7 +163,7 @@ class DeviceReduce InputIteratorT d_in, ExtremumOutIteratorT d_max_out, IndexOutIteratorT d_index_out, - ::std::int64_t num_items, + std::int64_t num_items, hipError_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceReduce::ArgMax(d_temp_storage, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp index 57abaaafec8..2751c7efb5f 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp @@ -260,7 +260,7 @@ class DeviceScan typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename EqualityOpT = ::hipcub::Equality, - 
typename NumItemsT = ::cuda::std::uint32_t> + typename NumItemsT = std::uint32_t> HIPCUB_RUNTIME_FUNCTION static hipError_t ExclusiveSumByKey(void* d_temp_storage, size_t& temp_storage_bytes, @@ -287,7 +287,7 @@ class DeviceScan typename ScanOpT, typename InitValueT, typename EqualityOpT = ::hipcub::Equality, - typename NumItemsT = ::cuda::std::uint32_t> + typename NumItemsT = std::uint32_t> HIPCUB_RUNTIME_FUNCTION static hipError_t ExclusiveScanByKey(void* d_temp_storage, size_t& temp_storage_bytes, @@ -316,7 +316,7 @@ class DeviceScan typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename EqualityOpT = ::hipcub::Equality, - typename NumItemsT = ::cuda::std::uint32_t> + typename NumItemsT = std::uint32_t> HIPCUB_RUNTIME_FUNCTION static hipError_t InclusiveSumByKey(void* d_temp_storage, size_t& temp_storage_bytes, @@ -342,7 +342,7 @@ class DeviceScan typename ValuesOutputIteratorT, typename ScanOpT, typename EqualityOpT = ::hipcub::Equality, - typename NumItemsT = ::cuda::std::uint32_t> + typename NumItemsT = std::uint32_t> HIPCUB_RUNTIME_FUNCTION static hipError_t InclusiveScanByKey(void* d_temp_storage, size_t& temp_storage_bytes, @@ -370,7 +370,7 @@ class DeviceScan typename ValuesOutputIteratorT, typename ScanOpT, typename EqualityOpT = ::hipcub::Equality, - typename NumItemsT = ::cuda::std::uint32_t> + typename NumItemsT = std::uint32_t> HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t InclusiveScanByKey(void* d_temp_storage, size_t& temp_storage_bytes, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_type.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_type.hpp index f81eb00dcf4..aaa05042d76 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_type.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_type.hpp @@ -34,6 +34,7 @@ #include "../../util_deprecated.hpp" #include _HIPCUB_STD_INCLUDE(iterator) +#include _HIPCUB_STD_INCLUDE(type_traits) 
#include // IWYU pragma: export diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_radix_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_radix_sort.hpp index 598f39a6ebe..e997a202b97 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_radix_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_radix_sort.hpp @@ -74,7 +74,7 @@ constexpr auto tuple_bit_size_impl() template struct tuple_bit_size<::hipcub::tuple> - : public std::integral_constant, 0>()> + : public std::integral_constant, 0>()> {}; } // namespace detail diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp index e66f8b4bf9c..fc87cec8dda 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp @@ -42,14 +42,15 @@ struct DeviceAdjacentDifference template - static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractLeftCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - InputIteratorT d_input, - OutputIteratorT d_output, - NumItemsT num_items, - DifferenceOpT difference_op = {}, - hipStream_t stream = 0) + typename NumItemsT = uint32_t> + static HIPCUB_RUNTIME_FUNCTION + hipError_t SubtractLeftCopy(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_input, + OutputIteratorT d_output, + NumItemsT num_items, + DifferenceOpT difference_op = {}, + hipStream_t stream = 0) { return ::rocprim::adjacent_difference(d_temp_storage, temp_storage_bytes, @@ -64,16 +65,16 @@ struct DeviceAdjacentDifference template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static HIPCUB_RUNTIME_FUNCTION hipError_t - SubtractLeftCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - InputIteratorT d_input, - 
OutputIteratorT d_output, - NumItemsT num_items, - DifferenceOpT difference_op, - hipStream_t stream, - bool debug_synchronous) + typename NumItemsT = uint32_t> + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static HIPCUB_RUNTIME_FUNCTION + hipError_t SubtractLeftCopy(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_input, + OutputIteratorT d_output, + NumItemsT num_items, + DifferenceOpT difference_op, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return SubtractLeftCopy(d_temp_storage, @@ -87,13 +88,14 @@ struct DeviceAdjacentDifference template - static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractLeft(void* d_temp_storage, - std::size_t& temp_storage_bytes, - RandomAccessIteratorT d_input, - NumItemsT num_items, - DifferenceOpT difference_op = {}, - hipStream_t stream = 0) + typename NumItemsT = uint32_t> + static HIPCUB_RUNTIME_FUNCTION + hipError_t SubtractLeft(void* d_temp_storage, + size_t& temp_storage_bytes, + RandomAccessIteratorT d_input, + NumItemsT num_items, + DifferenceOpT difference_op = {}, + hipStream_t stream = 0) { return ::rocprim::adjacent_difference_inplace(d_temp_storage, temp_storage_bytes, @@ -106,15 +108,15 @@ struct DeviceAdjacentDifference template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static HIPCUB_RUNTIME_FUNCTION hipError_t - SubtractLeft(void* d_temp_storage, - std::size_t& temp_storage_bytes, - RandomAccessIteratorT d_input, - NumItemsT num_items, - DifferenceOpT difference_op, - hipStream_t stream, - bool debug_synchronous) + typename NumItemsT = uint32_t> + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static HIPCUB_RUNTIME_FUNCTION + hipError_t SubtractLeft(void* d_temp_storage, + size_t& temp_storage_bytes, + RandomAccessIteratorT d_input, + NumItemsT num_items, + DifferenceOpT difference_op, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return SubtractLeft(d_temp_storage, @@ -128,14 +130,15 @@ struct 
DeviceAdjacentDifference template - static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractRightCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - InputIteratorT d_input, - OutputIteratorT d_output, - NumItemsT num_items, - DifferenceOpT difference_op = {}, - hipStream_t stream = 0) + typename NumItemsT = uint32_t> + static HIPCUB_RUNTIME_FUNCTION + hipError_t SubtractRightCopy(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_input, + OutputIteratorT d_output, + NumItemsT num_items, + DifferenceOpT difference_op = {}, + hipStream_t stream = 0) { return ::rocprim::adjacent_difference_right(d_temp_storage, temp_storage_bytes, @@ -150,16 +153,16 @@ struct DeviceAdjacentDifference template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static HIPCUB_RUNTIME_FUNCTION hipError_t - SubtractRightCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - InputIteratorT d_input, - OutputIteratorT d_output, - NumItemsT num_items, - DifferenceOpT difference_op, - hipStream_t stream, - bool debug_synchronous) + typename NumItemsT = uint32_t> + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static HIPCUB_RUNTIME_FUNCTION + hipError_t SubtractRightCopy(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_input, + OutputIteratorT d_output, + NumItemsT num_items, + DifferenceOpT difference_op, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return SubtractRightCopy(d_temp_storage, @@ -173,13 +176,14 @@ struct DeviceAdjacentDifference template - static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractRight(void* d_temp_storage, - std::size_t& temp_storage_bytes, - RandomAccessIteratorT d_input, - NumItemsT num_items, - DifferenceOpT difference_op = {}, - hipStream_t stream = 0) + typename NumItemsT = uint32_t> + static HIPCUB_RUNTIME_FUNCTION + hipError_t SubtractRight(void* d_temp_storage, + size_t& temp_storage_bytes, + RandomAccessIteratorT d_input, + NumItemsT num_items, + DifferenceOpT 
difference_op = {}, + hipStream_t stream = 0) { return ::rocprim::adjacent_difference_right_inplace(d_temp_storage, temp_storage_bytes, @@ -192,15 +196,15 @@ struct DeviceAdjacentDifference template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static HIPCUB_RUNTIME_FUNCTION hipError_t - SubtractRight(void* d_temp_storage, - std::size_t& temp_storage_bytes, - RandomAccessIteratorT d_input, - NumItemsT num_items, - DifferenceOpT difference_op, - hipStream_t stream, - bool debug_synchronous) + typename NumItemsT = uint32_t> + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static HIPCUB_RUNTIME_FUNCTION + hipError_t SubtractRight(void* d_temp_storage, + size_t& temp_storage_bytes, + RandomAccessIteratorT d_input, + NumItemsT num_items, + DifferenceOpT difference_op, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return SubtractRight(d_temp_storage, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge.hpp index 66ae98b6ea1..b4e7e0762f3 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge.hpp @@ -44,7 +44,7 @@ struct DeviceMerge typename CompareOp = ::rocprim::less<>> HIPCUB_RUNTIME_FUNCTION static hipError_t MergeKeys(void* d_temp_storage, - std::size_t& temp_storage_bytes, + size_t& temp_storage_bytes, KeyIteratorIn1 keys_in1, _HIPCUB_STD::int64_t num_keys1, KeyIteratorIn2 keys_in2, @@ -75,7 +75,7 @@ struct DeviceMerge typename CompareOp = ::rocprim::less<>> HIPCUB_RUNTIME_FUNCTION static hipError_t MergePairs(void* d_temp_storage, - std::size_t& temp_storage_bytes, + size_t& temp_storage_bytes, KeyIteratorIn1 keys_in1, ValueIteratorIn1 values_in1, _HIPCUB_STD::int64_t num_keys1, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge_sort.hpp 
b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge_sort.hpp index 0c9f4df4d82..c3da0ca00f7 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge_sort.hpp @@ -42,13 +42,14 @@ BEGIN_HIPCUB_NAMESPACE struct DeviceMergeSort { template - HIPCUB_RUNTIME_FUNCTION static hipError_t SortPairs(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - ValueIteratorT d_items, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyIteratorT d_keys, + ValueIteratorT d_items, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream = 0) { return ::rocprim::merge_sort(d_temp_storage, temp_storage_bytes, @@ -63,15 +64,15 @@ struct DeviceMergeSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairs(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - ValueIteratorT d_items, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyIteratorT d_keys, + ValueIteratorT d_items, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return SortPairs(d_temp_storage, @@ -89,15 +90,16 @@ struct DeviceMergeSort typename ValueIteratorT, typename OffsetT, typename CompareOpT> - HIPCUB_RUNTIME_FUNCTION static hipError_t SortPairsCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyInputIteratorT d_input_keys, - ValueInputIteratorT d_input_items, - KeyIteratorT d_output_keys, - ValueIteratorT d_output_items, - 
OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairsCopy(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_input_keys, + ValueInputIteratorT d_input_items, + KeyIteratorT d_output_keys, + ValueIteratorT d_output_items, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream = 0) { return ::rocprim::merge_sort(d_temp_storage, temp_storage_bytes, @@ -117,17 +119,17 @@ struct DeviceMergeSort typename ValueIteratorT, typename OffsetT, typename CompareOpT> - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairsCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyInputIteratorT d_input_keys, - ValueInputIteratorT d_input_items, - KeyIteratorT d_output_keys, - ValueIteratorT d_output_items, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairsCopy(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_input_keys, + ValueInputIteratorT d_input_items, + KeyIteratorT d_output_keys, + ValueIteratorT d_output_items, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return SortPairsCopy(d_temp_storage, @@ -142,12 +144,13 @@ struct DeviceMergeSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeys(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyIteratorT d_keys, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream = 0) { return ::rocprim::merge_sort(d_temp_storage, temp_storage_bytes, @@ -160,14 +163,14 @@ struct DeviceMergeSort } 
template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeys(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyIteratorT d_keys, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); @@ -177,13 +180,14 @@ struct DeviceMergeSort typename KeyIteratorT, typename OffsetT, typename CompareOpT> - HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeysCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyInputIteratorT d_input_keys, - KeyIteratorT d_output_keys, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeysCopy(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_input_keys, + KeyIteratorT d_output_keys, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream = 0) { return ::rocprim::merge_sort(d_temp_storage, @@ -200,15 +204,15 @@ struct DeviceMergeSort typename KeyIteratorT, typename OffsetT, typename CompareOpT> - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeysCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyInputIteratorT d_input_keys, - KeyIteratorT d_output_keys, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeysCopy(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_input_keys, + KeyIteratorT d_output_keys, + OffsetT 
num_items, + CompareOpT compare_op, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); @@ -222,13 +226,14 @@ struct DeviceMergeSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortPairs(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - ValueIteratorT d_items, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyIteratorT d_keys, + ValueIteratorT d_items, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream = 0) { return ::rocprim::merge_sort(d_temp_storage, temp_storage_bytes, @@ -243,15 +248,15 @@ struct DeviceMergeSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortPairs(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - ValueIteratorT d_items, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyIteratorT d_keys, + ValueIteratorT d_items, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return StableSortPairs(d_temp_storage, @@ -264,12 +269,13 @@ struct DeviceMergeSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortKeys(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyIteratorT d_keys, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream = 0) { return ::rocprim::merge_sort(d_temp_storage, 
temp_storage_bytes, @@ -282,14 +288,14 @@ struct DeviceMergeSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortKeys(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyIteratorT d_keys, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return StableSortKeys(d_temp_storage, @@ -304,13 +310,14 @@ struct DeviceMergeSort typename KeyIteratorT, typename OffsetT, typename CompareOpT> - HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortKeysCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyInputIteratorT d_input_keys, - KeyIteratorT d_output_keys, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeysCopy(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_input_keys, + KeyIteratorT d_output_keys, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream = 0) { return ::rocprim::merge_sort(d_temp_storage, temp_storage_bytes, @@ -326,15 +333,15 @@ struct DeviceMergeSort typename KeyIteratorT, typename OffsetT, typename CompareOpT> - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortKeysCopy(void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyInputIteratorT d_input_keys, - KeyIteratorT d_output_keys, - OffsetT num_items, - CompareOpT compare_op, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeysCopy(void* d_temp_storage, + size_t& temp_storage_bytes, + 
KeyInputIteratorT d_input_keys, + KeyIteratorT d_output_keys, + OffsetT num_items, + CompareOpT compare_op, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return StableSortKeysCopy(d_temp_storage, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_partition.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_partition.hpp index 55ca243d0a7..8d6a0094705 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_partition.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_partition.hpp @@ -155,7 +155,7 @@ struct DevicePartition typename NumItemsT> HIPCUB_RUNTIME_FUNCTION static hipError_t If(void* d_temp_storage, - std::size_t& temp_storage_bytes, + size_t& temp_storage_bytes, InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, @@ -190,7 +190,7 @@ struct DevicePartition typename NumItemsT> HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t If(void* d_temp_storage, - std::size_t& temp_storage_bytes, + size_t& temp_storage_bytes, InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp index 0a627af77a8..4262e26b369 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp @@ -493,7 +493,7 @@ class DeviceScan typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename EqualityOpT = ::hipcub::Equality, - typename NumItemsT = std::uint32_t> + typename NumItemsT = uint32_t> HIPCUB_RUNTIME_FUNCTION static hipError_t ExclusiveSumByKey(void* d_temp_storage, size_t& temp_storage_bytes, @@ -522,7 +522,7 @@ class 
DeviceScan typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename EqualityOpT = ::hipcub::Equality, - typename NumItemsT = std::uint32_t> + typename NumItemsT = uint32_t> HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t ExclusiveSumByKey(void* d_temp_storage, size_t& temp_storage_bytes, @@ -551,7 +551,7 @@ class DeviceScan typename ScanOpT, typename InitValueT, typename EqualityOpT = ::hipcub::Equality, - typename NumItemsT = std::uint32_t> + typename NumItemsT = uint32_t> HIPCUB_RUNTIME_FUNCTION static hipError_t ExclusiveScanByKey(void* d_temp_storage, size_t& temp_storage_bytes, @@ -592,7 +592,7 @@ class DeviceScan typename ScanOpT, typename InitValueT, typename EqualityOpT = ::hipcub::Equality, - typename NumItemsT = std::uint32_t> + typename NumItemsT = uint32_t> HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t ExclusiveScanByKey(void* d_temp_storage, size_t& temp_storage_bytes, @@ -623,7 +623,7 @@ class DeviceScan typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename EqualityOpT = ::hipcub::Equality, - typename NumItemsT = std::uint32_t> + typename NumItemsT = uint32_t> HIPCUB_RUNTIME_FUNCTION static hipError_t InclusiveSumByKey(void* d_temp_storage, size_t& temp_storage_bytes, @@ -649,7 +649,7 @@ class DeviceScan typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename EqualityOpT = ::hipcub::Equality, - typename NumItemsT = std::uint32_t> + typename NumItemsT = uint32_t> HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t InclusiveSumByKey(void* d_temp_storage, size_t& temp_storage_bytes, @@ -677,7 +677,7 @@ class DeviceScan typename ValuesOutputIteratorT, typename ScanOpT, typename EqualityOpT = ::hipcub::Equality, - typename NumItemsT = std::uint32_t> + typename NumItemsT = uint32_t> HIPCUB_RUNTIME_FUNCTION static hipError_t InclusiveScanByKey(void* d_temp_storage, size_t& temp_storage_bytes, @@ -714,7 
+714,7 @@ class DeviceScan typename ValuesOutputIteratorT, typename ScanOpT, typename EqualityOpT = ::hipcub::Equality, - typename NumItemsT = std::uint32_t> + typename NumItemsT = uint32_t> HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t InclusiveScanByKey(void* d_temp_storage, size_t& temp_storage_bytes, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_load.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_load.hpp index 1f26e85ed56..c3b8f85e9d4 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_load.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_load.hpp @@ -100,7 +100,7 @@ HIPCUB_FORCEINLINE detail::it_value_t ThreadLoad(InputIteratorT { return ThreadLoad(itr, detail::int_constant_t(), - ::std::bool_constant<::std::is_pointer::value>()); + ::std::bool_constant<_HIPCUB_STD::is_pointer::value>()); } END_HIPCUB_NAMESPACE diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp index 59ce95419ba..0de68d5a555 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp @@ -84,7 +84,7 @@ HIPCUB_FORCEINLINE void ThreadStore(OutputIteratorT itr, T val) ThreadStore(itr, val, detail::int_constant_t{}, - ::std::bool_constant<::std::is_pointer::value>()); + ::std::bool_constant<_HIPCUB_STD::is_pointer::value>()); } namespace detail diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_temporary_storage.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_temporary_storage.hpp index c2e465393da..a3cb419bbdf 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_temporary_storage.hpp +++ 
b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_temporary_storage.hpp @@ -42,7 +42,7 @@ BEGIN_HIPCUB_NAMESPACE namespace detail { // Base case: When N == 0 -template +template HIPCUB_HOST_DEVICE typename std::enable_if::type generate_partition(void* d_temp_storage, size_t& temp_storage_bytes, @@ -56,7 +56,7 @@ typename std::enable_if::type generate_partition(void* d_t } // Recursive case: When N > 0 -template +template HIPCUB_HOST_DEVICE typename std::enable_if<(N > 0), hipError_t>::type generate_partition(void* d_temp_storage, size_t& temp_storage_bytes, Generator gen, Ts... args) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp index 50b19c05814..9267bb7bc35 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp @@ -491,9 +491,9 @@ struct Uninitialized /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T using DeviceWord = typename UnitWord::DeviceWord; - static constexpr std::size_t DATA_SIZE = sizeof(T); - static constexpr std::size_t WORD_SIZE = sizeof(DeviceWord); - static constexpr std::size_t WORDS = DATA_SIZE / WORD_SIZE; + static constexpr size_t DATA_SIZE = sizeof(T); + static constexpr size_t WORD_SIZE = sizeof(DeviceWord); + static constexpr size_t WORDS = DATA_SIZE / WORD_SIZE; /// Backing storage DeviceWord storage[WORDS]; @@ -907,8 +907,8 @@ template struct is_extended_fp : std::integral_constant< bool, - std::is_same<__half, typename std::remove_cv::type>::value - || std::is_same::type>::value> + _HIPCUB_STD::is_same<__half, typename std::remove_cv::type>::value + || _HIPCUB_STD::is_same::type>::value> {}; // Gets "raw" type: drops reference and const qualifier. 
diff --git a/projects/hipcub/test/hipcub/bfloat16.hpp b/projects/hipcub/test/hipcub/bfloat16.hpp index 0e4976ed1ad..0de3b95a883 100644 --- a/projects/hipcub/test/hipcub/bfloat16.hpp +++ b/projects/hipcub/test/hipcub/bfloat16.hpp @@ -88,17 +88,17 @@ struct bfloat16_t *this = bfloat16_t(float(a)); } - /// Constructor from std::size_t - __host__ __device__ __forceinline__ bfloat16_t(std::size_t a) + /// Constructor from size_t + __host__ __device__ __forceinline__ bfloat16_t(size_t a) { *this = bfloat16_t(float(a)); } /// Constructor from unsigned long long int template - && (!std::is_same_v)>::type> + typename + = typename std::enable_if + && (!std::is_same_v)>::type> __host__ __device__ __forceinline__ bfloat16_t(T a) { *this = bfloat16_t(float(a)); diff --git a/projects/hipcub/test/hipcub/half.hpp b/projects/hipcub/test/hipcub/half.hpp index 7c1bdd1d653..1a8b3db1cad 100644 --- a/projects/hipcub/test/hipcub/half.hpp +++ b/projects/hipcub/test/hipcub/half.hpp @@ -77,17 +77,17 @@ struct half_t *this = half_t(float(a)); } - /// Constructor from std::size_t - __host__ __device__ __forceinline__ half_t(std::size_t a) + /// Constructor from size_t + __host__ __device__ __forceinline__ half_t(size_t a) { *this = half_t(float(a)); } /// Constructor from unsigned long long int template - && (!std::is_same_v)>::type> + typename + = typename std::enable_if + && (!std::is_same_v)>::type> __host__ __device__ __forceinline__ half_t(T a) { *this = half_t(float(a)); @@ -200,7 +200,7 @@ struct half_t f = (0xff << 23) | (sign << 31); // inf } } - static_assert(sizeof(float) == sizeof(std::uint32_t), "4-byte size check"); + static_assert(sizeof(float) == sizeof(uint32_t), "4-byte size check"); float ret{}; std::memcpy(&ret, &f, sizeof(float)); return ret; diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_adjacent_difference.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_adjacent_difference.cpp index d08d69b9b7a..7f04766fcbc 100644 --- 
a/projects/hipcub/test/hipcub/test_hipcub_device_adjacent_difference.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_adjacent_difference.cpp @@ -457,7 +457,7 @@ TYPED_TEST(HipcubDeviceAdjacentDifferenceLargeTests, LargeIndicesAndOpOnce) static constexpr hipStream_t stream = 0; // default - for(std::size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; @@ -485,8 +485,8 @@ TYPED_TEST(HipcubDeviceAdjacentDifferenceLargeTests, LargeIndicesAndOpOnce) FocusIndex op; // Allocate temporary storage - std::size_t temp_storage_size = 0; - void* d_temp_storage = nullptr; + size_t temp_storage_size = 0; + void* d_temp_storage = nullptr; HIP_CHECK(dispatch_adjacent_difference(left_tag, copy_tag, d_temp_storage, diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp index 408a88dd751..ecd5d0da0a9 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp @@ -904,11 +904,11 @@ using HipcubDeviceForEachInExtentsParamGenerator using HipcubDeviceForEachInExtentsTestsParams = typename HipcubTestParamsMergeAll< HipcubDeviceForEachInExtentsParamGenerator, - HipcubDeviceForEachInExtentsParamGenerator, - HipcubDeviceForEachInExtentsParamGenerator, - HipcubDeviceForEachInExtentsParamGenerator, - HipcubDeviceForEachInExtentsParamGenerator, - HipcubDeviceForEachInExtentsParamGenerator>::type; + HipcubDeviceForEachInExtentsParamGenerator, + HipcubDeviceForEachInExtentsParamGenerator, + HipcubDeviceForEachInExtentsParamGenerator, + HipcubDeviceForEachInExtentsParamGenerator<_HIPCUB_STD::int64_t>, + HipcubDeviceForEachInExtentsParamGenerator>::type; template class HipcubDeviceForBulkTests : public HipcubDeviceForTests {}; 
-using HipcubDeviceForBulkTestsParams = ::testing::Types, - DeviceForParams, - DeviceForParams, - DeviceForParams>; +using HipcubDeviceForBulkTestsParams = ::testing::Types, + DeviceForParams, + DeviceForParams<_HIPCUB_STD::int64_t>, + DeviceForParams>; TYPED_TEST_SUITE(HipcubDeviceForBulkTests, HipcubDeviceForBulkTestsParams); diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp index 38f94d2cb51..0ea81edd64e 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp @@ -1218,7 +1218,7 @@ TYPED_TEST(HipcubDeviceReduceLargeIndicesTests, LargeIndices) HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected - const std::size_t result = output[0]; + const size_t result = output[0]; ASSERT_EQ(result, size); HIP_CHECK(hipFree(d_output)); diff --git a/projects/hipcub/test/hipcub/test_utils_data_generation.hpp b/projects/hipcub/test/hipcub/test_utils_data_generation.hpp index d5a0526ea65..12026cd326d 100644 --- a/projects/hipcub/test/hipcub/test_utils_data_generation.hpp +++ b/projects/hipcub/test/hipcub/test_utils_data_generation.hpp @@ -137,9 +137,9 @@ class numeric_limits : public _HIPCUB_STD::numeric_limits #if _CCCL_HAS_INT128() template -using is_int128 = std::is_same<__int128_t, typename std::remove_cv::type>; +using is_int128 = _HIPCUB_STD::is_same<__int128_t, typename std::remove_cv::type>; template -using is_uint128 = std::is_same<__uint128_t, typename std::remove_cv::type>; +using is_uint128 = _HIPCUB_STD::is_same<__uint128_t, typename std::remove_cv::type>; #else template using is_int128 = std::false_type; @@ -148,17 +148,18 @@ using is_uint128 = std::false_type; #endif // _CCCL_HAS_INT128() template -using is_half = std::is_same::type>; +using is_half = _HIPCUB_STD::is_same::type>; template -using is_bfloat16 = std::is_same::type>; +using is_bfloat16 = _HIPCUB_STD::is_same::type>; 
template -using is_native_half = std::is_same::type>; +using is_native_half + = _HIPCUB_STD::is_same::type>; template using is_native_bfloat16 - = std::is_same::type>; + = _HIPCUB_STD::is_same::type>; template struct convert_to_native_t_impl From 8de8ac33df045e8457376542da362b0a7c1e177f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 10 Dec 2025 13:01:37 +0000 Subject: [PATCH 42/95] Increase test coverage for hipCUB batched device copy and add fix for zero-size elements --- .../backend/cub/device/device_memcpy.hpp | 6 + .../backend/rocprim/device/device_memcpy.hpp | 6 + .../test/hipcub/test_hipcub_device_memcpy.cpp | 303 ++++++++++++++++++ 3 files changed, 315 insertions(+) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp index e5455be55e7..40a643a4047 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp @@ -48,6 +48,12 @@ struct DeviceMemcpy _HIPCUB_STD::int64_t num_buffers, hipStream_t stream = 0) { + if(num_buffers == 0) + { + temp_storage_bytes = 0; + return hipSuccess; + } + return hipCUDAErrorTohipError(::cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, input_buffer_it, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp index fc7b01cadcd..0fb0c6b7a44 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp @@ -48,6 +48,12 @@ struct DeviceMemcpy _HIPCUB_STD::int64_t num_buffers, hipStream_t stream = 0) { + if(num_buffers == 0) + { + temp_storage_bytes = 0; + return hipSuccess; + } + return rocprim::batch_memcpy(d_temp_storage, temp_storage_bytes, 
input_buffer_it, diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp index b2b997ff8f1..80c01494feb 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp @@ -338,3 +338,306 @@ TYPED_TEST(DeviceBatchMemcpyTests, SizeAndTypeVariation) HIP_CHECK(hipFree(d_temp_storage)); } + +TEST(DeviceMemcpyBatched, ZeroBuffersNoOp) +{ + using T = uint8_t; + T** d_srcs = nullptr; + T** d_dsts = nullptr; + size_t* d_sizes = nullptr; + + size_t temp_bytes = 0; + HIP_CHECK(hipcub::DeviceMemcpy::Batched(nullptr, temp_bytes, d_srcs, d_dsts, d_sizes, 0)); + void* d_temp = nullptr; + if(temp_bytes) + { + HIP_CHECK(hipMalloc(&d_temp, temp_bytes)); + } + + // Should be a no-op without crashing + HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_temp, + temp_bytes, + d_srcs, + d_dsts, + d_sizes, + 0, + hipStreamDefault)); + if(d_temp) + { + HIP_CHECK(hipFree(d_temp)); + } +} + +TEST(DeviceMemcpyBatched, ZeroSizeEntries) +{ + using T = uint8_t; + const int num_buffers = 5; + const std::vector h_sizes = {0, 1, 0, 7, 0}; + + size_t total = 0; + for(auto s : h_sizes) + { + total += s; + } + + T* d_input = nullptr; + T* d_output = nullptr; + HIP_CHECK(hipMalloc(&d_input, total)); + HIP_CHECK(hipMalloc(&d_output, total)); + + // Build src/dst arrays + std::vector h_srcs(num_buffers), h_dsts(num_buffers); + size_t offset = 0; + for(int i = 0; i < num_buffers; ++i) + { + h_srcs[i] = d_input + offset; + h_dsts[i] = d_output + offset; + offset += h_sizes[i]; + } + + // Fill input + std::vector h_in(total); + std::iota(h_in.begin(), h_in.end(), static_cast(3)); + HIP_CHECK(hipMemcpy(d_input, h_in.data(), total, hipMemcpyHostToDevice)); + + // Device arrays + T** d_srcs = nullptr; + T** d_dsts = nullptr; + size_t* d_sizes = nullptr; + HIP_CHECK(hipMalloc(&d_srcs, num_buffers * sizeof(T*))); + HIP_CHECK(hipMalloc(&d_dsts, num_buffers * sizeof(T*))); + 
HIP_CHECK(hipMalloc(&d_sizes, num_buffers * sizeof(size_t))); + HIP_CHECK(hipMemcpy(d_srcs, h_srcs.data(), num_buffers * sizeof(T*), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_dsts, h_dsts.data(), num_buffers * sizeof(T*), hipMemcpyHostToDevice)); + HIP_CHECK( + hipMemcpy(d_sizes, h_sizes.data(), num_buffers * sizeof(size_t), hipMemcpyHostToDevice)); + + // Temp storage + size_t temp_bytes = 0; + HIP_CHECK( + hipcub::DeviceMemcpy::Batched(nullptr, temp_bytes, d_srcs, d_dsts, d_sizes, num_buffers)); + void* d_temp = nullptr; + HIP_CHECK(hipMalloc(&d_temp, temp_bytes)); + + HIP_CHECK( + hipcub::DeviceMemcpy::Batched(d_temp, temp_bytes, d_srcs, d_dsts, d_sizes, num_buffers)); + + // Verify + std::vector h_out(total); + HIP_CHECK(hipMemcpy(h_out.data(), d_output, total, hipMemcpyDeviceToHost)); + EXPECT_EQ(h_in, h_out); + + HIP_CHECK(hipFree(d_temp)); + HIP_CHECK(hipFree(d_sizes)); + HIP_CHECK(hipFree(d_dsts)); + HIP_CHECK(hipFree(d_srcs)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); +} + +TEST(DeviceMemcpyBatched, NonDefaultStreamAndTempReuse) +{ + using T = uint32_t; + const int num_buffers = 8; + std::mt19937 rng(123); + std::uniform_int_distribution dist(1, 4096); + std::vector sizes(num_buffers); + size_t total = 0; + for(int i = 0; i < num_buffers; ++i) + { + sizes[i] = dist(rng) * sizeof(T); + total += sizes[i]; + } + + T* d_in = nullptr; + T* d_out = nullptr; + HIP_CHECK(hipMalloc(&d_in, total)); + HIP_CHECK(hipMalloc(&d_out, total)); + + // Fill input + std::vector h_in(total / sizeof(T)); + for(size_t i = 0; i < h_in.size(); ++i) + h_in[i] = static_cast(i ^ 0xDEADBEEF); + HIP_CHECK(hipMemcpy(d_in, h_in.data(), total, hipMemcpyHostToDevice)); + + // Build offset/size pairs + struct Chunk + { + size_t offset; + size_t size; + }; + std::vector chunks(num_buffers); + size_t acc = 0; + for(int i = 0; i < num_buffers; ++i) + { + chunks[i].offset = acc; + chunks[i].size = sizes[i]; + acc += sizes[i]; + } + std::shuffle(chunks.begin(), 
chunks.end(), rng); + + std::vector h_srcs(num_buffers), h_dsts(num_buffers); + std::vector h_sizes(num_buffers); + for(int i = 0; i < num_buffers; ++i) + { + h_srcs[i] = d_in + chunks[i].offset / sizeof(T); + h_dsts[i] = d_out + chunks[i].offset / sizeof(T); + h_sizes[i] = chunks[i].size; + } + + T** d_srcs = nullptr; + T** d_dsts = nullptr; + size_t* d_sizes = nullptr; + HIP_CHECK(hipMalloc(&d_srcs, num_buffers * sizeof(T*))); + HIP_CHECK(hipMalloc(&d_dsts, num_buffers * sizeof(T*))); + HIP_CHECK(hipMalloc(&d_sizes, num_buffers * sizeof(size_t))); + + // Setup stream and event + hipStream_t setup; + hipEvent_t ready; + HIP_CHECK(hipStreamCreate(&setup)); + HIP_CHECK(hipEventCreateWithFlags(&ready, hipEventDisableTiming)); + + HIP_CHECK(hipMemcpyAsync(d_srcs, + h_srcs.data(), + num_buffers * sizeof(T*), + hipMemcpyHostToDevice, + setup)); + HIP_CHECK(hipMemcpyAsync(d_dsts, + h_dsts.data(), + num_buffers * sizeof(T*), + hipMemcpyHostToDevice, + setup)); + HIP_CHECK(hipMemcpyAsync(d_sizes, + h_sizes.data(), + num_buffers * sizeof(size_t), + hipMemcpyHostToDevice, + setup)); + HIP_CHECK(hipEventRecord(ready, setup)); + + // Query temp storage + size_t temp_bytes = 0; + HIP_CHECK( + hipcub::DeviceMemcpy::Batched(nullptr, temp_bytes, d_srcs, d_dsts, d_sizes, num_buffers)); + + void* d_tempA = nullptr; + void* d_tempB = nullptr; + HIP_CHECK(hipMalloc(&d_tempA, temp_bytes)); + HIP_CHECK(hipMalloc(&d_tempB, temp_bytes)); + + hipStream_t streamA, streamB; + HIP_CHECK(hipStreamCreate(&streamA)); + HIP_CHECK(hipStreamCreate(&streamB)); + HIP_CHECK(hipStreamWaitEvent(streamA, ready, 0)); + HIP_CHECK(hipStreamWaitEvent(streamB, ready, 0)); + + // Launch batched memcpy + HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_tempA, + temp_bytes, + d_srcs, + d_dsts, + d_sizes, + num_buffers, + streamA)); + HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_tempB, + temp_bytes, + d_srcs, + d_dsts, + d_sizes, + num_buffers, + streamB)); + + HIP_CHECK(hipStreamSynchronize(streamA)); + 
HIP_CHECK(hipStreamSynchronize(streamB)); + + // Verify + std::vector h_out(h_in.size()); + HIP_CHECK(hipMemcpy(h_out.data(), d_out, total, hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < h_in.size(); ++i) + { + EXPECT_EQ(h_in[i], h_out[i]) << "Mismatch at index " << i; + } + + // Cleanup + HIP_CHECK(hipEventDestroy(ready)); + HIP_CHECK(hipStreamDestroy(setup)); + HIP_CHECK(hipStreamDestroy(streamA)); + HIP_CHECK(hipStreamDestroy(streamB)); + HIP_CHECK(hipFree(d_tempA)); + HIP_CHECK(hipFree(d_tempB)); + HIP_CHECK(hipFree(d_sizes)); + HIP_CHECK(hipFree(d_dsts)); + HIP_CHECK(hipFree(d_srcs)); + HIP_CHECK(hipFree(d_out)); + HIP_CHECK(hipFree(d_in)); +} + +struct PackedPair +{ + uint16_t a; + uint16_t b; +}; // alignment-sensitive +TEST(DeviceMemcpyBatched, PackedStructAlignment) +{ + using T = PackedPair; + const int num_buffers = 4; + const size_t elems_per_buffer = 1024; + const size_t bytes_per_buffer = elems_per_buffer * sizeof(T); + const size_t total_bytes = num_buffers * bytes_per_buffer; + + T* d_in = nullptr; + T* d_out = nullptr; + HIP_CHECK(hipMalloc(&d_in, total_bytes)); + HIP_CHECK(hipMalloc(&d_out, total_bytes)); + + std::vector h_in(num_buffers * elems_per_buffer); + for(size_t i = 0; i < h_in.size(); ++i) + { + h_in[i] = T{static_cast(i), static_cast(~i)}; + } + HIP_CHECK(hipMemcpy(d_in, h_in.data(), total_bytes, hipMemcpyHostToDevice)); + + std::vector h_sizes(num_buffers, bytes_per_buffer); + std::vector h_srcs(num_buffers), h_dsts(num_buffers); + for(int i = 0; i < num_buffers; ++i) + { + h_srcs[i] = d_in + i * elems_per_buffer; + h_dsts[i] = d_out + i * elems_per_buffer; + } + + T** d_srcs = nullptr; + T** d_dsts = nullptr; + size_t* d_sizes = nullptr; + HIP_CHECK(hipMalloc(&d_srcs, num_buffers * sizeof(T*))); + HIP_CHECK(hipMalloc(&d_dsts, num_buffers * sizeof(T*))); + HIP_CHECK(hipMalloc(&d_sizes, num_buffers * sizeof(size_t))); + HIP_CHECK(hipMemcpy(d_srcs, h_srcs.data(), num_buffers * sizeof(T*), hipMemcpyHostToDevice)); + 
HIP_CHECK(hipMemcpy(d_dsts, h_dsts.data(), num_buffers * sizeof(T*), hipMemcpyHostToDevice)); + HIP_CHECK( + hipMemcpy(d_sizes, h_sizes.data(), num_buffers * sizeof(size_t), hipMemcpyHostToDevice)); + + size_t temp_bytes = 0; + HIP_CHECK( + hipcub::DeviceMemcpy::Batched(nullptr, temp_bytes, d_srcs, d_dsts, d_sizes, num_buffers)); + void* d_temp = nullptr; + HIP_CHECK(hipMalloc(&d_temp, temp_bytes)); + + HIP_CHECK( + hipcub::DeviceMemcpy::Batched(d_temp, temp_bytes, d_srcs, d_dsts, d_sizes, num_buffers)); + + std::vector h_out(h_in.size()); + HIP_CHECK(hipMemcpy(h_out.data(), d_out, total_bytes, hipMemcpyDeviceToHost)); + EXPECT_TRUE(std::equal(h_in.begin(), + h_in.end(), + h_out.begin(), + [](const PackedPair& x, const PackedPair& y) + { return x.a == y.a && x.b == y.b; })); + + HIP_CHECK(hipFree(d_temp)); + HIP_CHECK(hipFree(d_sizes)); + HIP_CHECK(hipFree(d_dsts)); + HIP_CHECK(hipFree(d_srcs)); + HIP_CHECK(hipFree(d_out)); + HIP_CHECK(hipFree(d_in)); +} From 53242d32925d18fa60e5408f373d5e9c0b006b7d Mon Sep 17 00:00:00 2001 From: Nara Prasetya Date: Tue, 16 Dec 2025 08:55:43 +0000 Subject: [PATCH 43/95] fix(hipcub): fix broken bitwise representation derivation for extended types --- .../block/radix_rank_sort_operations.hpp | 5 +- .../hipcub/test_hipcub_device_radix_sort.hpp | 4 +- .../hipcub/test_utils_sort_comparator.hpp | 74 +++++++------------ 3 files changed, 31 insertions(+), 52 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp index b9660713250..b7b17ec419e 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp @@ -36,6 +36,7 @@ #define HIPCUB_ROCPRIM_BLOCK_RADIX_RANK_SORT_OPERATIONS_HPP_ #include "../../../config.hpp" +#include "../../../libcxx.hpp" #include 
"../util_type.hpp" #include // IWYU pragma: export @@ -94,7 +95,9 @@ struct RadixSortTwiddle enum { - FLOAT_KEY = std::is_floating_point::value, + FLOAT_KEY = _HIPCUB_STD::is_floating_point_v + || std::is_same_v + || std::is_same_v, }; static __device__ __forceinline__ UnsignedBits ProcessFloatMinusZero(UnsignedBits key) diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp index de5189890c0..183b30bc1e6 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp @@ -88,7 +88,7 @@ TYPED_TEST_SUITE_P(HipcubDeviceRadixSort); template auto generate_key_input(size_t size, unsigned int seed_value) - -> std::enable_if_t::value, std::vector> + -> std::enable_if_t<_HIPCUB_STD::is_floating_point_v, std::vector> { auto result = test_utils::get_random_data(size, test_utils::numeric_limits::min(), @@ -100,7 +100,7 @@ auto generate_key_input(size_t size, unsigned int seed_value) template auto generate_key_input(size_t size, unsigned int seed_value) - -> std::enable_if_t::value, std::vector> + -> std::enable_if_t, std::vector> { using inner_t = typename test_utils::inner_type::type; return test_utils::get_random_data(size, diff --git a/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp b/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp index 03f80829f13..ce759c555e5 100644 --- a/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp +++ b/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp @@ -33,6 +33,7 @@ #include "test_utils_custom_test_types.hpp" #include "test_utils_half.hpp" +#include #include #include @@ -42,10 +43,20 @@ namespace test_utils namespace detail { +template +constexpr bool is_extended_int + = std::is_same_v || std::is_same_v; template::value, int> = 0> + std::enable_if_t< + // Catch integral types and extended integral types. 
+ // The std::is_same_v<...> clauses can be removed once + // libhipcxx is a hard dependency and test types half_t + // and bfloat_t are removed. + _HIPCUB_STD::is_integral_v || is_extended_int, + int> + = 0> Key to_bits(const Key key) { using Bits = typename hipcub::Traits::UnsignedBits; @@ -58,13 +69,25 @@ Key to_bits(const Key key) return key & radix_mask; } +template +constexpr bool is_extended_fp + = std::is_same_v || std::is_same_v + || std::is_same_v || std::is_same_v; template::value, int> = 0> + std::enable_if_t< + // Catch floating types and extended floating types. + // The std::is_same_v<...> clauses can be removed once + // libhipcxx is a hard dependency and test types half_t + // and bfloat_t are removed. + _HIPCUB_STD::is_floating_point_v || is_extended_fp, + int> + = 0> auto to_bits(const Key key) { using unsigned_bits_type = typename hipcub::NumericTraits::UnsignedBits; + static_assert(sizeof(unsigned_bits_type) == sizeof(Key)); unsigned_bits_type bit_key; std::memcpy(&bit_key, &key, sizeof(unsigned_bits_type)); @@ -126,53 +149,6 @@ auto to_bits(const Key& key) return to_bits(bit_key); } -template -auto to_bits(const hip_bfloat16& key) -{ - float f = static_cast(key); - return to_bits(f); -} - -template -auto to_bits(const __half& key) -{ - float f = static_cast(key); - return to_bits(f); -} - -template -auto to_bits(const __int128 key) -{ - using U = unsigned __int128; - U bits = static_cast(key); - bits ^= (U(1) << 127); - constexpr unsigned width = EndBit - StartBit; - if constexpr(width == 128) - { - return bits; - } - else - { - const U mask = (static_cast(1) << width) - 1; - return (bits >> StartBit) & mask; - } -} - -template -auto to_bits(const unsigned __int128 key) -{ - constexpr unsigned width = EndBit - StartBit; - if constexpr(width == 128) - { - return key; - } - else - { - const unsigned __int128 mask = (static_cast(1) << width) - 1; - return (key >> StartBit) & mask; - } -} - } // namespace detail template From 
1e9b0bc36dc82b68cdf480be0a2105c16c33c85c Mon Sep 17 00:00:00 2001 From: Cenxuan Tian Date: Thu, 18 Dec 2025 12:54:05 +0000 Subject: [PATCH 44/95] Resolve "Fix is_floating_point value in hipCUB/rocPRIM" --- projects/rocprim/CHANGELOG.md | 14 ++++++++ .../rocprim/include/rocprim/type_traits.hpp | 35 +++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/projects/rocprim/CHANGELOG.md b/projects/rocprim/CHANGELOG.md index 96c929a6189..d9684b9e293 100644 --- a/projects/rocprim/CHANGELOG.md +++ b/projects/rocprim/CHANGELOG.md @@ -2,6 +2,20 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projects/rocPRIM/en/latest/](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/). +## rocPRIM 4.5.0 for ROCm 7.14 + +### Added + +* Added C++ 17 style type_traits utilities + * is_floating_point_v + * is_integral_v + * is_arithmetic_v + * is_fundamental_v + * is_unsigned_v + * is_signed_v + * is_scalar_v + * is_compound_v + ## Since last release ROCm 7.12 ### Added diff --git a/projects/rocprim/rocprim/include/rocprim/type_traits.hpp b/projects/rocprim/rocprim/include/rocprim/type_traits.hpp index 10e602c869a..f8e5cffb2b8 100644 --- a/projects/rocprim/rocprim/include/rocprim/type_traits.hpp +++ b/projects/rocprim/rocprim/include/rocprim/type_traits.hpp @@ -1425,6 +1425,34 @@ template struct is_compound : std::integral_constant().is_compound()> {}; +#ifndef DOXYGEN_DOCUMENTATION_BUILD + +template +constexpr bool is_floating_point_v = is_floating_point::value; + +template +constexpr bool is_integral_v = is_integral::value; + +template +constexpr bool is_arithmetic_v = is_arithmetic::value; + +template +constexpr bool is_fundamental_v = is_fundamental::value; + +template +constexpr bool is_unsigned_v = is_unsigned::value; + +template +constexpr bool is_signed_v = is_signed::value; + +template +constexpr bool is_scalar_v = is_scalar::value; + +template +constexpr bool is_compound_v = is_compound::value; + +#endif + 
static_assert(::rocprim::traits::radix_key_codec::radix_key_fundamental::value, "'int' should be fundamental"); static_assert(!::rocprim::traits::radix_key_codec::radix_key_fundamental::value, @@ -1436,6 +1464,13 @@ static_assert(::rocprim::traits::radix_key_codec::radix_key_fundamental::value, "'rocprim::int128_t*' should not be fundamental"); +static_assert(::rocprim::is_floating_point_v<__half>, "__half should be a floating point type"); +static_assert(::rocprim::is_floating_point_v, "bfloat16 should be a floating point type"); +static_assert(::rocprim::is_integral_v<::rocprim::int128_t>, + "::rocprim::int128_t should be a integral type"); +static_assert(::rocprim::is_integral_v<::rocprim::uint128_t>, + "::rocprim::uint128_t should be a integral type"); + END_ROCPRIM_NAMESPACE #endif From 040d9ab551706171e2f95d9db34a5b2919708126 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 5 Jan 2026 10:10:48 +0000 Subject: [PATCH 45/95] Update copyright year --- projects/hipcub/CMakeLists.txt | 2 +- projects/hipcub/benchmark/CMakeLists.txt | 2 +- .../hipcub/benchmark/benchmark_block_adjacent_difference.cpp | 2 +- projects/hipcub/benchmark/benchmark_block_discontinuity.cpp | 2 +- projects/hipcub/benchmark/benchmark_block_exchange.cpp | 2 +- projects/hipcub/benchmark/benchmark_block_histogram.cpp | 2 +- projects/hipcub/benchmark/benchmark_block_merge_sort.cpp | 2 +- projects/hipcub/benchmark/benchmark_block_radix_rank.cpp | 2 +- projects/hipcub/benchmark/benchmark_block_radix_sort.cpp | 2 +- projects/hipcub/benchmark/benchmark_block_reduce.cpp | 2 +- .../hipcub/benchmark/benchmark_block_run_length_decode.cpp | 2 +- projects/hipcub/benchmark/benchmark_block_scan.cpp | 2 +- projects/hipcub/benchmark/benchmark_block_shuffle.cpp | 2 +- .../hipcub/benchmark/benchmark_device_adjacent_difference.cpp | 2 +- projects/hipcub/benchmark/benchmark_device_histogram.cpp | 2 +- projects/hipcub/benchmark/benchmark_device_memory.cpp | 2 +- 
projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp | 2 +- .../hipcub/benchmark/benchmark_device_run_length_encode.cpp | 2 +- projects/hipcub/benchmark/benchmark_utils.hpp | 2 +- projects/hipcub/benchmark/benchmark_warp_exchange.cpp | 2 +- projects/hipcub/benchmark/benchmark_warp_load.cpp | 2 +- projects/hipcub/benchmark/benchmark_warp_merge_sort.cpp | 2 +- projects/hipcub/benchmark/benchmark_warp_reduce.cpp | 2 +- projects/hipcub/benchmark/benchmark_warp_scan.cpp | 2 +- projects/hipcub/benchmark/benchmark_warp_store.cpp | 2 +- projects/hipcub/benchmark/common_benchmark_header.hpp | 2 +- projects/hipcub/cmake/Dependencies.cmake | 2 +- projects/hipcub/cmake/SetupNVCC.cmake | 2 +- projects/hipcub/examples/example_utils.hpp | 2 +- .../hipcub/backend/cub/agent/single_pass_scan_operators.hpp | 2 +- .../hipcub/backend/cub/device/device_adjacent_difference.hpp | 2 +- .../hipcub/include/hipcub/backend/cub/device/device_copy.hpp | 2 +- .../hipcub/include/hipcub/backend/cub/device/device_for.hpp | 2 +- .../include/hipcub/backend/cub/device/device_histogram.hpp | 2 +- .../include/hipcub/backend/cub/device/device_memcpy.hpp | 2 +- .../hipcub/include/hipcub/backend/cub/device/device_merge.hpp | 4 ++-- .../include/hipcub/backend/cub/device/device_merge_sort.hpp | 2 +- .../include/hipcub/backend/cub/device/device_partition.hpp | 2 +- .../include/hipcub/backend/cub/device/device_radix_sort.hpp | 2 +- .../include/hipcub/backend/cub/device/device_reduce.hpp | 2 +- .../hipcub/backend/cub/device/device_run_length_encode.hpp | 2 +- .../hipcub/include/hipcub/backend/cub/device/device_scan.hpp | 2 +- .../hipcub/backend/cub/device/device_segmented_radix_sort.hpp | 2 +- .../hipcub/backend/cub/device/device_segmented_reduce.hpp | 2 +- .../hipcub/backend/cub/device/device_segmented_sort.hpp | 2 +- .../include/hipcub/backend/cub/device/device_select.hpp | 2 +- projects/hipcub/hipcub/include/hipcub/backend/cub/hipcub.hpp | 2 +- 
.../hipcub/hipcub/include/hipcub/backend/cub/util_macro.hpp | 2 +- .../include/hipcub/backend/cub/util_temporary_storage.hpp | 2 +- .../backend/rocprim/agent/single_pass_scan_operators.hpp | 2 +- .../include/hipcub/backend/rocprim/block/block_merge_sort.hpp | 2 +- .../include/hipcub/backend/rocprim/block/block_radix_sort.hpp | 2 +- .../hipcub/backend/rocprim/block/block_run_length_decode.hpp | 2 +- .../backend/rocprim/block/radix_rank_sort_operations.hpp | 2 +- .../backend/rocprim/device/device_adjacent_difference.hpp | 2 +- .../include/hipcub/backend/rocprim/device/device_copy.hpp | 2 +- .../include/hipcub/backend/rocprim/device/device_for.hpp | 2 +- .../hipcub/backend/rocprim/device/device_histogram.hpp | 2 +- .../include/hipcub/backend/rocprim/device/device_memcpy.hpp | 2 +- .../include/hipcub/backend/rocprim/device/device_merge.hpp | 4 ++-- .../hipcub/backend/rocprim/device/device_merge_sort.hpp | 2 +- .../hipcub/backend/rocprim/device/device_partition.hpp | 2 +- .../include/hipcub/backend/rocprim/device/device_reduce.hpp | 2 +- .../include/hipcub/backend/rocprim/device/device_scan.hpp | 2 +- .../include/hipcub/backend/rocprim/grid/grid_even_share.hpp | 2 +- .../hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp | 2 +- .../backend/rocprim/iterator/arg_index_input_iterator.hpp | 2 +- .../include/hipcub/backend/rocprim/thread/thread_load.hpp | 2 +- .../hipcub/backend/rocprim/thread/thread_operators.hpp | 2 +- .../include/hipcub/backend/rocprim/thread/thread_reduce.hpp | 2 +- .../include/hipcub/backend/rocprim/thread/thread_scan.hpp | 2 +- .../include/hipcub/backend/rocprim/thread/thread_search.hpp | 2 +- .../include/hipcub/backend/rocprim/thread/thread_sort.hpp | 2 +- .../include/hipcub/backend/rocprim/thread/thread_store.hpp | 2 +- .../hipcub/include/hipcub/backend/rocprim/util_macro.hpp | 4 ++-- .../hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp | 2 +- .../hipcub/include/hipcub/backend/rocprim/util_sync.hpp | 2 +- 
.../include/hipcub/backend/rocprim/util_temporary_storage.hpp | 2 +- .../hipcub/include/hipcub/backend/rocprim/util_type.hpp | 2 +- projects/hipcub/hipcub/include/hipcub/config.hpp | 2 +- projects/hipcub/hipcub/include/hipcub/libcxx.hpp | 2 +- projects/hipcub/test/extra/CMakeLists.txt | 2 +- projects/hipcub/test/hipcub/CMakeLists.txt | 2 +- projects/hipcub/test/hipcub/bfloat16.hpp | 1 + projects/hipcub/test/hipcub/common_test_header.hpp | 2 +- projects/hipcub/test/hipcub/half.hpp | 1 + projects/hipcub/test/hipcub/single_index_iterator.hpp | 2 +- .../test/hipcub/test_hipcub_block_adjacent_difference.cpp | 2 +- .../hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_block_histogram.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_block_load_store.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_block_load_store.hpp | 2 +- .../test/hipcub/test_hipcub_block_load_store.kernels.hpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp | 2 +- .../test/hipcub/test_hipcub_block_run_length_decode.cpp | 2 +- .../test/hipcub/test_hipcub_device_adjacent_difference.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_device_for.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp | 2 +- .../hipcub/test/hipcub/test_hipcub_device_radix_sort.cpp.in | 2 +- projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp | 2 +- .../hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp | 2 +- .../test/hipcub/test_hipcub_device_run_length_encode.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp | 2 +- .../test/hipcub/test_hipcub_device_segmented_reduce.cpp | 2 +- .../hipcub/test/hipcub/test_hipcub_device_segmented_sort.hpp | 2 +- 
projects/hipcub/test/hipcub/test_hipcub_device_select.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_grid.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_iterators.cpp | 2 +- .../test/hipcub/test_hipcub_single_pass_scan_operators.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_warp_load.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp | 2 +- projects/hipcub/test/hipcub/test_utils.hpp | 2 +- projects/hipcub/test/hipcub/test_utils_assertions.hpp | 2 +- projects/hipcub/test/hipcub/test_utils_data_generation.hpp | 2 +- projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp | 2 +- projects/hipcub/test/hipcub/test_utils_thread_operators.hpp | 2 +- projects/rocprim/rmake.py | 2 +- 124 files changed, 127 insertions(+), 125 deletions(-) diff --git a/projects/hipcub/CMakeLists.txt b/projects/hipcub/CMakeLists.txt index dd6ef46a795..2b3cef1f1d6 100644 --- a/projects/hipcub/CMakeLists.txt +++ b/projects/hipcub/CMakeLists.txt @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/CMakeLists.txt b/projects/hipcub/benchmark/CMakeLists.txt index cf109273bed..fb21e56247c 100644 --- a/projects/hipcub/benchmark/CMakeLists.txt +++ b/projects/hipcub/benchmark/CMakeLists.txt @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_block_adjacent_difference.cpp b/projects/hipcub/benchmark/benchmark_block_adjacent_difference.cpp index 079977409fe..4581111bb93 100644 --- a/projects/hipcub/benchmark/benchmark_block_adjacent_difference.cpp +++ b/projects/hipcub/benchmark/benchmark_block_adjacent_difference.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_block_discontinuity.cpp b/projects/hipcub/benchmark/benchmark_block_discontinuity.cpp index f205430d2dc..1471ccb6a11 100644 --- a/projects/hipcub/benchmark/benchmark_block_discontinuity.cpp +++ b/projects/hipcub/benchmark/benchmark_block_discontinuity.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_block_exchange.cpp b/projects/hipcub/benchmark/benchmark_block_exchange.cpp index 2b3e96784c8..602c77bc523 100644 --- a/projects/hipcub/benchmark/benchmark_block_exchange.cpp +++ b/projects/hipcub/benchmark/benchmark_block_exchange.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_block_histogram.cpp b/projects/hipcub/benchmark/benchmark_block_histogram.cpp index c825e75c2bd..e593023d6bf 100644 --- a/projects/hipcub/benchmark/benchmark_block_histogram.cpp +++ b/projects/hipcub/benchmark/benchmark_block_histogram.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_block_merge_sort.cpp b/projects/hipcub/benchmark/benchmark_block_merge_sort.cpp index f1ac69b768b..42221bc6607 100644 --- a/projects/hipcub/benchmark/benchmark_block_merge_sort.cpp +++ b/projects/hipcub/benchmark/benchmark_block_merge_sort.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_block_radix_rank.cpp b/projects/hipcub/benchmark/benchmark_block_radix_rank.cpp index cba7f813e27..0f12ebd20fa 100644 --- a/projects/hipcub/benchmark/benchmark_block_radix_rank.cpp +++ b/projects/hipcub/benchmark/benchmark_block_radix_rank.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_block_radix_sort.cpp b/projects/hipcub/benchmark/benchmark_block_radix_sort.cpp index e1fc336c933..ca2a0809391 100644 --- a/projects/hipcub/benchmark/benchmark_block_radix_sort.cpp +++ b/projects/hipcub/benchmark/benchmark_block_radix_sort.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_block_reduce.cpp b/projects/hipcub/benchmark/benchmark_block_reduce.cpp index 82c11e08e9b..7c448693d17 100644 --- a/projects/hipcub/benchmark/benchmark_block_reduce.cpp +++ b/projects/hipcub/benchmark/benchmark_block_reduce.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_block_run_length_decode.cpp b/projects/hipcub/benchmark/benchmark_block_run_length_decode.cpp index 9f061370c62..ab317604c30 100644 --- a/projects/hipcub/benchmark/benchmark_block_run_length_decode.cpp +++ b/projects/hipcub/benchmark/benchmark_block_run_length_decode.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_block_scan.cpp b/projects/hipcub/benchmark/benchmark_block_scan.cpp index b4e79f44b5c..c08976b19c7 100644 --- a/projects/hipcub/benchmark/benchmark_block_scan.cpp +++ b/projects/hipcub/benchmark/benchmark_block_scan.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_block_shuffle.cpp b/projects/hipcub/benchmark/benchmark_block_shuffle.cpp index 7e9eeed2059..6a958198f0f 100644 --- a/projects/hipcub/benchmark/benchmark_block_shuffle.cpp +++ b/projects/hipcub/benchmark/benchmark_block_shuffle.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_device_adjacent_difference.cpp b/projects/hipcub/benchmark/benchmark_device_adjacent_difference.cpp index 6bb0e0af290..d6a55ba2aef 100644 --- a/projects/hipcub/benchmark/benchmark_device_adjacent_difference.cpp +++ b/projects/hipcub/benchmark/benchmark_device_adjacent_difference.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_device_histogram.cpp b/projects/hipcub/benchmark/benchmark_device_histogram.cpp index f724db54389..d8ff79734b2 100644 --- a/projects/hipcub/benchmark/benchmark_device_histogram.cpp +++ b/projects/hipcub/benchmark/benchmark_device_histogram.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_device_memory.cpp b/projects/hipcub/benchmark/benchmark_device_memory.cpp index 4b71b4de749..fed54b86270 100644 --- a/projects/hipcub/benchmark/benchmark_device_memory.cpp +++ b/projects/hipcub/benchmark/benchmark_device_memory.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp b/projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp index 7cdf1f0501a..6a11732e1ff 100644 --- a/projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp +++ b/projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_device_run_length_encode.cpp b/projects/hipcub/benchmark/benchmark_device_run_length_encode.cpp index e8a21bf497a..e3e24e56cca 100644 --- a/projects/hipcub/benchmark/benchmark_device_run_length_encode.cpp +++ b/projects/hipcub/benchmark/benchmark_device_run_length_encode.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_utils.hpp b/projects/hipcub/benchmark/benchmark_utils.hpp index 2902dd08672..ea1532939d7 100644 --- a/projects/hipcub/benchmark/benchmark_utils.hpp +++ b/projects/hipcub/benchmark/benchmark_utils.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_warp_exchange.cpp b/projects/hipcub/benchmark/benchmark_warp_exchange.cpp index 3b40b17884b..3da97a063cc 100644 --- a/projects/hipcub/benchmark/benchmark_warp_exchange.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_exchange.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_warp_load.cpp b/projects/hipcub/benchmark/benchmark_warp_load.cpp index 4b05bab8ec5..95525f335b1 100644 --- a/projects/hipcub/benchmark/benchmark_warp_load.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_load.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_warp_merge_sort.cpp b/projects/hipcub/benchmark/benchmark_warp_merge_sort.cpp index cf823609a5f..ba35c1c0e43 100644 --- a/projects/hipcub/benchmark/benchmark_warp_merge_sort.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_merge_sort.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_warp_reduce.cpp b/projects/hipcub/benchmark/benchmark_warp_reduce.cpp index e18743243f7..870ebb0f373 100644 --- a/projects/hipcub/benchmark/benchmark_warp_reduce.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_reduce.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_warp_scan.cpp b/projects/hipcub/benchmark/benchmark_warp_scan.cpp index d1d9b778aa9..35efedd3995 100644 --- a/projects/hipcub/benchmark/benchmark_warp_scan.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_scan.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/benchmark_warp_store.cpp b/projects/hipcub/benchmark/benchmark_warp_store.cpp index 8c63ec5489a..8d846b46009 100644 --- a/projects/hipcub/benchmark/benchmark_warp_store.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_store.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/benchmark/common_benchmark_header.hpp b/projects/hipcub/benchmark/common_benchmark_header.hpp index a0c0469645c..093a0079ef1 100644 --- a/projects/hipcub/benchmark/common_benchmark_header.hpp +++ b/projects/hipcub/benchmark/common_benchmark_header.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/cmake/Dependencies.cmake b/projects/hipcub/cmake/Dependencies.cmake index c5c88c29f47..e5d758b30a7 100644 --- a/projects/hipcub/cmake/Dependencies.cmake +++ b/projects/hipcub/cmake/Dependencies.cmake @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/cmake/SetupNVCC.cmake b/projects/hipcub/cmake/SetupNVCC.cmake index 88f520f20a7..48b59eb2f40 100644 --- a/projects/hipcub/cmake/SetupNVCC.cmake +++ b/projects/hipcub/cmake/SetupNVCC.cmake @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2018-2026 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/examples/example_utils.hpp b/projects/hipcub/examples/example_utils.hpp index 9c0cb8e06d5..ef6ea06bc3b 100644 --- a/projects/hipcub/examples/example_utils.hpp +++ b/projects/hipcub/examples/example_utils.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021-2024, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2021-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/agent/single_pass_scan_operators.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/agent/single_pass_scan_operators.hpp index 03a98e16b1c..f0554671560 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/agent/single_pass_scan_operators.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/agent/single_pass_scan_operators.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp index 6c864f406df..29d08eda201 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2022-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2022-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_copy.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_copy.hpp index b7f9d56fec2..64f22a982d5 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_copy.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_copy.hpp @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_for.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_for.hpp index cbc159f97b2..7bc48bafdde 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_for.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_for.hpp @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_histogram.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_histogram.hpp index 995d3b61303..5b18cac7754 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_histogram.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_histogram.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp index 40a643a4047..8c4f873c966 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2023-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge.hpp index 6bd57bef5e8..61c3b2dde59 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge.hpp @@ -1,6 +1,6 @@ /****************************************************************************** - * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2025-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge_sort.hpp index eabc2a64194..c047b61beae 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge_sort.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_partition.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_partition.hpp index a347cd09b82..b7a6f12f99a 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_partition.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_partition.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_radix_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_radix_sort.hpp index 91129eacea8..6a71f472af2 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_radix_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_radix_sort.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp index 6dd0b93ebb2..4f3a95baf1c 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_run_length_encode.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_run_length_encode.hpp index b23721084ae..dbfae1b63b4 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_run_length_encode.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_run_length_encode.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp index 2751c7efb5f..31f5c53a89d 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_radix_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_radix_sort.hpp index a5c74bf2ac6..8d4807eb72d 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_radix_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_radix_sort.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_reduce.hpp index 1edb929c8b0..942643b3bfd 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_reduce.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp index 718788bda0e..5b193d15b7c 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_segmented_sort.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_select.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_select.hpp index 5d9ee18ec64..87703809e4f 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_select.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_select.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/hipcub.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/hipcub.hpp index 3a35a7a63cf..299d843a7bd 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/hipcub.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/hipcub.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_macro.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_macro.hpp index ef7f734121c..2489e3af162 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_macro.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_macro.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_temporary_storage.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_temporary_storage.hpp index 3c2a84485c4..daf1e265b1d 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_temporary_storage.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_temporary_storage.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/agent/single_pass_scan_operators.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/agent/single_pass_scan_operators.hpp index 691fd5bf322..59060b602f8 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/agent/single_pass_scan_operators.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/agent/single_pass_scan_operators.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_merge_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_merge_sort.hpp index ae9b89c1aeb..29d29dee939 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_merge_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_merge_sort.hpp @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. -* Modifications Copyright (c) 2021-2025, Advanced Micro Devices, Inc. All rights reserved. +* Modifications Copyright (c) 2021-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_radix_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_radix_sort.hpp index e997a202b97..b4ad51da814 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_radix_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_radix_sort.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp index a2ca04740bc..39fe245f8b9 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2021-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp index b7b17ec419e..74c0fd9df64 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2021-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp index fc87cec8dda..13ec5bf386c 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2022-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2022-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_copy.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_copy.hpp index 7c4b30b6117..68f2f2744ec 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_copy.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_copy.hpp @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_for.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_for.hpp index ec156316c8f..fe9a1f554eb 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_for.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_for.hpp @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp index 8f9ef71e757..ab51c2e02ac 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp index 0fb0c6b7a44..36dac611cff 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2023-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge.hpp index b4e7e0762f3..dc80f72a249 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge.hpp @@ -1,6 +1,6 @@ /****************************************************************************** - * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2025-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge_sort.hpp index c3da0ca00f7..5bf1e08e9e9 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_merge_sort.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_partition.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_partition.hpp index 8d6a0094705..cf50fe27416 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_partition.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_partition.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp index a9ef8480e54..0c5ad32fc97 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp index 4262e26b369..c29881a7de6 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp index 9485b7e6338..88a80c2dad6 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/grid/grid_even_share.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021-2024, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2021-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp index fdf3c4f24eb..d7ea5726334 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/arg_index_input_iterator.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/arg_index_input_iterator.hpp index 199aadc78fa..51754688bf2 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/arg_index_input_iterator.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/iterator/arg_index_input_iterator.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_load.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_load.hpp index c3b8f85e9d4..e06e14edc94 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_load.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_load.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2021-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp index 27d900de974..3413034a6a0 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_reduce.hpp index dadb0e8dd38..b9188933220 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_reduce.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_scan.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_scan.hpp index 526743c8724..ae32a2c8406 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_scan.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_scan.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2021-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_search.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_search.hpp index b77c8f85de4..6fca005dc50 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_search.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_search.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021-2024, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2021-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp index 7d8aaf5253e..bf3c6937b13 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_sort.hpp @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2021-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp index 0de68d5a555..32651a8408b 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2021-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp index 97b36a7654d..054811a078f 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_macro.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp index 3d8c49dea07..7675c74e9fd 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_sync.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_sync.hpp index 22235ea2af8..8a798b3ead6 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_sync.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_sync.hpp @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_temporary_storage.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_temporary_storage.hpp index a3cb419bbdf..f5bef3e3c4b 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_temporary_storage.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_temporary_storage.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp index 9267bb7bc35..c17208595c9 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_type.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2021-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/config.hpp b/projects/hipcub/hipcub/include/hipcub/config.hpp index 610b145078c..7285cd9e5b2 100644 --- a/projects/hipcub/hipcub/include/hipcub/config.hpp +++ b/projects/hipcub/hipcub/include/hipcub/config.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/hipcub/include/hipcub/libcxx.hpp b/projects/hipcub/hipcub/include/hipcub/libcxx.hpp index d141c8816d6..65ee721b5d0 100644 --- a/projects/hipcub/hipcub/include/hipcub/libcxx.hpp +++ b/projects/hipcub/hipcub/include/hipcub/libcxx.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/extra/CMakeLists.txt b/projects/hipcub/test/extra/CMakeLists.txt index 082c64a9bea..801ba1a5c0b 100644 --- a/projects/hipcub/test/extra/CMakeLists.txt +++ b/projects/hipcub/test/extra/CMakeLists.txt @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/CMakeLists.txt b/projects/hipcub/test/hipcub/CMakeLists.txt index 92b7eec2629..d69d1d855c5 100644 --- a/projects/hipcub/test/hipcub/CMakeLists.txt +++ b/projects/hipcub/test/hipcub/CMakeLists.txt @@ -1,5 +1,5 @@ # MIT License # -# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/bfloat16.hpp b/projects/hipcub/test/hipcub/bfloat16.hpp index 0de3b95a883..69c87cf0bd0 100644 --- a/projects/hipcub/test/hipcub/bfloat16.hpp +++ b/projects/hipcub/test/hipcub/bfloat16.hpp @@ -1,5 +1,6 @@ /****************************************************************************** * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/test/hipcub/common_test_header.hpp b/projects/hipcub/test/hipcub/common_test_header.hpp index fc04ec1fb4b..6be1b6a97a3 100755 --- a/projects/hipcub/test/hipcub/common_test_header.hpp +++ b/projects/hipcub/test/hipcub/common_test_header.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/half.hpp b/projects/hipcub/test/hipcub/half.hpp index 1a8b3db1cad..3ec682bc1d2 100644 --- a/projects/hipcub/test/hipcub/half.hpp +++ b/projects/hipcub/test/hipcub/half.hpp @@ -1,6 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/test/hipcub/single_index_iterator.hpp b/projects/hipcub/test/hipcub/single_index_iterator.hpp index eab3c83f005..2a61c43c65a 100644 --- a/projects/hipcub/test/hipcub/single_index_iterator.hpp +++ b/projects/hipcub/test/hipcub/single_index_iterator.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp index 3b0c204e7cd..6ca858bbcc5 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. 
All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp index 28765be3a9a..fa6cd68ed2d 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_histogram.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_histogram.cpp index dcef6ab3857..78f55bdd626 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_histogram.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_histogram.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.cpp index 2adfdc0e91b..04fbb2b522d 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.cpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. 
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.hpp b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.hpp index cbd5f203ed2..004e651bf19 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.hpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.kernels.hpp b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.kernels.hpp index 5e6192857a0..ce457f58046 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.kernels.hpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.kernels.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2019-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp index 9aab4a6e56f..acb78eb3556 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp index 56c8be53220..7a156732f9a 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp index 1f277c94c1b..0063718ca26 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2026 Advanced Micro Devices, Inc. 
All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_adjacent_difference.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_adjacent_difference.cpp index 7f04766fcbc..3a4443c1fbf 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_adjacent_difference.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_adjacent_difference.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp index ecd5d0da0a9..4c7dda78292 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp index 0a8de808c08..891d99a6841 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp index 80c01494feb..ec7b3f62fbd 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp index 8594d2ac5f6..1c14a196e9f 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.cpp.in b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.cpp.in index 7cac5df9e1e..2731de1cbc4 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.cpp.in +++ b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.cpp.in @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (c) 2022-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp index 183b30bc1e6..8b12185ca22 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp index 0ea81edd64e..45e6a409df5 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp index 894cf3871af..2a6f16a0881 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_run_length_encode.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_run_length_encode.cpp index d32210327b8..316a00943a1 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_run_length_encode.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_run_length_encode.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp index cc23baf7270..cd167e82510 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp index 2cbc11d35bc..dcce1b00103 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_sort.hpp b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_sort.hpp index 66d58468f64..4efa9524179 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_sort.hpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_sort.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp index 2e8ae3863f8..02c21990492 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_grid.cpp b/projects/hipcub/test/hipcub/test_hipcub_grid.cpp index 654bf550e32..be9c116ee56 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_grid.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_grid.cpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/test/hipcub/test_hipcub_iterators.cpp b/projects/hipcub/test/hipcub/test_hipcub_iterators.cpp index 62d579a6334..f0763ce03d7 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_iterators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_iterators.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_single_pass_scan_operators.cpp b/projects/hipcub/test/hipcub/test_hipcub_single_pass_scan_operators.cpp index f440e7f6f88..9aaf38cc002 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_single_pass_scan_operators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_single_pass_scan_operators.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp index 66750ccb471..9a0d0d0981a 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp b/projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp index a68cb5a09fb..5c59df72354 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp index 6fc5171b525..a01d6318a20 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_load.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_load.cpp index 7a0c30a96ca..f35adb743c9 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_load.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_load.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp index 24180e97f69..e87ebbe91d2 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_utils.hpp b/projects/hipcub/test/hipcub/test_utils.hpp index 961178fba18..b0674b64a31 100644 --- a/projects/hipcub/test/hipcub/test_utils.hpp +++ b/projects/hipcub/test/hipcub/test_utils.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_utils_assertions.hpp b/projects/hipcub/test/hipcub/test_utils_assertions.hpp index 57574f5229a..2b3838b5011 100644 --- a/projects/hipcub/test/hipcub/test_utils_assertions.hpp +++ b/projects/hipcub/test/hipcub/test_utils_assertions.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_utils_data_generation.hpp b/projects/hipcub/test/hipcub/test_utils_data_generation.hpp index 12026cd326d..e1228aec294 100644 --- a/projects/hipcub/test/hipcub/test_utils_data_generation.hpp +++ b/projects/hipcub/test/hipcub/test_utils_data_generation.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp b/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp index ce759c555e5..91782408cb9 100644 --- a/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp +++ b/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp b/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp index 335565e2dd9..bba4a328e80 100644 --- a/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp +++ b/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/projects/rocprim/rmake.py b/projects/rocprim/rmake.py index ae627487678..c1f25a4c2ba 100644 --- a/projects/rocprim/rmake.py +++ b/projects/rocprim/rmake.py @@ -1,5 +1,5 @@ #!/usr/bin/python3 -""" Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All rights reserved. +""" Copyright (c) 2021-2026 Advanced Micro Devices, Inc. All rights reserved. 
Manage build and installation""" import re From 9c4bf187397ba2211f44bc83ca291b10e1ef2ebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 26 Jan 2026 10:19:00 +0000 Subject: [PATCH 46/95] Update CHANGELOG --- projects/hipcub/CHANGELOG.md | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index 5111892ad6f..caac9b2da3d 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -17,17 +17,7 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project ### Added * Added `generate_resource_spec.cpp` to the test directory. It is now built as a new target by CMake. It generates the resource spec file required by CTest when running tests in parallel. - -### Removed - -* Removed `ConstantInputIterator`, `CountingInputIterator`, `DiscardOutputIterator` and `TransformInputIterator` which were deprecated in hipCUB-4.1.0. -* Removed `DeviceSpmv`, which was removed from CUB after CCCL's 2.8.0 release. Use `hipSPARSE` or `rocSPARSE` libraries instead. -* Removed `GridBarrier`. -* Removed `HIPCUB_MIN`, `HIPCUB_MAX`, `HIPCUB_QUOTIENT_FLOOR`, `HIPCUB_QUOTIENT_CEILING`, `HIPCUB_ROUND_UP_NEAREST` and `HIPCUB_ROUND_DOWN_NEAREST` which were deprecated in hipCUB-4.1.0. -* Removed `LEGACY_PTX_ARCH`. -* Removed `hipcub:max` and `hipcub:min`, which were deprecated. Use `hip::std::max` and `hip::std::min` instead. -* Deprecated `hipcub::Swap`, use `rocprim::swap` instead. -* Deprecated `HIPCUB_IS_INT128_ENABLED`, use `_CCCL_HAS_INT128()` instead. +* Added `::hip::std` support. ### Changed @@ -39,7 +29,16 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project ### Removed +* Removed `hipcub::BaseTraits::CATEGORY`, `hipcub::BaseTraits::nullptr_TYPE` and `hipcub::BaseTraits::PRIMITIVE`. 
+* Removed `ConstantInputIterator`, `CountingInputIterator`, `DiscardOutputIterator` and `TransformInputIterator` which were deprecated in hipCUB-4.1.0. +* Removed `DeviceSpmv`, which was removed from CUB after CCCL's 2.8.0 release. Use `hipSPARSE` or `rocSPARSE` libraries instead. * Removed the `GenerateResourceSpec.cmake` script - it is replaced by the added `generate_resource_spec.cpp` code mentioned above. +* Removed `GridBarrier`. +* Removed `HIPCUB_MIN`, `HIPCUB_MAX`, `HIPCUB_QUOTIENT_FLOOR`, `HIPCUB_QUOTIENT_CEILING`, `HIPCUB_ROUND_UP_NEAREST` and `HIPCUB_ROUND_DOWN_NEAREST` which were deprecated in hipCUB-4.1.0. +* Removed `LEGACY_PTX_ARCH`. +* Removed `hipcub::max` and `hipcub::min`, which were deprecated. Use `hip::std::max` and `hip::std::min` instead. +* Deprecated `hipcub::Swap`, use `rocprim::swap` instead. +* Deprecated `HIPCUB_IS_INT128_ENABLED`, use `_CCCL_HAS_INT128()` instead. ## hipCUB-4.2.0 for ROCm 7.2 From be22a9d3b3cb9d73191e1cac2ea42216ad3bbedd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 26 Jan 2026 12:24:04 +0000 Subject: [PATCH 47/95] Fix clang format --- projects/hipcub/benchmark/benchmark_utils.hpp | 12 ++- projects/hipcub/test/hipcub/bfloat16.hpp | 6 +- projects/hipcub/test/hipcub/half.hpp | 6 +- .../test/hipcub/single_index_iterator.hpp | 3 +- .../hipcub/test_hipcub_warp_merge_sort.cpp | 96 +++++++++---------- .../hipcub/test_utils_thread_operators.hpp | 10 +- 6 files changed, 69 insertions(+), 64 deletions(-) diff --git a/projects/hipcub/benchmark/benchmark_utils.hpp b/projects/hipcub/benchmark/benchmark_utils.hpp index ea1532939d7..1cb93a821b0 100644 --- a/projects/hipcub/benchmark/benchmark_utils.hpp +++ b/projects/hipcub/benchmark/benchmark_utils.hpp @@ -424,10 +424,14 @@ using engine_type = std::default_random_engine; // generate_random_data_n() generates only part of sequence and replicates it, // because benchmarks usually do not need "true" random sequence.
template -inline auto generate_random_data_n( - OutputIter it, size_t size, U min, V max, Generator& gen, size_t max_random_size = 1024 * 1024) - -> typename std::enable_if_t>::value, - OutputIter> +inline auto generate_random_data_n(OutputIter it, + size_t size, + U min, + V max, + Generator& gen, + size_t max_random_size = 1024 * 1024) -> + typename std::enable_if_t>::value, + OutputIter> { using T = ::hipcub::detail::it_value_t; diff --git a/projects/hipcub/test/hipcub/bfloat16.hpp b/projects/hipcub/test/hipcub/bfloat16.hpp index 69c87cf0bd0..6cf698a86b2 100644 --- a/projects/hipcub/test/hipcub/bfloat16.hpp +++ b/projects/hipcub/test/hipcub/bfloat16.hpp @@ -90,7 +90,8 @@ struct bfloat16_t } /// Constructor from size_t - __host__ __device__ __forceinline__ bfloat16_t(size_t a) + __host__ __device__ __forceinline__ + bfloat16_t(size_t a) { *this = bfloat16_t(float(a)); } @@ -100,7 +101,8 @@ struct bfloat16_t typename = typename std::enable_if && (!std::is_same_v)>::type> - __host__ __device__ __forceinline__ bfloat16_t(T a) + __host__ __device__ __forceinline__ + bfloat16_t(T a) { *this = bfloat16_t(float(a)); } diff --git a/projects/hipcub/test/hipcub/half.hpp b/projects/hipcub/test/hipcub/half.hpp index 3ec682bc1d2..f5d87dd785d 100644 --- a/projects/hipcub/test/hipcub/half.hpp +++ b/projects/hipcub/test/hipcub/half.hpp @@ -79,7 +79,8 @@ struct half_t } /// Constructor from size_t - __host__ __device__ __forceinline__ half_t(size_t a) + __host__ __device__ __forceinline__ + half_t(size_t a) { *this = half_t(float(a)); } @@ -89,7 +90,8 @@ struct half_t typename = typename std::enable_if && (!std::is_same_v)>::type> - __host__ __device__ __forceinline__ half_t(T a) + __host__ __device__ __forceinline__ + half_t(T a) { *this = half_t(float(a)); } diff --git a/projects/hipcub/test/hipcub/single_index_iterator.hpp b/projects/hipcub/test/hipcub/single_index_iterator.hpp index 2a61c43c65a..b33c73c5495 100644 --- a/projects/hipcub/test/hipcub/single_index_iterator.hpp 
+++ b/projects/hipcub/test/hipcub/single_index_iterator.hpp @@ -52,8 +52,7 @@ class single_index_iterator // Implicit conversion for read access HIPCUB_HOST_DEVICE - inline - operator T() const + inline operator T() const { return *value_; } diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp index e87ebbe91d2..1d0b7a888e7 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp @@ -399,15 +399,15 @@ TYPED_TEST(HipcubWarpMergeSort, SortKeysSegmented) auto keys = test_utils::is_floating_point::value ? test_utils::get_random_data( - size, - test_utils::convert_to_device(-1000), - test_utils::convert_to_device(1000), - seed_value) + size, + test_utils::convert_to_device(-1000), + test_utils::convert_to_device(1000), + seed_value) : test_utils::get_random_data( - size, - _HIPCUB_STD::numeric_limits::lowest(), - _HIPCUB_STD::numeric_limits::max(), - seed_value); + size, + _HIPCUB_STD::numeric_limits::lowest(), + _HIPCUB_STD::numeric_limits::max(), + seed_value); const auto segment_sizes = test_utils::get_random_data( num_warps, 0u, max_segment_size, ~seed_value); @@ -512,29 +512,29 @@ TYPED_TEST(HipcubWarpMergeSort, SortKeysValuesSegmented) auto keys = test_utils::is_floating_point::value ? test_utils::get_random_data( - size, - test_utils::convert_to_device(-1000), - test_utils::convert_to_device(1000), - seed_value) + size, + test_utils::convert_to_device(-1000), + test_utils::convert_to_device(1000), + seed_value) : test_utils::get_random_data( - size, - _HIPCUB_STD::numeric_limits::lowest(), - _HIPCUB_STD::numeric_limits::max(), - seed_value); + size, + _HIPCUB_STD::numeric_limits::lowest(), + _HIPCUB_STD::numeric_limits::max(), + seed_value); using value_wrapped_type = typename test_utils::inner_type::type; auto values = test_utils::is_floating_point::value ? 
test_utils::get_random_data( - size, - test_utils::convert_to_device(-1000), - test_utils::convert_to_device(1000), - seed_value) + size, + test_utils::convert_to_device(-1000), + test_utils::convert_to_device(1000), + seed_value) : test_utils::get_random_data( - size, - _HIPCUB_STD::numeric_limits::lowest(), - _HIPCUB_STD::numeric_limits::max(), - seed_value ^ (seed_value >> 1ul)); + size, + _HIPCUB_STD::numeric_limits::lowest(), + _HIPCUB_STD::numeric_limits::max(), + seed_value ^ (seed_value >> 1ul)); const auto segment_sizes = test_utils::get_random_data( num_warps, 0u, max_segment_size, ~seed_value); @@ -660,15 +660,15 @@ TYPED_TEST(HipcubWarpMergeSort, SortKeys) auto keys = test_utils::is_floating_point::value ? test_utils::get_random_data( - size, - test_utils::convert_to_device(-1000), - test_utils::convert_to_device(1000), - seed_value) + size, + test_utils::convert_to_device(-1000), + test_utils::convert_to_device(1000), + seed_value) : test_utils::get_random_data( - size, - _HIPCUB_STD::numeric_limits::lowest(), - _HIPCUB_STD::numeric_limits::max(), - seed_value); + size, + _HIPCUB_STD::numeric_limits::lowest(), + _HIPCUB_STD::numeric_limits::max(), + seed_value); const auto compare = typename params::compare_function{}; @@ -759,29 +759,29 @@ TYPED_TEST(HipcubWarpMergeSort, SortKeysValues) auto keys = test_utils::is_floating_point::value ? test_utils::get_random_data( - size, - test_utils::convert_to_device(-1000), - test_utils::convert_to_device(1000), - seed_value) + size, + test_utils::convert_to_device(-1000), + test_utils::convert_to_device(1000), + seed_value) : test_utils::get_random_data( - size, - _HIPCUB_STD::numeric_limits::lowest(), - _HIPCUB_STD::numeric_limits::max(), - seed_value); + size, + _HIPCUB_STD::numeric_limits::lowest(), + _HIPCUB_STD::numeric_limits::max(), + seed_value); using value_wrapped_type = typename test_utils::inner_type::type; auto values = test_utils::is_floating_point::value ? 
test_utils::get_random_data( - size, - test_utils::convert_to_device(-1000), - test_utils::convert_to_device(1000), - seed_value) + size, + test_utils::convert_to_device(-1000), + test_utils::convert_to_device(1000), + seed_value) : test_utils::get_random_data( - size, - _HIPCUB_STD::numeric_limits::lowest(), - _HIPCUB_STD::numeric_limits::max(), - seed_value ^ (seed_value >> 1ul)); + size, + _HIPCUB_STD::numeric_limits::lowest(), + _HIPCUB_STD::numeric_limits::max(), + seed_value ^ (seed_value >> 1ul)); const auto compare = typename params::compare_function{}; diff --git a/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp b/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp index bba4a328e80..1ae57cbbd47 100644 --- a/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp +++ b/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp @@ -160,9 +160,8 @@ struct ArgMax bool> = true> HIPCUB_HOST_DEVICE __forceinline__ - hipcub::KeyValuePair - operator()(const hipcub::KeyValuePair& a, - const hipcub::KeyValuePair& b) const + hipcub::KeyValuePair operator()(const hipcub::KeyValuePair& a, + const hipcub::KeyValuePair& b) const { const hipcub::KeyValuePair native_a(a.key, a.value); const hipcub::KeyValuePair native_b(b.key, b.value); @@ -185,9 +184,8 @@ struct ArgMin bool> = true> HIPCUB_HOST_DEVICE __forceinline__ - hipcub::KeyValuePair - operator()(const hipcub::KeyValuePair& a, - const hipcub::KeyValuePair& b) const + hipcub::KeyValuePair operator()(const hipcub::KeyValuePair& a, + const hipcub::KeyValuePair& b) const { const hipcub::KeyValuePair native_a(a.key, a.value); const hipcub::KeyValuePair native_b(b.key, b.value); From 16944ff3fa7acf042a4c5f00e288133cf1f5eb19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 30 Oct 2025 16:09:14 +0000 Subject: [PATCH 48/95] Update counting, discard and transform iterator CUB backend for tests --- .../hipcub/test_hipcub_block_exchange.cpp | 10 +- 
.../hipcub/test_hipcub_block_load_store.hpp | 2 - ...test_hipcub_device_adjacent_difference.cpp | 2 +- .../test/hipcub/test_hipcub_device_for.cpp | 23 ++--- .../hipcub/test_hipcub_device_histogram.cpp | 12 ++- .../test/hipcub/test_hipcub_device_merge.cpp | 4 +- .../test/hipcub/test_hipcub_device_reduce.cpp | 6 +- .../test/hipcub/test_hipcub_device_scan.cpp | 20 ++-- .../test_hipcub_device_segmented_reduce.cpp | 2 +- .../test/hipcub/test_hipcub_device_select.cpp | 18 ++-- .../test/hipcub/test_hipcub_iterators.cpp | 13 +-- .../hipcub/test/hipcub/test_hipcub_thread.cpp | 14 ++- .../hipcub/test_hipcub_thread_operators.cpp | 3 +- .../test/hipcub/test_hipcub_util_ptx.cpp | 14 ++- projects/hipcub/test/hipcub/test_utils.hpp | 95 ++++++++++++++++++- .../test/hipcub/test_utils_assertions.hpp | 10 +- 16 files changed, 183 insertions(+), 65 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_exchange.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_exchange.cpp index afa6f4c6265..4db4d9ea137 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_exchange.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_exchange.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -908,7 +908,7 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToStripedGuarded) const size_t i0 = offset + ti * items_per_thread + ii; const size_t i1 = offset + host_ranks[i0] % block_size * items_per_thread + host_ranks[i0] / block_size; - if(i1 >= 0 && i1 < size) + if(i1 < size) host_expected[i1] = host_input[i0]; } } @@ -1025,7 +1025,7 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToStripedFlagged) const size_t i0 = offset + ti * items_per_thread + ii; const size_t i1 = offset + host_ranks[i0] % block_size * items_per_thread + host_ranks[i0] / block_size; - if(i1 >= 0 && i1 < size) + if(i1 < size) host_expected[i1] = host_input[i0]; host_flags[i0] = (ti == block_size - 1) && (ii == items_per_thread - 1) ? false : true; @@ -1818,7 +1818,7 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToStripedGuardedNoOutputParam) const size_t i0 = offset + ti * items_per_thread + ii; const size_t i1 = offset + host_ranks[i0] % block_size * items_per_thread + host_ranks[i0] / block_size; - if(i1 >= 0 && i1 < size) + if(i1 < size) host_expected[i1] = host_input[i0]; } } @@ -1930,7 +1930,7 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToStripedFlaggedNoOutputParam) const size_t i0 = offset + ti * items_per_thread + ii; const size_t i1 = offset + host_ranks[i0] % block_size * items_per_thread + host_ranks[i0] / block_size; - if(i1 >= 0 && i1 < size) + if(i1 < size) host_expected[i1] = host_input[i0]; host_flags[i0] = (ti == block_size - 1) && (ii == items_per_thread - 1) ? 
false : true; diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.hpp b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.hpp index 004e651bf19..bd39aeb70d9 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_load_store.hpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_load_store.hpp @@ -20,8 +20,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include - test_suite_type_def(suite_name, name_suffix) typed_test_suite_def(HipcubBlockLoadStoreTests, name_suffix, load_store_params); diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_adjacent_difference.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_adjacent_difference.cpp index 3a4443c1fbf..a071b676abe 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_adjacent_difference.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_adjacent_difference.cpp @@ -478,7 +478,7 @@ TYPED_TEST(HipcubDeviceAdjacentDifferenceLargeTests, LargeIndicesAndOpOnce) HIP_CHECK(hipMemset(d_counter, 0, sizeof(*d_counter))); OutputIterator output(d_incorrect_flag, d_counter); - const auto input = rocprim::counting_iterator(T{0}); + const auto input = test_utils::counting_iterator(T{0}); static constexpr auto left_tag = std::integral_constant{}; static constexpr auto copy_tag = std::integral_constant{}; diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp index 4c7dda78292..36840d4aa86 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp @@ -691,7 +691,7 @@ TEST(HipcubDeviceForTests, ForCountingIterator) // Device pointers unsigned int* d_count; - const auto it = rocprim::counting_iterator{0}; + const auto it = test_utils::counting_iterator{0}; // Allocate memory HIP_CHECK(test_common_utils::hipMallocHelper(&d_count, sizeof(unsigned int))); @@ -744,7 +744,7 @@ TEST(HipcubDeviceForTests, 
ForCopyCountingIterator) // Device pointers unsigned int* d_count; - const auto it = rocprim::counting_iterator{0}; + const auto it = test_utils::counting_iterator{0}; // Allocate memory HIP_CHECK(test_common_utils::hipMallocHelper(&d_count, sizeof(unsigned int))); @@ -896,11 +896,11 @@ struct HipcubDeviceForEachInExtentsTests : public ::testing::Test template using HipcubDeviceForEachInExtentsParamGenerator - = ::testing::Types>, - DeviceForEachInExtentsParams<::hipcub::extents>, - DeviceForEachInExtentsParams<::hipcub::extents>, - DeviceForEachInExtentsParams<::hipcub::extents>, - DeviceForEachInExtentsParams<::hipcub::extents>>; + = ::testing::Types>, + DeviceForEachInExtentsParams<::test_utils::extents>, + DeviceForEachInExtentsParams<::test_utils::extents>, + DeviceForEachInExtentsParams<::test_utils::extents>, + DeviceForEachInExtentsParams<::test_utils::extents>>; using HipcubDeviceForEachInExtentsTestsParams = typename HipcubTestParamsMergeAll< HipcubDeviceForEachInExtentsParamGenerator, @@ -940,7 +940,8 @@ template< } template -inline void fill_linear(std::vector& vector, const ::hipcub::extents& ext) +inline void fill_linear(std::vector& vector, + const ::test_utils::extents& ext) { size_t pos = 0; fill_linear_impl(vector, ext, pos); @@ -973,8 +974,8 @@ TEST(HipcubDeviceForEachInExtentsTests, ForEachInExtentsAPI) using item_t = int; using data_t = std::array; - using extents_type = hipcub::extents; - constexpr auto extents_size = hipcub::extents_size::value; + using extents_type = test_utils::extents; + constexpr auto extents_size = test_utils::extents_size::value; constexpr auto memory_size = extents_size * sizeof(data_t); constexpr extents_type ext{}; @@ -1032,7 +1033,7 @@ TYPED_TEST(HipcubDeviceForEachInExtentsTests, ForEachInExtentsStatic) using item_t = index_type; using data_t = std::array; - constexpr auto extents_size = hipcub::extents_size::value; + constexpr auto extents_size = test_utils::extents_size::value; constexpr auto memory_size = 
extents_size * sizeof(data_t); constexpr auto rank = extents_type::rank(); using store_op_t = LinearStore; diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp index 891d99a6841..55bb120c22f 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp @@ -390,7 +390,7 @@ TYPED_TEST(HipcubDeviceHistogramEvenOverflow, EvenOverflow) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - auto d_input = rocprim::counting_iterator(0); + auto d_input = test_utils::counting_iterator{0UL}; counter_type* d_histogram; HIP_CHECK(test_common_utils::hipMallocHelper(&d_histogram, bins * sizeof(counter_type))); @@ -883,8 +883,9 @@ TYPED_TEST(HipcubDeviceHistogramMultiEven, MultiEven) } } } - rocprim::transform_iterator, sample_type> - d_input2(d_input, transform_op()); + test_utils::transform_iterator> d_input2( + d_input, + transform_op()); size_t temporary_storage_bytes = 0; if(rows == 1) { @@ -1213,8 +1214,9 @@ TYPED_TEST(HipcubDeviceHistogramMultiRange, MultiRange) } } } - rocprim::transform_iterator, sample_type> - d_input2(d_input, transform_op()); + test_utils::transform_iterator> d_input2( + d_input, + transform_op()); size_t temporary_storage_bytes = 0; if(rows == 1) { diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp index 1c14a196e9f..08a006b247c 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp @@ -495,9 +495,9 @@ TEST(HipcubDeviceMerge, MergeLargeSizeIterators) compare_function compare_op; // Generate data - const auto input1 = rocprim::counting_iterator(key_type{0}); + const auto input1 = test_utils::counting_iterator(key_type{0}); const auto input2 - = rocprim::counting_iterator(key_type{static_cast(size1)}); + = 
test_utils::counting_iterator(key_type{static_cast(size1)}); std::vector vec_input1(size1); std::vector vec_input2(size2); std::iota(vec_input1.begin(), vec_input1.end(), 0); diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp index 45e6a409df5..03362ec987a 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp @@ -685,8 +685,8 @@ void test_argminmax2(typename TestFixture::input_type empty_value) using T = typename TestFixture::input_type; using Iterator = typename hipcub::ArgIndexInputIterator; using argidx_type = typename Iterator::value_type; - using extremum_type = typename argidx_type::value_type; - using index_type = typename argidx_type::key_type; + using extremum_type = decltype(std::declval().value); + using index_type = decltype(std::declval().key); DispatchFunction function; @@ -1159,7 +1159,7 @@ TYPED_TEST(HipcubDeviceReduceLargeIndicesTests, LargeIndices) using T = typename TestFixture::input_type; using U = typename TestFixture::output_type; - using IteratorType = rocprim::constant_iterator; + using IteratorType = test_utils::constant_iterator; const std::vector exponents = {30, 31, 32, 33, 34}; for(auto exponent : exponents) { diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp index cd167e82510..d4dff6505eb 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp @@ -136,7 +136,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScan) // use float as device-side accumulator and double as host-side accumulator using is_add_op = test_utils::is_add_operator; using acc_type = typename accum_type::type; - using IteratorType = rocprim::transform_iterator, acc_type>; + using IteratorType = test_utils::transform_iterator>; constexpr bool inplace = 
std::is_same_v && std::is_same_v; // for non-associative operations in inclusive scan @@ -328,7 +328,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScanInit) // use float as device-side accumulator and double as host-side accumulator using is_add_op = test_utils::is_add_operator; using acc_type = typename accum_type::type; - using IteratorType = rocprim::transform_iterator, acc_type>; + using IteratorType = test_utils::transform_iterator>; constexpr bool inplace = std::is_same_v && std::is_same_v; // for non-associative operations in inclusive scan @@ -512,7 +512,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScanByKey) // use float as device-side accumulator and double as host-side accumulator using is_add_op = test_utils::is_add_operator; using acc_type = typename accum_type::type; - using IteratorType = rocprim::transform_iterator, acc_type>; + using IteratorType = test_utils::transform_iterator>; // for non-associative operations in inclusive scan // intermediate results use the type of input iterator, then // as all conversions in the tests are to more precise types, @@ -697,7 +697,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScan) // use float as device-side accumulator and double as host-side accumulator using is_add_op = test_utils::is_add_operator; using acc_type = typename accum_type::type; - using IteratorType = rocprim::transform_iterator, acc_type>; + using IteratorType = test_utils::transform_iterator>; constexpr bool inplace = std::is_same_v && std::is_same_v; // for non-associative operations in inclusive scan @@ -902,7 +902,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScanByKey) // use float as device-side accumulator and double as host-side accumulator using is_add_op = test_utils::is_add_operator; using acc_type = typename accum_type::type; - using IteratorType = rocprim::transform_iterator, acc_type>; + using IteratorType = test_utils::transform_iterator>; // for non-associative operations in inclusive scan // intermediate results use 
the type of input iterator, then // as all conversions in the tests are to more precise types, @@ -1090,7 +1090,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScanByKey) TEST(HipcubDeviceScanTests, LargeIndicesInclusiveScan) { using T = unsigned int; - using InputIterator = rocprim::counting_iterator; + using InputIterator = test_utils::counting_iterator; using OutputIterator = test_utils::single_index_iterator; const size_t size = (1ul << 31) + 1ul; @@ -1100,7 +1100,7 @@ TEST(HipcubDeviceScanTests, LargeIndicesInclusiveScan) unsigned int seed_value = rand(); SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); - // Create rocprim::counting_iterator with random starting point + // Create test_utils::counting_iterator with random starting point InputIterator input_begin(test_utils::get_random_value(0, 200, seed_value)); T* d_output; @@ -1160,7 +1160,7 @@ TEST(HipcubDeviceScanTests, LargeIndicesInclusiveScan) TEST(HipcubDeviceScanTests, LargeIndicesExclusiveScan) { using T = unsigned int; - using InputIterator = rocprim::counting_iterator; + using InputIterator = test_utils::counting_iterator; using OutputIterator = test_utils::single_index_iterator; const size_t size = (1ul << 31) + 1ul; @@ -1170,7 +1170,7 @@ TEST(HipcubDeviceScanTests, LargeIndicesExclusiveScan) unsigned int seed_value = rand(); SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); - // Create rocprim::counting_iterator with random starting point + // Create test_utils::counting_iterator with random starting point InputIterator input_begin(test_utils::get_random_value(0, 200, seed_value)); T initial_value = test_utils::get_random_value(1, 10, seed_value); @@ -1255,7 +1255,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScanFuture) // use float as device-side accumulator and double as host-side accumulator using is_add_op = test_utils::is_add_operator; using acc_type = typename accum_type::type; - using IteratorType = rocprim::transform_iterator, acc_type>; + using 
IteratorType = test_utils::transform_iterator>; // for non-associative operations in inclusive scan // intermediate results use the type of input iterator, then // as all conversions in the tests are to more precise types, diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp index dcce1b00103..5470629ed7c 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp @@ -1143,7 +1143,7 @@ TEST(HipcubDeviceSegmentedReduceLargeIndicesTests, LargeIndices) using T = size_t; using input_type = T; using output_type = T; - using IteratorType = rocprim::counting_iterator; + using IteratorType = test_utils::counting_iterator; using reduce_op_type = typename hipcub::Sum; using offset_type = T; diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp index 02c21990492..4fec0539858 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp @@ -242,8 +242,8 @@ TEST(HipcubDeviceSelectTests, FlagNormalization) for(size_t size : test_utils::get_sizes(seed_value)) { SCOPED_TRACE(testing::Message() << "with size= " << size); - rocprim::counting_iterator d_input(0); - rocprim::counting_iterator d_flags(1); + test_utils::counting_iterator d_input(0); + test_utils::counting_iterator d_flags(1); U* d_output; unsigned int* d_selected_count_output; @@ -805,9 +805,11 @@ TEST(HipcubDeviceSelectTests, UniqueDiscardOutputIterator) for(size_t size : test_utils::get_sizes(seed_value)) { SCOPED_TRACE(testing::Message() << "with size= " << size); - rocprim::counting_iterator d_input(0); - rocprim::discard_iterator d_output; - size_t* d_selected_count_output; + test_utils::counting_iterator d_input(0); + + auto d_output = test_utils::make_discard_iterator(); + + 
size_t* d_selected_count_output; HIP_CHECK(test_common_utils::hipMallocHelper((&d_selected_count_output), sizeof(size_t))); @@ -899,7 +901,7 @@ TEST_P(HipcubDeviceSelectLargeIndicesTests, LargeIndicesSelectOp) #endif // Generate data - rocprim::counting_iterator d_input(0); + test_utils::counting_iterator d_input(0); U* d_output; selected_count_type* d_selected_count_output; selected_count_type expected_output_size = selected_size; @@ -1239,8 +1241,8 @@ TEST(HipcubDeviceUniqueByKeyTests, LargeIndicesUniqueByKey) = (size + TestUniqueEqualityOp::segment - 1) / TestUniqueEqualityOp::segment; const size_t output_index = selected_count - 1; const size_t input_index = output_index * TestUniqueEqualityOp::segment; - rocprim::counting_iterator d_keys_input(0); - rocprim::counting_iterator d_values_input(123); + test_utils::counting_iterator d_keys_input(0); + test_utils::counting_iterator d_values_input(123); key_type* d_keys_output; value_type* d_values_output; HIP_CHECK(test_common_utils::hipMallocHelper(&d_keys_output, sizeof(*d_keys_output))); diff --git a/projects/hipcub/test/hipcub/test_hipcub_iterators.cpp b/projects/hipcub/test/hipcub/test_hipcub_iterators.cpp index f0763ce03d7..eb46de4f09a 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_iterators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_iterators.cpp @@ -230,7 +230,7 @@ TYPED_TEST(HipcubIteratorTests, TestConstant) HIP_CHECK(hipSetDevice(device_id)); using T = typename TestFixture::input_type; - using IteratorType = rocprim::constant_iterator; + using IteratorType = test_utils::constant_iterator; constexpr uint32_t array_size = 8; std::vector h_reference(array_size); @@ -257,7 +257,7 @@ TYPED_TEST(HipcubIteratorTests, TestCounting) HIP_CHECK(hipSetDevice(device_id)); using T = typename TestFixture::input_type; - using IteratorType = rocprim::counting_iterator; + using IteratorType = test_utils::counting_iterator; constexpr uint32_t array_size = 8; std::vector h_reference(array_size); @@ -289,7 +289,7 
@@ TYPED_TEST(HipcubIteratorTests, TestTransform) using T = typename TestFixture::input_type; using CastT = typename TestFixture::input_type; - using IteratorType = rocprim::transform_iterator, T>; + using IteratorType = test_utils::transform_iterator>; constexpr int TEST_VALUES = 11000; std::vector h_data(TEST_VALUES); @@ -537,11 +537,12 @@ TYPED_TEST(HipcubIteratorTests, TestTexTransform) HIP_CHECK(d_tex_itr.BindTexture(d_data, sizeof(T) * TEST_VALUES)); // Create transform iterator - rocprim::transform_iterator, T> xform_itr(d_tex_itr, + test_utils::transform_iterator> xform_itr(d_tex_itr, op); - iterator_test_function, T>, - T>(xform_itr, h_reference); + iterator_test_function>>( + xform_itr, + h_reference); HIP_CHECK(g_allocator.DeviceFree(d_data)); } } diff --git a/projects/hipcub/test/hipcub/test_hipcub_thread.cpp b/projects/hipcub/test/hipcub/test_hipcub_thread.cpp index d47f85746e6..0cc347d7aef 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_thread.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_thread.cpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -484,8 +484,16 @@ void thread_reduce_kernel(Type* const device_input, Type* device_output) { size_t input_index = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * Length; size_t output_index = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * Length; - device_output[output_index] - = hipcub::ThreadReduce(&device_input[input_index], sum_op()); + + // Load into a local array + Type values[Length]; +#pragma unroll + for(int i = 0; i < Length; i++) + { + values[i] = device_input[input_index + i]; + } + + device_output[output_index] = hipcub::ThreadReduce(values, sum_op()); } TYPED_TEST(HipcubThreadOperationTests, Reduction) diff --git a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp index 9a0d0d0981a..1fa95f64f47 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp @@ -702,7 +702,8 @@ TYPED_TEST(HipcubNCThreadOperatorsTests, CastOp) using input_type = typename TestFixture::input_type; using output_type = typename TestFixture::output_type; using IteratorType - = rocprim::transform_iterator, output_type>; + = test_utils::transform_iterator, output_type>; + const std::vector sizes = get_sizes(); for(auto input_size : sizes) { diff --git a/projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp b/projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp index 5c59df72354..26ce3faa6a7 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp @@ -671,7 +671,11 @@ __global__ void warp_id_kernel(unsigned int* output) { const unsigned int index = (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; - output[index] = ::rocprim::warp_id(); +#ifdef __HIP_PLATFORM_NVIDIA__ + output[index] = hipThreadIdx_x / 
warpSize; +#else + output[index] = ::rocprim::warp_id(); +#endif } TEST(HipcubUtilPtxTests, WarpId) @@ -755,7 +759,11 @@ template HIPCUB_DEVICE std::enable_if_t<(HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize), TestStatus> test_warp_mask_pow_two() { +#ifdef __HIP_PLATFORM_NVIDIA__ + const unsigned int logical_warp_id = (hipThreadIdx_x % warpSize) / LogicalWarpSize; +#else const unsigned int logical_warp_id = ::rocprim::lane_id() / LogicalWarpSize; +#endif const uint64_t mask = hipcub::WarpMask(logical_warp_id); const unsigned int warp_start = logical_warp_id * LogicalWarpSize; @@ -796,7 +804,11 @@ template HIPCUB_DEVICE std::enable_if_t<(HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize), TestStatus> test_warp_mask_non_pow_two() { +#ifdef __HIP_PLATFORM_NVIDIA__ + const unsigned int logical_warp_id = (hipThreadIdx_x % warpSize) / LogicalWarpSize; +#else const unsigned int logical_warp_id = ::rocprim::lane_id() / LogicalWarpSize; +#endif const uint64_t mask = hipcub::WarpMask(logical_warp_id); for(unsigned int lane = 0; lane < LogicalWarpSize; ++lane) diff --git a/projects/hipcub/test/hipcub/test_utils.hpp b/projects/hipcub/test/hipcub/test_utils.hpp index b0674b64a31..89cdb8cf0a8 100644 --- a/projects/hipcub/test/hipcub/test_utils.hpp +++ b/projects/hipcub/test/hipcub/test_utils.hpp @@ -27,10 +27,20 @@ // hipCUB API #ifdef __HIP_PLATFORM_AMD__ - #include + #include + #include + #include + #include + #include #elif defined(__HIP_PLATFORM_NVIDIA__) + #include + #include + #include #include - #include + #include + #include + #include + #include #endif #include "test_utils_assertions.hpp" @@ -45,6 +55,8 @@ // Seed values #include "test_seed.hpp" +#include +#include #include namespace test_utils @@ -620,6 +632,85 @@ inline constexpr auto ceiling_div(const T a, const U b) return a / b + (a % b > 0 ? 
1 : 0); } +#if defined(__HIP_PLATFORM_AMD__) + +template +using extents = ::hipcub::extents; + +template +struct extents_size; + +template +struct extents_size> +{ + static constexpr std::size_t value = (Dims * ... * 1); +}; + +template +using constant_iterator = ::rocprim::constant_iterator; + +template +using counting_iterator = ::rocprim::counting_iterator; + +template> +using transform_iterator = ::rocprim::transform_iterator; + +struct discard_iterator : public ::rocprim::discard_iterator +{ + using base_type = ::rocprim::discard_iterator; + using value_type = void; + using difference_type = std::ptrdiff_t; + using iterator_category = std::random_access_iterator_tag; + + using base_type::base_type; + + discard_iterator(const ::rocprim::discard_iterator& other) : base_type(other) {} +}; + +using discard_output_iterator = discard_iterator; + +inline auto make_discard_iterator() -> discard_iterator +{ + return discard_iterator(::rocprim::make_discard_iterator()); +} + +#elif defined(__HIP_PLATFORM_NVIDIA__) + +template +using extents = ::cuda::std::extents; + +template +struct extents_size; + +template +struct extents_size> +{ + static constexpr std::size_t value = (Dims * ... * 1); +}; + +template +using constant_iterator = ::cub::ConstantInputIterator; + +template +using counting_iterator = ::cub::CountingInputIterator; + +template> +using transform_iterator = ::cub::TransformInputIterator; + +template +using discard_iterator = ::cub::DiscardOutputIterator; + +template +using discard_output_iterator = ::cub::DiscardOutputIterator; + +template +inline auto make_discard_iterator() -> ::cub::DiscardOutputIterator +{ + return ::cub::DiscardOutputIterator(); +} + +#endif + } // namespace test_utils // Need for hipcub::DeviceReduce::Min/Max etc. 
diff --git a/projects/hipcub/test/hipcub/test_utils_assertions.hpp b/projects/hipcub/test/hipcub/test_utils_assertions.hpp index 2b3838b5011..826255d7103 100644 --- a/projects/hipcub/test/hipcub/test_utils_assertions.hpp +++ b/projects/hipcub/test/hipcub/test_utils_assertions.hpp @@ -236,8 +236,10 @@ inline auto assert_near(const custom_test_type& result, const custom_test_typ { auto diff1 = std::abs(percent * expected.x); auto diff2 = std::abs(percent * expected.y); - if(!bit_equal(result.x, expected.x)) ASSERT_NEAR(result.x, expected.x, diff1); - if(!bit_equal(result.x, expected.x)) ASSERT_NEAR(result.y, expected.y, diff2); + if(!bit_equal(result.x, expected.x)) + ASSERT_NEAR(result.x, expected.x, diff1); + if(!bit_equal(result.y, expected.y)) + ASSERT_NEAR(result.y, expected.y, diff2); } template @@ -245,7 +247,7 @@ inline auto assert_near(const custom_test_type& result, const custom_test_typ -> typename std::enable_if::value>::type { ASSERT_EQ(result.x,expected.x); - ASSERT_EQ(result.y,expected.y); + ASSERT_EQ(result.x,expected.x); } // End assert_near @@ -348,5 +350,5 @@ inline void assert_type(ExpectedT /*obj1*/, ActualT /*obj2*/) { testing::StaticAssertTypeEq(); } -} +} // namespace test_utils #endif // HIPCUB_TEST_HIPCUB_TEST_UTILS_ASSERTIONS_HPP_ From cf1f9bb1106552322fe65b524477c709c243b8d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 14 Jan 2026 10:51:39 +0000 Subject: [PATCH 49/95] Fix assert_near --- projects/hipcub/test/hipcub/test_utils_assertions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/hipcub/test/hipcub/test_utils_assertions.hpp b/projects/hipcub/test/hipcub/test_utils_assertions.hpp index 826255d7103..ffed4ccfdd8 100644 --- a/projects/hipcub/test/hipcub/test_utils_assertions.hpp +++ b/projects/hipcub/test/hipcub/test_utils_assertions.hpp @@ -247,7 +247,7 @@ inline auto assert_near(const custom_test_type& result, const custom_test_typ -> typename std::enable_if::value>::type { 
ASSERT_EQ(result.x,expected.x); - ASSERT_EQ(result.x,expected.x); + ASSERT_EQ(result.y,expected.y); } // End assert_near From c9c9341379200b7d81baac35bd5088fb310a052d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 7 Jan 2026 09:21:11 +0000 Subject: [PATCH 50/95] Fix is_floating_point_v error for half/bfloat16 with CUDA --- projects/hipcub/test/hipcub/bfloat16.hpp | 29 ++++++++++++++++++++++++ projects/hipcub/test/hipcub/half.hpp | 25 ++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/projects/hipcub/test/hipcub/bfloat16.hpp b/projects/hipcub/test/hipcub/bfloat16.hpp index 6cf698a86b2..8aed6fc7c22 100644 --- a/projects/hipcub/test/hipcub/bfloat16.hpp +++ b/projects/hipcub/test/hipcub/bfloat16.hpp @@ -40,6 +40,8 @@ #include #if defined(__HIP_PLATFORM_NVIDIA__) + #include + #include #include #else #include @@ -51,6 +53,33 @@ #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif +struct bfloat16_t; + +#if defined(__HIP_PLATFORM_NVIDIA__) + #include + #include + +using hip_bfloat16 = bfloat16_t; +namespace cuda +{ +namespace std +{ + +template<> +struct is_floating_point : true_type +{}; + +template<> +class numeric_limits +{ +public: + static constexpr bool is_specialized = true; +}; + +} // namespace std +} // namespace cuda + +#endif // __HIP_PLATFORM_NVIDIA__ /****************************************************************************** * bfloat16_t diff --git a/projects/hipcub/test/hipcub/half.hpp b/projects/hipcub/test/hipcub/half.hpp index f5d87dd785d..82b78a6096f 100644 --- a/projects/hipcub/test/hipcub/half.hpp +++ b/projects/hipcub/test/hipcub/half.hpp @@ -39,6 +39,8 @@ #include #if defined(__HIP_PLATFORM_NVIDIA__) + #include + #include #include #else #include @@ -53,6 +55,29 @@ #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif +struct half_t; + +#if defined(__HIP_PLATFORM_NVIDIA__) + +namespace cuda +{ +namespace std +{ +template<> +struct is_floating_point : true_type +{}; + +template<> +class 
numeric_limits +{ +public: + static constexpr bool is_specialized = true; +}; + +} // namespace std +} // namespace cuda + +#endif // __HIP_PLATFORM_NVIDIA__ /****************************************************************************** * half_t From 55b1831049f6177a008db5d1e9518a012241d2d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 7 Jan 2026 10:45:03 +0000 Subject: [PATCH 51/95] Replace util_arch.cuh macros with inline constexpr variables --- projects/hipcub/hipcub/include/hipcub/config.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/config.hpp b/projects/hipcub/hipcub/include/hipcub/config.hpp index 7285cd9e5b2..a82d3cb96b9 100644 --- a/projects/hipcub/hipcub/include/hipcub/config.hpp +++ b/projects/hipcub/hipcub/include/hipcub/config.hpp @@ -120,9 +120,9 @@ END_HIPCUB_NAMESPACE #define HIPCUB_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION #include - #define HIPCUB_WARP_THREADS warp_threads - #define HIPCUB_DEVICE_WARP_THREADS warp_threads - #define HIPCUB_HOST_WARP_THREADS warp_threads + #define HIPCUB_WARP_THREADS CUB_PTX_WARP_THREADS + #define HIPCUB_DEVICE_WARP_THREADS CUB_PTX_WARP_THREADS + #define HIPCUB_HOST_WARP_THREADS CUB_PTX_WARP_THREADS #define HIPCUB_ARCH CUB_PTX_ARCH BEGIN_HIPCUB_NAMESPACE using namespace cub; From 10b803765cb097378455a86f84b3f89f0fcf0b6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 7 Jan 2026 11:13:28 +0000 Subject: [PATCH 52/95] Add cuda headers --- projects/hipcub/hipcub/include/hipcub/config.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/projects/hipcub/hipcub/include/hipcub/config.hpp b/projects/hipcub/hipcub/include/hipcub/config.hpp index a82d3cb96b9..e5ed5a94e07 100644 --- a/projects/hipcub/hipcub/include/hipcub/config.hpp +++ b/projects/hipcub/hipcub/include/hipcub/config.hpp @@ -120,6 +120,8 @@ END_HIPCUB_NAMESPACE #define HIPCUB_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION #include + #include 
+ #include #define HIPCUB_WARP_THREADS CUB_PTX_WARP_THREADS #define HIPCUB_DEVICE_WARP_THREADS CUB_PTX_WARP_THREADS #define HIPCUB_HOST_WARP_THREADS CUB_PTX_WARP_THREADS From b072d5da6c3423e78ade5b4ee4daf3c772da6b67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 7 Jan 2026 23:08:02 +0000 Subject: [PATCH 53/95] Fix device histogram --- .../backend/cub/device/device_histogram.hpp | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_histogram.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_histogram.hpp index 5b18cac7754..f184c4d2ebd 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_histogram.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_histogram.hpp @@ -147,6 +147,81 @@ struct DeviceHistogram stream); } + template + HIPCUB_RUNTIME_FUNCTION + static hipError_t MultiHistogramEven(void* d_temp_storage, + size_t& temp_storage_bytes, + SampleIteratorT d_samples, + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], + int num_levels[NUM_ACTIVE_CHANNELS], + LevelT lower_level[NUM_ACTIVE_CHANNELS], + LevelT upper_level[NUM_ACTIVE_CHANNELS], + OffsetT num_row_pixels, + OffsetT num_rows, + size_t row_stride_bytes, + hipStream_t stream = 0) + { + return hipCUDAErrorTohipError( + ::cub::DeviceHistogram::MultiHistogramEven( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram, + num_levels, + lower_level, + upper_level, + num_row_pixels, + num_rows, + row_stride_bytes, + reinterpret_cast(stream))); + } + + template +HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS +HIPCUB_RUNTIME_FUNCTION + static hipError_t MultiHistogramEven(void* d_temp_storage, + size_t& temp_storage_bytes, + SampleIteratorT d_samples, + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], + int num_levels[NUM_ACTIVE_CHANNELS], + LevelT lower_level[NUM_ACTIVE_CHANNELS], + LevelT upper_level[NUM_ACTIVE_CHANNELS], + OffsetT 
num_row_pixels, + OffsetT num_rows, + size_t row_stride_bytes, + hipStream_t stream, + bool debug_synchronous) + { + HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); + + return MultiHistogramEven(d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram, + num_levels, + lower_level, + upper_level, + num_row_pixels, + num_rows, + row_stride_bytes, + stream); + } + template HIPCUB_RUNTIME_FUNCTION static hipError_t HistogramRange(void* d_temp_storage, size_t& temp_storage_bytes, From 05ea181cd097a5d91596da169b2e7254a565685c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 8 Jan 2026 08:27:27 +0000 Subject: [PATCH 54/95] Fix to_bits --- projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp b/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp index 91782408cb9..00fe6bb8d86 100644 --- a/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp +++ b/projects/hipcub/test/hipcub/test_utils_sort_comparator.hpp @@ -72,7 +72,9 @@ Key to_bits(const Key key) template constexpr bool is_extended_fp = std::is_same_v || std::is_same_v - || std::is_same_v || std::is_same_v; + || std::is_same_v || std::is_same_v + || std::is_same_v; + template Date: Thu, 8 Jan 2026 08:39:06 +0000 Subject: [PATCH 55/95] Fix ForEachInExtentsAPI test --- .../test/hipcub/test_hipcub_device_for.cpp | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp index 36840d4aa86..5e291994a1f 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp @@ -964,6 +964,22 @@ struct LinearStore } }; +template +struct ForEachInExtentsOp + +{ + using op_data_t = item_t[3]; + void* d_data; + + __device__ __host__ __forceinline__ + void 
operator()(int idx, int x, int y, int z) + { + auto& i = static_cast(d_data)[idx]; + // We use the "placement new" operator to copy the data from an initializer list. + new(&i) op_data_t{x, y, z}; + } +}; + TYPED_TEST_SUITE(HipcubDeviceForEachInExtentsTests, HipcubDeviceForEachInExtentsTestsParams); TEST(HipcubDeviceForEachInExtentsTests, ForEachInExtentsAPI) @@ -999,21 +1015,7 @@ TEST(HipcubDeviceForEachInExtentsTests, ForEachInExtentsAPI) HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, memory_size)); HIP_CHECK(hipMemset(d_input, 0, memory_size)); - struct Op - { - using op_data_t = item_t[3]; - void* d_data; - - __device__ __host__ __forceinline__ - void operator()(int idx, int x, int y, int z) - { - auto& i = static_cast(d_data)[idx]; - // We use the "placement new" operator to copy the data from an initializer list. - new(&i) op_data_t{x, y, z}; - } - }; - - HIP_CHECK(hipcub::DeviceFor::ForEachInExtents(ext, Op{d_input})); + HIP_CHECK(hipcub::DeviceFor::ForEachInExtents(ext, ForEachInExtentsOp{d_input})); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); From 9d685f1408281f38b91d85973e71dcebef0b1f77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 8 Jan 2026 12:11:18 +0000 Subject: [PATCH 56/95] Fix device_test_enabled_for_warp_size_v for CUDA --- projects/hipcub/test/hipcub/test_utils.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_utils.hpp b/projects/hipcub/test/hipcub/test_utils.hpp index 89cdb8cf0a8..25bac1d9448 100644 --- a/projects/hipcub/test/hipcub/test_utils.hpp +++ b/projects/hipcub/test/hipcub/test_utils.hpp @@ -621,8 +621,8 @@ constexpr T get_min_warp_size(const T block_size, const T max_warp_size) } template -__device__ constexpr bool device_test_enabled_for_warp_size_v - = HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize; +constexpr bool device_test_enabled_for_warp_size_v + = (HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize); template Date: 
Fri, 9 Jan 2026 09:28:41 +0000 Subject: [PATCH 57/95] Fix device_reduce CUB backend --- .../hipcub/include/hipcub/backend/cub/device/device_reduce.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp index 4f3a95baf1c..71dfa7ed384 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp @@ -164,7 +164,7 @@ class DeviceReduce ExtremumOutIteratorT d_max_out, IndexOutIteratorT d_index_out, std::int64_t num_items, - hipError_t stream = 0) + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, From 27892a1f2ce7991912010f7ab577212bf9a9cb96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 14 Jan 2026 12:55:32 +0000 Subject: [PATCH 58/95] Fix cub util_type include --- projects/hipcub/test/hipcub/test_utils.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/hipcub/test/hipcub/test_utils.hpp b/projects/hipcub/test/hipcub/test_utils.hpp index 25bac1d9448..fff732ed764 100644 --- a/projects/hipcub/test/hipcub/test_utils.hpp +++ b/projects/hipcub/test/hipcub/test_utils.hpp @@ -37,12 +37,13 @@ #include #include #include - #include #include #include #include #endif +#include + #include "test_utils_assertions.hpp" #include "test_utils_bfloat16.hpp" #include "test_utils_custom_test_types.hpp" From f130538a19f52324d3de4edb1e9d84f6acb4863d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 15 Jan 2026 14:40:34 +0000 Subject: [PATCH 59/95] Update WarpExchange rocPRIM backend for CUB compatibility --- projects/hipcub/CHANGELOG.md | 3 +- .../benchmark/benchmark_warp_exchange.cpp | 6 +- .../backend/cub/thread/thread_store.hpp | 134 ++++++++++++++++++ 
.../include/hipcub/backend/cub/util_type.hpp | 3 + .../backend/rocprim/warp/warp_exchange.hpp | 3 +- .../hipcub/backend/rocprim/warp/warp_load.hpp | 9 +- .../backend/rocprim/warp/warp_store.hpp | 9 +- .../include/hipcub/thread/thread_store.hpp | 6 +- .../test/hipcub/test_hipcub_warp_exchange.cpp | 6 +- 9 files changed, 149 insertions(+), 30 deletions(-) create mode 100644 projects/hipcub/hipcub/include/hipcub/backend/cub/thread/thread_store.hpp diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index caac9b2da3d..b7984eb8d05 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -25,7 +25,8 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project * Changed `CCCL_MINIMUM_VERSION` to `3.0.0` to align with CUB. * Add support for large num_items `DeviceMerge` and `DeviceSegmentedSort`. * Replace `#pragma unroll` by `_CCCL_PRAGMA_UNROLL_FULL()` and `_CCCL_PRAGMA_NOUNROLL()` by `_CCCL_PRAGMA_NOUNROLL()`. -* Add `_CCCL_SORT_MAYBE_UNROLL()` in block merge sort and thread sort. +* Add `_CCCL_SORT_MAYBE_UNROLL()` in block merge sort and thread sort. +* Update `WarpExchange` template parameters for CUB compatibility. 
### Removed diff --git a/projects/hipcub/benchmark/benchmark_warp_exchange.cpp b/projects/hipcub/benchmark/benchmark_warp_exchange.cpp index 3da97a063cc..d780e1a2c53 100644 --- a/projects/hipcub/benchmark/benchmark_warp_exchange.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_exchange.cpp @@ -47,11 +47,7 @@ __device__ auto warp_exchange_benchmark(T* d_output) thread_data[i] = static_cast(i); } - using WarpExchangeT = ::hipcub::WarpExchange; + using WarpExchangeT = ::hipcub::WarpExchange; constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block]; const unsigned warp_id = threadIdx.x / LogicalWarpSize; diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/thread/thread_store.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/thread/thread_store.hpp new file mode 100644 index 00000000000..02a260e5c5f --- /dev/null +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/thread/thread_store.hpp @@ -0,0 +1,134 @@ +// Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef HIPCUB_BACKEND_CUB_THREAD_STORE_HPP_ +#define HIPCUB_BACKEND_CUB_THREAD_STORE_HPP_ + +#include "../../../config.hpp" +#include "../util_type.hpp" + +#include // CUB thread store + +#include +#include + +BEGIN_HIPCUB_NAMESPACE + +enum CacheStoreModifier +{ + STORE_DEFAULT = 0, + STORE_WB = 1, + STORE_CG = 2, + STORE_CS = 3, + STORE_WT = 4, + STORE_VOLATILE = 5 +}; + +template +struct cub_cache_store_modifier_map +{ + static constexpr ::cub::CacheStoreModifier value = static_cast<::cub::CacheStoreModifier>(MOD); +}; + +template +HIPCUB_DEVICE +HIPCUB_FORCEINLINE void ThreadStoreVolatilePtr(T* ptr, T val, Fundamental /*is_fundamental*/) +{ + ::cub::ThreadStore<::cub::STORE_VOLATILE>(ptr, val); +} + +template +HIPCUB_DEVICE +HIPCUB_FORCEINLINE void ThreadStore(T* ptr, + T val, + ::std::integral_constant /*modifier*/, + ::std::true_type /*is_pointer*/) +{ + ::cub::ThreadStore::value>(ptr, val); +} + +template +HIPCUB_DEVICE +HIPCUB_FORCEINLINE void ThreadStore(OutputIteratorT itr, + T val, + ::std::integral_constant /*modifier*/, + ::std::false_type /*is_pointer*/) +{ + ThreadStore(&(*itr), + val, + ::std::integral_constant{}, + ::std::true_type{}); +} + +template +HIPCUB_DEVICE +HIPCUB_FORCEINLINE void ThreadStore(OutputIteratorT itr, T val) +{ + ThreadStore(itr, + val, + ::std::integral_constant{}, + ::std::bool_constant<_HIPCUB_STD::is_pointer::value>()); +} + +namespace detail +{ + +template +struct iterate_thread_store +{ + template + static HIPCUB_DEVICE + HIPCUB_FORCEINLINE void Store(T* ptr, T* vals) + { + ThreadStore(ptr + COUNT, + vals[COUNT], + ::std::integral_constant{}, + ::std::true_type{}); + iterate_thread_store::template Store(ptr, vals); + } + + template + static 
HIPCUB_DEVICE + HIPCUB_FORCEINLINE void Dereference(OutputIteratorT ptr, T* vals) + { + ptr[COUNT] = vals[COUNT]; + iterate_thread_store::Dereference(ptr, vals); + } +}; + +template +struct iterate_thread_store +{ + template + static HIPCUB_DEVICE + HIPCUB_FORCEINLINE void Store(T* /*ptr*/, T* /*vals*/) + {} + + template + static HIPCUB_DEVICE + HIPCUB_FORCEINLINE void Dereference(OutputIteratorT /*ptr*/, T* /*vals*/) + {} +}; + +} // namespace detail + +END_HIPCUB_NAMESPACE + +#endif // HIPCUB_BACKEND_CUB_THREAD_STORE_HPP_ diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_type.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_type.hpp index aaa05042d76..3ec0ebd64db 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_type.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_type.hpp @@ -83,6 +83,9 @@ struct lazy_trait using type = Trait; }; +template +using int_constant_t = _HIPCUB_STD::integral_constant; + } // namespace detail END_HIPCUB_NAMESPACE diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/warp/warp_exchange.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/warp/warp_exchange.hpp index 20a85eab8d5..7fac1a910fb 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/warp/warp_exchange.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/warp/warp_exchange.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -63,7 +63,6 @@ using InternalWarpExchangeImpl template class WarpExchange : private detail::InternalWarpExchangeImpl struct LoadInternal { - using WarpExchangeT = WarpExchange< - InputT, - ITEMS_PER_THREAD, - LOGICAL_WARP_THREADS, - ARCH - >; + using WarpExchangeT = WarpExchange; using TempStorage = typename WarpExchangeT::TempStorage; TempStorage& temp_storage; int linear_tid; diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/warp/warp_store.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/warp/warp_store.hpp index 23e0c52f6cd..a3be7d521c6 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/warp/warp_store.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/warp/warp_store.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -200,12 +200,7 @@ class WarpStore template <> struct StoreInternal { - using WarpExchangeT = WarpExchange< - T, - ITEMS_PER_THREAD, - LOGICAL_WARP_THREADS, - ARCH - >; + using WarpExchangeT = WarpExchange; using TempStorage = typename WarpExchangeT::TempStorage; TempStorage& temp_storage; int linear_tid; diff --git a/projects/hipcub/hipcub/include/hipcub/thread/thread_store.hpp b/projects/hipcub/hipcub/include/hipcub/thread/thread_store.hpp index c982be81ec7..6dbd6ed4434 100644 --- a/projects/hipcub/hipcub/include/hipcub/thread/thread_store.hpp +++ b/projects/hipcub/hipcub/include/hipcub/thread/thread_store.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2021-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,8 +38,8 @@ #include "../backend/rocprim/thread/thread_store.hpp" // IWYU pragma: export #elif defined(__HIP_PLATFORM_NVIDIA__) - #include "../config.hpp" - #include // IWYU pragma: export + #include "../backend/cub/thread/thread_store.hpp" // IWYU pragma: export + #endif #endif diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp index a01d6318a20..25bd35263d5 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp @@ -133,11 +133,7 @@ __device__ auto warp_exchange_test(T* d_input, T* d_output) thread_data[i] = d_input[threadIdx.x * ItemsPerThread + i]; } - using WarpExchangeT = ::hipcub::WarpExchange; + using WarpExchangeT = ::hipcub::WarpExchange; constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block]; const unsigned warp_id = threadIdx.x / LogicalWarpSize; From 0727caf0fe5a4d5f143be6cb8d1b262ebdb2c867 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Fri, 9 Jan 2026 12:23:42 +0000 Subject: [PATCH 60/95] Fix for ArgMax/Min --- .../backend/cub/device/device_reduce.hpp | 117 ++++++++++++++---- 1 file changed, 96 insertions(+), 21 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp index 71dfa7ed384..97660c72b5f 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp @@ -32,9 +32,12 @@ #include "../../../config.hpp" #include "../../../util_deprecated.hpp" +#include "../../../util_type.hpp" #include // IWYU pragma: 
export +#include + BEGIN_HIPCUB_NAMESPACE class DeviceReduce @@ -103,9 +106,18 @@ class DeviceReduce InputIteratorT d_in, ExtremumOutIteratorT d_min_out, IndexOutIteratorT d_index_out, - std::int64_t num_items, + int num_items, hipStream_t stream = 0) { + + using value_type = ::hipcub::detail::it_value_t; + using index_type = int64_t; + using pair_type = ::cub::KeyValuePair; + + static_cast(sizeof(pair_type)); + static_cast(sizeof(index_type)); + static_cast(sizeof(value_type)); + return hipCUDAErrorTohipError(::cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, @@ -130,13 +142,23 @@ class DeviceReduce NumItemsT num_items, hipStream_t stream = 0) { + using value_type = ::hipcub::detail::it_value_t; + using index_type = int64_t; + using pair_type = ::hipcub::KeyValuePair; + + pair_type* out_pair = reinterpret_cast(d_out); + + value_type* d_min_out = &(out_pair->value); + index_type* d_index_out = &(out_pair->key); + _CCCL_SUPPRESS_DEPRECATED_PUSH - return hipCUDAErrorTohipError(::cub::DeviceReduce::ArgMin(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - stream)); + return ArgMin(d_temp_storage, + temp_storage_bytes, + d_in, + d_min_out, + d_index_out, + static_cast(num_items), + stream); _CCCL_SUPPRESS_DEPRECATED_POP } @@ -163,9 +185,17 @@ class DeviceReduce InputIteratorT d_in, ExtremumOutIteratorT d_max_out, IndexOutIteratorT d_index_out, - std::int64_t num_items, + int num_items, hipStream_t stream = 0) { + using value_type = ::hipcub::detail::it_value_t; + using index_type = int64_t; + using pair_type = ::cub::KeyValuePair; + + static_cast(sizeof(pair_type)); + static_cast(sizeof(index_type)); + static_cast(sizeof(value_type)); + return hipCUDAErrorTohipError(::cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, @@ -176,13 +206,11 @@ class DeviceReduce } template - HIPCUB_DEPRECATED_BECAUSE( - "CUB has superseded this interface in favor of the ArgMax interface " - "that takes two separate " - "iterators: one 
iterator to which the extremum is written and another " - "iterator to which the " - "index of the found extremum is written. ") - HIPCUB_RUNTIME_FUNCTION + HIPCUB_DEPRECATED_BECAUSE("CUB has superseded this interface in favor of the ArgMax interface " + "that takes two separate iterators: one iterator to which the " + "extremum is written and another " + "iterator to which the index of the found extremum is written. ") +HIPCUB_RUNTIME_FUNCTION static hipError_t ArgMax(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, @@ -190,13 +218,24 @@ class DeviceReduce NumItemsT num_items, hipStream_t stream = 0) { + + using value_type = ::hipcub::detail::it_value_t; + using index_type = int64_t; + using pair_type = ::hipcub::KeyValuePair; + + pair_type* out_pair = reinterpret_cast(d_out); + + value_type* d_max_out = &(out_pair->value); + index_type* d_index_out = &(out_pair->key); + _CCCL_SUPPRESS_DEPRECATED_PUSH - return hipCUDAErrorTohipError(::cub::DeviceReduce::ArgMax(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - stream)); + return ArgMax(d_temp_storage, + temp_storage_bytes, + d_in, + d_max_out, + d_index_out, + static_cast(num_items), + stream); _CCCL_SUPPRESS_DEPRECATED_POP } @@ -258,6 +297,42 @@ class DeviceReduce num_items, stream)); } + +private: + template + struct value_only_pair_output_iterator + { + ScalarOutputIt out; + using value_type = ::cub::KeyValuePair; + HIPCUB_HOST_DEVICE + value_only_pair_output_iterator(ScalarOutputIt o) + : out(o) + {} + HIPCUB_HOST_DEVICE + value_only_pair_output_iterator& operator*() + { + return *this; + } + HIPCUB_HOST_DEVICE + value_only_pair_output_iterator& operator=(value_type const& p) + { + *out = p.value; + return *this; + } + HIPCUB_HOST_DEVICE + value_only_pair_output_iterator& operator++() + { + ++out; + return *this; + } + HIPCUB_HOST_DEVICE + value_only_pair_output_iterator operator++(int) + { + value_only_pair_output_iterator tmp = *this; + ++out; + return tmp; + } + }; 
}; END_HIPCUB_NAMESPACE From 26d19854a01fb80952486f55b1310c53399e3599 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Tue, 20 Jan 2026 10:43:48 +0000 Subject: [PATCH 61/95] Fix type in test --- .../hipcub/test/hipcub/test_hipcub_device_reduce.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp index 03362ec987a..120af930895 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp @@ -885,13 +885,15 @@ void test_argminmax_allinf(TypeParam value, TypeParam empty_value) if(size > 0) { // all +/- infinity should produce +/- infinity - ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output[0].key, 0)); + ASSERT_NO_FATAL_FAILURE( + test_utils::assert_eq(output[0].key, static_cast(0))); ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output[0].value, value)); } else { // empty input should produce a special value - ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output[0].key, 1)); + ASSERT_NO_FATAL_FAILURE( + test_utils::assert_eq(output[0].key, static_cast(1))); ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output[0].value, empty_value)); } } @@ -951,13 +953,15 @@ void test_argminmax_extremum(TypeParam value, TypeParam empty_value) if(size > 0) { // all +/- infinity should produce +/- infinity - ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output[0].key, 0)); + ASSERT_NO_FATAL_FAILURE( + test_utils::assert_eq(output[0].key, static_cast(0))); ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output[0].value, value)); } else { // empty input should produce a special value - ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output[0].key, 1)); + ASSERT_NO_FATAL_FAILURE( + test_utils::assert_eq(output[0].key, static_cast(1))); ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output[0].value, empty_value)); } } From 
21ff3faa39a33580f3b012265e2626337a95b869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Tue, 20 Jan 2026 15:20:38 +0000 Subject: [PATCH 62/95] Fix device select UniqueByKey --- .../include/hipcub/backend/cub/device/device_select.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_select.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_select.hpp index 87703809e4f..9e8f76f1a73 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_select.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_select.hpp @@ -294,7 +294,7 @@ class DeviceSelect typename OutputValueIteratorT, typename NumSelectedIteratorT, typename NumItemsT> - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + HIPCUB_RUNTIME_FUNCTION static hipError_t UniqueByKey(void* d_temp_storage, size_t& temp_storage_bytes, KeyIteratorT d_keys_input, @@ -303,10 +303,8 @@ class DeviceSelect OutputValueIteratorT d_values_output, NumSelectedIteratorT d_num_selected_out, NumItemsT num_items, - hipStream_t stream, - bool debug_synchronous) + hipStream_t stream = 0) { - HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return UniqueByKey(d_temp_storage, temp_storage_bytes, d_keys_input, From ffb1b7aa8bcb5ac4ce576445ed4d46dec5221c0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Tue, 20 Jan 2026 16:49:16 +0000 Subject: [PATCH 63/95] Fix clang format --- .../backend/cub/device/device_radix_sort.hpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_radix_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_radix_sort.hpp index 6a71f472af2..30dd10b3180 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_radix_sort.hpp +++ 
b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_radix_sort.hpp @@ -320,14 +320,15 @@ struct DeviceRadixSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - NumItemsT num_items, - int begin_bit = 0, - int end_bit = sizeof(KeyT) * 8, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + NumItemsT num_items, + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8, + hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, From 9957d9fd51a284aebba2fa05c5c6a9cdee580cc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Tue, 20 Jan 2026 14:57:04 +0000 Subject: [PATCH 64/95] Fix type in benchmark segmented sort --- .../benchmark/benchmark_device_segmented_sort.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/projects/hipcub/benchmark/benchmark_device_segmented_sort.cpp b/projects/hipcub/benchmark/benchmark_device_segmented_sort.cpp index db69075e7ce..11121bfdb21 100644 --- a/projects/hipcub/benchmark/benchmark_device_segmented_sort.cpp +++ b/projects/hipcub/benchmark/benchmark_device_segmented_sort.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -46,8 +46,8 @@ void run_sort_keys_benchmark(benchmark::State& state, size_t&, const key_type*, key_type*, - int, - int, + int64_t, + int64_t, offset_type*, offset_type*, hipStream_t); @@ -181,8 +181,8 @@ void run_sort_pairs_benchmark(benchmark::State& state, key_type*, const value_type*, value_type*, - int, - int, + int64_t, + int64_t, offset_type*, offset_type*, hipStream_t); From 61aae79d1e7b68cfc2b2d7fb339d7e5cfb3c0973 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Tue, 20 Jan 2026 16:49:27 +0000 Subject: [PATCH 65/95] Fix types in segmented sort --- .../rocprim/device/device_segmented_sort.hpp | 38 ++++++++++--------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_sort.hpp index e266bd635aa..a5f119277f9 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_sort.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -269,15 +269,16 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return ::rocprim::segmented_radix_sort_keys(d_temp_storage, temp_storage_bytes, @@ -319,14 +320,15 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { ::rocprim::double_buffer d_keys_db = detail::to_double_buffer(d_keys); hipError_t error = ::rocprim::segmented_radix_sort_keys(d_temp_storage, From e81342c3b5e553a7bdd82febba7b518c1d0e531c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Tue, 20 Jan 2026 22:17:28 +0000 Subject: [PATCH 66/95] Device segmented sort - support large indices --- .../rocprim/device/device_segmented_sort.hpp | 640 +++++++++--------- 1 file changed, 325 insertions(+), 315 deletions(-) diff --git 
a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_sort.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_sort.hpp index a5f119277f9..6765a273d7d 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_sort.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_sort.hpp @@ -42,17 +42,18 @@ BEGIN_HIPCUB_NAMESPACE struct DeviceSegmentedSort { template - HIPCUB_RUNTIME_FUNCTION static hipError_t SortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return ::rocprim::segmented_radix_sort_pairs(d_temp_storage, temp_storage_bytes, @@ -71,19 +72,19 @@ struct DeviceSegmentedSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + int64_t num_items, + int64_t num_segments, 
+ OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return SortPairs(d_temp_storage, @@ -100,15 +101,16 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t SortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + DoubleBuffer& d_values, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { ::rocprim::double_buffer d_keys_db = detail::to_double_buffer(d_keys); ::rocprim::double_buffer d_values_db = detail::to_double_buffer(d_values); @@ -130,17 +132,17 @@ struct DeviceSegmentedSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + DoubleBuffer& d_values, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return SortPairs(d_temp_storage, @@ -155,17 +157,18 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t SortPairsDescending(void* d_temp_storage, - 
size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairsDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return ::rocprim::segmented_radix_sort_pairs_desc(d_temp_storage, temp_storage_bytes, @@ -184,19 +187,19 @@ struct DeviceSegmentedSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairsDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return SortPairsDescending(d_temp_storage, @@ -213,15 +216,16 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t SortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT 
d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairsDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + DoubleBuffer& d_values, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { ::rocprim::double_buffer d_keys_db = detail::to_double_buffer(d_keys); ::rocprim::double_buffer d_values_db = detail::to_double_buffer(d_values); @@ -244,17 +248,17 @@ struct DeviceSegmentedSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t SortPairsDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + DoubleBuffer& d_values, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return SortPairsDescending(d_temp_storage, @@ -295,17 +299,17 @@ struct DeviceSegmentedSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + int64_t 
num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return SortKeys(d_temp_storage, @@ -347,16 +351,16 @@ struct DeviceSegmentedSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return SortKeys(d_temp_storage, @@ -370,15 +374,16 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeysDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return ::rocprim::segmented_radix_sort_keys_desc(d_temp_storage, temp_storage_bytes, @@ -395,17 +400,17 @@ struct DeviceSegmentedSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - 
const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeysDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return SortKeysDescending(d_temp_storage, @@ -420,14 +425,15 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t SortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeysDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { ::rocprim::double_buffer d_keys_db = detail::to_double_buffer(d_keys); hipError_t error @@ -447,16 +453,16 @@ struct DeviceSegmentedSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - SortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t SortKeysDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT 
d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return SortKeysDescending(d_temp_storage, @@ -470,17 +476,18 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return SortPairs(d_temp_storage, temp_storage_bytes, @@ -496,19 +503,19 @@ struct DeviceSegmentedSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return StableSortPairs(d_temp_storage, @@ -525,15 +532,16 @@ struct 
DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + DoubleBuffer& d_values, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return SortPairs(d_temp_storage, temp_storage_bytes, @@ -547,17 +555,17 @@ struct DeviceSegmentedSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + DoubleBuffer& d_values, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return StableSortPairs(d_temp_storage, @@ -572,18 +580,18 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + 
static hipError_t StableSortPairsDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return SortPairsDescending(d_temp_storage, temp_storage_bytes, @@ -599,19 +607,19 @@ struct DeviceSegmentedSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortPairsDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + const ValueT* d_values_in, + ValueT* d_values_out, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return StableSortPairsDescending(d_temp_storage, @@ -628,16 +636,16 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortPairsDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + DoubleBuffer& d_values, + int64_t num_items, + int64_t num_segments, + 
OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return SortPairsDescending(d_temp_storage, temp_storage_bytes, @@ -651,17 +659,17 @@ struct DeviceSegmentedSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortPairsDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortPairsDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + DoubleBuffer& d_values, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return StableSortPairsDescending(d_temp_storage, @@ -676,15 +684,16 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return SortKeys(d_temp_storage, temp_storage_bytes, @@ -698,17 +707,17 @@ struct DeviceSegmentedSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* 
d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return StableSortKeys(d_temp_storage, @@ -723,14 +732,15 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return SortKeys(d_temp_storage, temp_storage_bytes, @@ -743,16 +753,16 @@ struct DeviceSegmentedSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortKeys(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeys(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + 
bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return StableSortKeys(d_temp_storage, @@ -766,16 +776,16 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeysDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return SortKeysDescending(d_temp_storage, temp_storage_bytes, @@ -789,17 +799,17 @@ struct DeviceSegmentedSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeysDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + const KeyT* d_keys_in, + KeyT* d_keys_out, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return StableSortKeysDescending(d_temp_storage, @@ -814,15 +824,15 @@ struct DeviceSegmentedSort } template - HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - OffsetIteratorT 
d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream = 0) + HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeysDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream = 0) { return SortKeysDescending(d_temp_storage, temp_storage_bytes, @@ -835,16 +845,16 @@ struct DeviceSegmentedSort } template - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t - StableSortKeysDescending(void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - hipStream_t stream, - bool debug_synchronous) + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t StableSortKeysDescending(void* d_temp_storage, + size_t& temp_storage_bytes, + DoubleBuffer& d_keys, + int64_t num_items, + int64_t num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + hipStream_t stream, + bool debug_synchronous) { HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); return StableSortKeysDescending(d_temp_storage, From d7621f10b3daaf235592d6d112baf958010bef0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 7 Jan 2026 23:12:11 +0000 Subject: [PATCH 67/95] Patch thread operators - add missing headers --- projects/hipcub/benchmark/benchmark_block_scan.cpp | 1 + projects/hipcub/benchmark/benchmark_device_scan.cpp | 3 ++- projects/hipcub/benchmark/benchmark_device_select.cpp | 3 ++- projects/hipcub/benchmark/benchmark_utils.hpp | 1 + projects/hipcub/benchmark/benchmark_warp_reduce.cpp | 1 + projects/hipcub/benchmark/benchmark_warp_scan.cpp | 3 ++- projects/hipcub/benchmark/common_benchmark_header.hpp | 1 + .../hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp | 1 + 
projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp | 3 ++- .../test/hipcub/test_hipcub_device_adjacent_difference.cpp | 1 + .../hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp | 1 + .../hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp | 1 + projects/hipcub/test/hipcub/test_hipcub_device_select.cpp | 1 + projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp | 1 + projects/hipcub/test/hipcub/test_hipcub_warp_reduce.cpp | 3 ++- projects/hipcub/test/hipcub/test_hipcub_warp_scan.cpp | 3 ++- 16 files changed, 22 insertions(+), 6 deletions(-) diff --git a/projects/hipcub/benchmark/benchmark_block_scan.cpp b/projects/hipcub/benchmark/benchmark_block_scan.cpp index c08976b19c7..3ef35356bf1 100644 --- a/projects/hipcub/benchmark/benchmark_block_scan.cpp +++ b/projects/hipcub/benchmark/benchmark_block_scan.cpp @@ -24,6 +24,7 @@ // hipCUB API #include +#include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; diff --git a/projects/hipcub/benchmark/benchmark_device_scan.cpp b/projects/hipcub/benchmark/benchmark_device_scan.cpp index 5d38b9628b5..d9a0c77aebd 100644 --- a/projects/hipcub/benchmark/benchmark_device_scan.cpp +++ b/projects/hipcub/benchmark/benchmark_device_scan.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -30,6 +30,7 @@ // HIP API #include +#include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; diff --git a/projects/hipcub/benchmark/benchmark_device_select.cpp b/projects/hipcub/benchmark/benchmark_device_select.cpp index 04097eca9be..d52c4dcbde2 100644 --- a/projects/hipcub/benchmark/benchmark_device_select.cpp +++ b/projects/hipcub/benchmark/benchmark_device_select.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -24,6 +24,7 @@ // HIP API #include +#include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; diff --git a/projects/hipcub/benchmark/benchmark_utils.hpp b/projects/hipcub/benchmark/benchmark_utils.hpp index 1cb93a821b0..6af618b92b8 100644 --- a/projects/hipcub/benchmark/benchmark_utils.hpp +++ b/projects/hipcub/benchmark/benchmark_utils.hpp @@ -36,6 +36,7 @@ #endif #include +#include #ifndef HIPCUB_CUB_API #define HIPCUB_WARP_THREADS_MACRO warpSize diff --git a/projects/hipcub/benchmark/benchmark_warp_reduce.cpp b/projects/hipcub/benchmark/benchmark_warp_reduce.cpp index 870ebb0f373..9847ff80d71 100644 --- a/projects/hipcub/benchmark/benchmark_warp_reduce.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_reduce.cpp @@ -23,6 +23,7 @@ #include "common_benchmark_header.hpp" // HIP API +#include #include #ifndef DEFAULT_N diff --git a/projects/hipcub/benchmark/benchmark_warp_scan.cpp b/projects/hipcub/benchmark/benchmark_warp_scan.cpp index 35efedd3995..ef63a4ee9bd 100644 --- a/projects/hipcub/benchmark/benchmark_warp_scan.cpp +++ 
b/projects/hipcub/benchmark/benchmark_warp_scan.cpp @@ -22,7 +22,8 @@ #include "common_benchmark_header.hpp" -// HIP API +// HIP +#include #include #ifndef DEFAULT_N diff --git a/projects/hipcub/benchmark/common_benchmark_header.hpp b/projects/hipcub/benchmark/common_benchmark_header.hpp index 093a0079ef1..28693abcec6 100644 --- a/projects/hipcub/benchmark/common_benchmark_header.hpp +++ b/projects/hipcub/benchmark/common_benchmark_header.hpp @@ -42,6 +42,7 @@ #include #include +#include #include _HIPCUB_LIBCXX_INCLUDE(cmath) #include _HIPCUB_STD_INCLUDE(limits) diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp index 0063718ca26..3d043d3d5c8 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp @@ -24,6 +24,7 @@ #include #include #include +#include template +#include // Params for tests template +#include #include "test_utils.hpp" #include "test_utils_data_generation.hpp" diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp index 2a6f16a0881..013019bac18 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp @@ -24,6 +24,7 @@ // hipcub API #include +#include #include "test_utils_data_generation.hpp" diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp index 5470629ed7c..c0f3b0d3556 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp @@ -28,6 +28,7 @@ // hipcub API #include +#include template +#include #include "single_index_iterator.hpp" #include "test_utils_bfloat16.hpp" diff --git 
a/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp index 25bd35263d5..d0325d40d2a 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp @@ -24,6 +24,7 @@ #include "test_utils_data_generation.hpp" #include "test_utils_half.hpp" +#include #include #include diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_reduce.cpp index d60cb0333cc..ea1f69aff3e 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_reduce.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -22,6 +22,7 @@ #include "common_test_header.hpp" +#include #include #include diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_scan.cpp index 9a91412211f..55014c51a66 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_scan.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -22,6 +22,7 @@ #include "common_test_header.hpp" +#include #include #include From 567e65f1aabb13a2d0c0f2bb9098c41eb4f8deb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 7 Jan 2026 22:21:24 +0000 Subject: [PATCH 68/95] Patch thread operators - add cub backend --- .../backend/cub/thread/thread_operators.hpp | 110 +++++++++++++++++- 1 file changed, 109 insertions(+), 1 deletion(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/thread/thread_operators.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/thread/thread_operators.hpp index ab0a1f7630c..7ce5f71061d 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/thread/thread_operators.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/thread/thread_operators.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -27,6 +27,8 @@ #include +#include + BEGIN_HIPCUB_NAMESPACE namespace detail @@ -37,6 +39,112 @@ using accumulator_t = ::cuda::std::__accumulator_t; } // namespace detail +// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx +struct Equality +{ + template + HIPCUB_HOST_DEVICE + inline constexpr bool operator()(T&& t, U&& u) const + { + return ::cuda::std::forward(t) == ::cuda::std::forward(u); + } +}; + +// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx +struct Inequality +{ + template + HIPCUB_HOST_DEVICE + inline constexpr bool operator()(T&& t, U&& u) const + { + return ::cuda::std::forward(t) != ::cuda::std::forward(u); + } +}; + +// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx +struct Sum +{ + template + HIPCUB_HOST_DEVICE + inline constexpr auto operator()(T&& t, U&& u) const -> decltype(auto) + { + return ::cuda::std::forward(t) + ::cuda::std::forward(u); + } +}; + +// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx +struct Difference +{ + template + HIPCUB_HOST_DEVICE + inline constexpr auto operator()(T&& t, U&& u) const -> decltype(auto) + { + return ::cuda::std::forward(t) - ::cuda::std::forward(u); + } +}; + +// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx +struct Division +{ + template + HIPCUB_HOST_DEVICE + inline constexpr auto operator()(T&& t, U&& u) const -> decltype(auto) + { + return std::forward(t) / std::forward(u); + } +}; + +// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx +struct Max +{ + template + HIPCUB_HOST_DEVICE + inline constexpr auto operator()(const T& t, const U& u) const -> + 
typename ::cuda::std::common_type::type + { + using R = typename ::cuda::std::common_type::type; + return (t < u) ? static_cast(u) : static_cast(t); + } +}; + +struct Min +{ + template + HIPCUB_HOST_DEVICE + inline constexpr auto operator()(const T& t, const U& u) const -> + typename ::cuda::std::common_type::type + { + using R = typename ::cuda::std::common_type::type; + return (u < t) ? static_cast(u) : static_cast(t); + } +}; + +// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx +struct ArgMax +{ + template + HIPCUB_HOST_DEVICE + inline constexpr ::cub::KeyValuePair + operator()(const ::cub::KeyValuePair& a, + const ::cub::KeyValuePair& b) const + { + return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; + } +}; + +// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx +struct ArgMin +{ + template + HIPCUB_HOST_DEVICE + inline constexpr ::cub::KeyValuePair + operator()(const ::cub::KeyValuePair& a, + const ::cub::KeyValuePair& b) const + { + return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? 
b : a; + } +}; + END_HIPCUB_NAMESPACE #endif // HIPCUB_CUB_THREAD_THREAD_OPERATORS_HPP_ From cc1e483589c436ce1ff11bd31b538c4a156d8c75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 22 Jan 2026 16:15:46 +0000 Subject: [PATCH 69/95] Patch thread operators - fix --- .../backend/cub/device/device_adjacent_difference.hpp | 9 +++++---- .../include/hipcub/backend/cub/device/device_scan.hpp | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp index 29d08eda201..b4a457610d5 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp @@ -31,6 +31,7 @@ #include "../../../config.hpp" #include "../../../util_deprecated.hpp" +#include "../thread/thread_operators.hpp" // for Difference #include // IWYU pragma: export @@ -40,7 +41,7 @@ struct DeviceAdjacentDifference { template static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractLeftCopy(void* d_temp_storage, @@ -62,7 +63,7 @@ struct DeviceAdjacentDifference } template static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractLeft(void* d_temp_storage, @@ -83,7 +84,7 @@ struct DeviceAdjacentDifference template static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractRightCopy(void* d_temp_storage, @@ -105,7 +106,7 @@ struct DeviceAdjacentDifference } template static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractRight(void* d_temp_storage, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp index 31f5c53a89d..6bbe0ecdfe7 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp @@ -32,6 
+32,7 @@ #include "../../../config.hpp" #include "../../../util_deprecated.hpp" +#include "../thread/thread_operators.hpp" // for Equality #include // IWYU pragma: export From bcb082cb28046618bd307b0eec05f32ae39a6e26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 26 Jan 2026 12:32:49 +0000 Subject: [PATCH 70/95] Fix types and formatting in generate resource spec --- projects/hipcub/test/CMakeLists.txt | 11 ++++++++++- projects/hipcub/test/generate_resource_spec.cpp | 6 +++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/projects/hipcub/test/CMakeLists.txt b/projects/hipcub/test/CMakeLists.txt index 6953f4fdcae..e5478f6a43d 100644 --- a/projects/hipcub/test/CMakeLists.txt +++ b/projects/hipcub/test/CMakeLists.txt @@ -67,7 +67,16 @@ endfunction() # We'll use this small program to detect available GPUs and build the resource JSON file. set(GEN_RES_SPEC_PATH ${CMAKE_SOURCE_DIR}/test/generate_resource_spec.cpp) add_executable(generate_resource_spec ${GEN_RES_SPEC_PATH}) -set_target_properties(generate_resource_spec PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") +set_target_properties(generate_resource_spec + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" +) + +if(HIP_COMPILER STREQUAL "nvcc") + set_source_files_properties(${GEN_RES_SPEC_PATH} + PROPERTIES LANGUAGE CUDA) +endif() + +target_compile_options(generate_resource_spec PRIVATE -Wno-unused-command-line-argument) target_link_libraries(generate_resource_spec PRIVATE hip::host) # This test may still get the offload-compress flag passed. Since it does not include any kernel # code it will give an unused-command-line-argument warning. 
diff --git a/projects/hipcub/test/generate_resource_spec.cpp b/projects/hipcub/test/generate_resource_spec.cpp index 3fae3cc1618..77569124291 100644 --- a/projects/hipcub/test/generate_resource_spec.cpp +++ b/projects/hipcub/test/generate_resource_spec.cpp @@ -10,7 +10,7 @@ // ./enum_device // // Sample output: -// { +// { // "version": { // "major": 1, // "minor": 0 @@ -113,7 +113,7 @@ int main(int argc, char* argv[]) // Add one object for each gfxID. // Each gfxID-keyed object will contain an array of device IDs. - unsigned int key_index = 0; + size_t key_index = 0; for(auto& name_it : names_to_ids) { out_file << " \"" << name_it.first << "\": [" << std::endl; @@ -124,7 +124,7 @@ int main(int argc, char* argv[]) // to have a consistent output on each run so that the resource // spec file stays the same. std::sort(name_it.second.begin(), name_it.second.end()); - unsigned int id_index = 0; + size_t id_index = 0; for(const auto& id_it : name_it.second) { out_file << " {" << std::endl; From 1cb4b5469b1ecf0d54fd4744b956067962a93e63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 28 Jan 2026 08:05:22 +0000 Subject: [PATCH 71/95] Remove internal `hipcub::Division` usage --- .../hipcub/test_hipcub_thread_operators.cpp | 4 ++- .../test/hipcub/test_utils_functional.hpp | 33 ++++++++++++++----- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp index 1fa95f64f47..3cc5e8d980a 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp @@ -24,6 +24,7 @@ #include "test_utils_assertions.hpp" #include "test_utils_data_generation.hpp" +#include "test_utils_functional.hpp" #include "test_utils_thread_operators.hpp" #include #include @@ -31,6 +32,7 @@ #include #include +#include #include #include #include @@ -227,7 +229,7 @@ 
TYPED_TEST(HipcubDivisionOperatorTests, Division) { using input_type = typename TestFixture::input_type; using output_type = typename TestFixture::output_type; - using Division = typename AlgebraicSelector::type; + using Division = typename AlgebraicSelector::type; for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { diff --git a/projects/hipcub/test/hipcub/test_utils_functional.hpp b/projects/hipcub/test/hipcub/test_utils_functional.hpp index 404ad12b11c..7da5860f48d 100644 --- a/projects/hipcub/test/hipcub/test_utils_functional.hpp +++ b/projects/hipcub/test/hipcub/test_utils_functional.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -35,7 +35,8 @@ namespace test_utils struct less { template - HIPCUB_HOST_DEVICE constexpr bool operator()(const T& a, const T& b) const + HIPCUB_HOST_DEVICE + constexpr bool operator()(const T& a, const T& b) const { return a < b; } @@ -44,7 +45,8 @@ struct less struct less_equal { template - HIPCUB_HOST_DEVICE constexpr bool operator()(const T& a, const T& b) const + HIPCUB_HOST_DEVICE + constexpr bool operator()(const T& a, const T& b) const { return a <= b; } @@ -53,7 +55,8 @@ struct less_equal struct greater { template - HIPCUB_HOST_DEVICE constexpr bool operator()(const T& a, const T& b) const + HIPCUB_HOST_DEVICE + constexpr bool operator()(const T& a, const T& b) const { return a > b; } @@ -62,7 +65,8 @@ struct greater struct greater_equal { template - HIPCUB_HOST_DEVICE constexpr bool operator()(const T& a, const T& b) const + HIPCUB_HOST_DEVICE + constexpr bool operator()(const T& a, const T& b) const { return a >= b; } @@ -71,7 +75,8 @@ struct greater_equal struct plus { template - 
HIPCUB_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const + HIPCUB_HOST_DEVICE + inline constexpr T operator()(const T& a, const T& b) const { return a + b; } @@ -80,7 +85,8 @@ struct plus struct minus { template - HIPCUB_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const + HIPCUB_HOST_DEVICE + inline constexpr T operator()(const T& a, const T& b) const { return a - b; } @@ -89,12 +95,23 @@ struct minus struct multiplies { template - HIPCUB_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const + HIPCUB_HOST_DEVICE + inline constexpr T operator()(const T& a, const T& b) const { return a * b; } }; +struct divides +{ + template + HIPCUB_HOST_DEVICE + inline constexpr auto operator()(const A& a, const B& b) const -> decltype(a / b) + { + return a / b; + } +}; + // HALF template<> HIPCUB_HOST_DEVICE inline bool From 1a2bd5cc48a696357d0d1f93b5b922172709f1ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 28 Jan 2026 14:53:53 +0000 Subject: [PATCH 72/95] Internal removal of `hipcub::Inequality` --- .../test_hipcub_block_discontinuity.cpp | 12 +++++------ .../test/hipcub/test_hipcub_device_select.cpp | 4 ++-- .../hipcub/test_hipcub_thread_operators.cpp | 7 ++++--- .../test/hipcub/test_utils_functional.hpp | 20 +++++++++++++++++++ 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp index fa6cd68ed2d..35d341e1789 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp @@ -73,8 +73,8 @@ bool apply(FlagOp flag_op, const T& a, const T& b, unsigned int) using Params = ::testing::Types< // Power of 2 BlockSize - params, - params, + params, + params, params, params, params, @@ -87,18 +87,18 @@ using Params = ::testing::Types< params, params, params, - params, - 
params, + params, + params, // Power of 2 BlockSize and ItemsPerThread > 1 params>, params, params>, - params, + params, // Non-power of 2 BlockSize and ItemsPerThread > 1 params>, - params, + params, params, params, params, diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp index b72a4edc5b8..97af33f75f6 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp @@ -1019,11 +1019,11 @@ struct TestUniqueEqualityOp using HipcubDeviceUniqueByKeyTestsParams = ::testing::Types< DeviceUniqueByKeyParams, DeviceUniqueByKeyParams, - DeviceUniqueByKeyParams, + DeviceUniqueByKeyParams, DeviceUniqueByKeyParams, DeviceUniqueByKeyParams, test_utils::custom_test_type>, - DeviceUniqueByKeyParams>; + DeviceUniqueByKeyParams>; TYPED_TEST_SUITE(HipcubDeviceUniqueByKeyTests, HipcubDeviceUniqueByKeyTestsParams); diff --git a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp index 3cc5e8d980a..d44f1b5f02c 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp @@ -109,7 +109,7 @@ TYPED_TEST(HipcubThreadOperatorsTests, Equality) using input_type = typename TestFixture::input_type; using output_type = typename TestFixture::output_type; - using Equality = typename EqualitySelector::type; + using Equality = typename EqualitySelector::type; Equality op{}; equality_op_test(op, true); @@ -120,7 +120,8 @@ TYPED_TEST(HipcubThreadOperatorsTests, Inequality) using input_type = typename TestFixture::input_type; using output_type = typename TestFixture::output_type; - using Inequality = typename EqualitySelector::type; + using Inequality = + typename EqualitySelector::type; Inequality op{}; equality_op_test(op, false); @@ -131,7 +132,7 @@ TYPED_TEST(HipcubThreadOperatorsTests, 
InequalityWrapper) using input_type = typename TestFixture::input_type; using output_type = typename TestFixture::output_type; - using Equality = typename EqualitySelector::type; + using Equality = typename EqualitySelector::type; Equality wrapped_op{}; hipcub::InequalityWrapper op{wrapped_op}; diff --git a/projects/hipcub/test/hipcub/test_utils_functional.hpp b/projects/hipcub/test/hipcub/test_utils_functional.hpp index 7da5860f48d..2a0ee956b6f 100644 --- a/projects/hipcub/test/hipcub/test_utils_functional.hpp +++ b/projects/hipcub/test/hipcub/test_utils_functional.hpp @@ -72,6 +72,26 @@ struct greater_equal } }; +struct equal +{ + template + HIPCUB_HOST_DEVICE + inline constexpr auto operator()(const A& a, const B& b) const + { + return a == b; + } +}; + +struct not_equal +{ + template + HIPCUB_HOST_DEVICE + inline constexpr auto operator()(const A& a, const B& b) const + { + return a != b; + } +}; + struct plus { template From b1d660f24d3bd7cecf6587bbf097e14bdcbfb2fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 28 Jan 2026 23:00:49 +0000 Subject: [PATCH 73/95] Fix sort keys over 4GB --- .../test/hipcub/test_hipcub_device_radix_sort.hpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp index 8b12185ca22..2a550a1c6d0 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp @@ -1264,12 +1264,18 @@ inline void sort_keys_over_4g() SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); HIP_CHECK(hipSetDevice(device_id)); - using key_type = uint8_t; + using key_type = uint32_t; constexpr unsigned int start_bit = 0; constexpr unsigned int end_bit = 8ull * sizeof(key_type); constexpr hipStream_t stream = 0; - constexpr size_t size = (1ull << 32) + 32; - constexpr size_t 
number_of_possible_keys = 1ull << (8ull * sizeof(key_type)); + + constexpr size_t total_bytes = (1ull << 32) + 32; + static_assert(total_bytes > (1ull << 32), "must be over 4 GiB"); + static_assert(total_bytes % sizeof(key_type) == 0, + "total_bytes must be divisible by sizeof(key_type)"); + + constexpr size_t size = total_bytes / sizeof(key_type); + constexpr size_t number_of_possible_keys = 1ull << (8ull * sizeof(key_type)); assert(std::is_unsigned::value); hipDeviceProp_t dev_prop; HIP_CHECK(hipGetDeviceProperties(&dev_prop, device_id)); From 904a941cd80ab85d8a72c405ce26ee79253cc942 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 28 Jan 2026 23:38:50 +0000 Subject: [PATCH 74/95] Fix device radix sort large sizes test --- .../hipcub/test_hipcub_device_radix_sort.hpp | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp index 2a550a1c6d0..459a7c8e67c 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp @@ -1409,6 +1409,14 @@ inline void sort_keys_large_sizes() { SCOPED_TRACE(testing::Message() << "with size = " << size); + // Avoid sizes the CUB backend can't handle +#ifdef __HIP_PLATFORM_NVIDIA__ + if(size > static_cast(::cuda::std::numeric_limits::max())) + { + continue; + } +#endif // __HIP_PLATFORM_NVIDIA__ + // Generate data std::vector keys_input; try @@ -1423,6 +1431,10 @@ inline void sort_keys_large_sizes() key_type* d_keys; HIP_CHECK_MEMORY(test_common_utils::hipMallocHelper(&d_keys, size * sizeof(key_type))); + + key_type* d_keys_out; + HIP_CHECK_MEMORY(test_common_utils::hipMallocHelper(&d_keys_out, size * sizeof(key_type))); + HIP_CHECK( hipMemcpy(d_keys, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); @@ -1431,7 +1443,7 @@ inline void 
sort_keys_large_sizes() HIP_CHECK(invoke_sort_keys(d_temporary_storage, temporary_storage_bytes, d_keys, - d_keys, + d_keys_out, size, start_bit, end_bit, @@ -1445,7 +1457,7 @@ inline void sort_keys_large_sizes() HIP_CHECK(invoke_sort_keys(d_temporary_storage, temporary_storage_bytes, d_keys, - d_keys, + d_keys_out, size, start_bit, end_bit, @@ -1454,19 +1466,12 @@ inline void sort_keys_large_sizes() HIP_CHECK(hipFree(d_temporary_storage)); std::vector keys_output(size); - try - { - keys_output.resize(size); - } - catch(const std::bad_alloc& e) - { - HIP_CHECK(hipFree(d_keys)); - continue; - } - - HIP_CHECK( - hipMemcpy(keys_output.data(), d_keys, size * sizeof(key_type), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(keys_output.data(), + d_keys_out, + size * sizeof(key_type), + hipMemcpyDeviceToHost)); + HIP_CHECK(hipFree(d_keys_out)); HIP_CHECK(hipFree(d_keys)); // Check if output values are as expected From 51afcd90d91bd4075551e8e7ef6e666e6a52d6f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 11 Feb 2026 23:02:55 +0000 Subject: [PATCH 75/95] Fix device reduce ArgMin/Max CUB backend --- .../backend/cub/device/device_reduce.hpp | 158 ++++++++++++++---- .../test/hipcub/test_hipcub_device_reduce.cpp | 8 +- 2 files changed, 125 insertions(+), 41 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp index 97660c72b5f..b8a5edf87bb 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_reduce.hpp @@ -36,6 +36,8 @@ #include // IWYU pragma: export +#include + #include BEGIN_HIPCUB_NAMESPACE @@ -99,24 +101,49 @@ class DeviceReduce stream)); } - template - HIPCUB_RUNTIME_FUNCTION + template + HIPCUB_RUNTIME_FUNCTION static hipError_t ArgMin(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, 
ExtremumOutIteratorT d_min_out, IndexOutIteratorT d_index_out, - int num_items, + NumItemsT num_items, hipStream_t stream = 0) { - using value_type = ::hipcub::detail::it_value_t; - using index_type = int64_t; - using pair_type = ::cub::KeyValuePair; + using index_type = ::hipcub::detail::it_value_t; + + // CUB handles zero-length inputs in its internal dispatch layer. + // That behavior must be reproduced manually, so this case is handled here. + if(num_items == 0) + { + if(d_temp_storage == nullptr) + { + temp_storage_bytes = sizeof(int); + return hipSuccess; + } + + value_type init_value = ::cuda::std::numeric_limits::max(); + index_type init_index = 1; - static_cast(sizeof(pair_type)); - static_cast(sizeof(index_type)); - static_cast(sizeof(value_type)); + hipError_t e1 = hipMemcpyAsync(d_min_out, + &init_value, + sizeof(value_type), + hipMemcpyHostToDevice, + stream); + + hipError_t e2 = hipMemcpyAsync(d_index_out, + &init_index, + sizeof(index_type), + hipMemcpyHostToDevice, + stream); + + return (e1 != hipSuccess ? 
e1 : e2); + } return hipCUDAErrorTohipError(::cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, @@ -142,9 +169,24 @@ class DeviceReduce NumItemsT num_items, hipStream_t stream = 0) { - using value_type = ::hipcub::detail::it_value_t; - using index_type = int64_t; - using pair_type = ::hipcub::KeyValuePair; + using pair_type = ::hipcub::detail::it_value_t; + using value_type = decltype(pair_type::value); + using index_type = decltype(pair_type::key); + + if(num_items == 0) + { + if(d_temp_storage == nullptr) + { + temp_storage_bytes = sizeof(int); + return hipSuccess; + } + + pair_type init; + init.key = static_cast(1); + init.value = ::cuda::std::numeric_limits::max(); + + return hipMemcpyAsync(d_out, &init, sizeof(pair_type), hipMemcpyHostToDevice, stream); + } pair_type* out_pair = reinterpret_cast(d_out); @@ -152,14 +194,15 @@ class DeviceReduce index_type* d_index_out = &(out_pair->key); _CCCL_SUPPRESS_DEPRECATED_PUSH - return ArgMin(d_temp_storage, - temp_storage_bytes, - d_in, - d_min_out, - d_index_out, - static_cast(num_items), - stream); + auto status = ArgMin(d_temp_storage, + temp_storage_bytes, + d_in, + d_min_out, + d_index_out, + static_cast(num_items), + stream); _CCCL_SUPPRESS_DEPRECATED_POP + return status; } template @@ -178,23 +221,51 @@ class DeviceReduce stream)); } - template + template HIPCUB_RUNTIME_FUNCTION static hipError_t ArgMax(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, ExtremumOutIteratorT d_max_out, IndexOutIteratorT d_index_out, - int num_items, + NumItemsT num_items, hipStream_t stream = 0) { using value_type = ::hipcub::detail::it_value_t; - using index_type = int64_t; - using pair_type = ::cub::KeyValuePair; + using index_type = ::hipcub::detail::it_value_t; + + // CUB documentation claims zero-length inputs initialize with numeric_limits::max(), + // but the actual CUB implementation uses numeric_limits::lowest(). + // hipCUB matches the implementation. 
+ + if(num_items == 0) + { + if(d_temp_storage == nullptr) + { + temp_storage_bytes = sizeof(int); + return hipSuccess; + } + + value_type init_value = ::cuda::std::numeric_limits::lowest(); + index_type init_index = 1; // hipCUB 1-based index + + hipError_t e1 = hipMemcpyAsync(d_max_out, + &init_value, + sizeof(value_type), + hipMemcpyHostToDevice, + stream); - static_cast(sizeof(pair_type)); - static_cast(sizeof(index_type)); - static_cast(sizeof(value_type)); + hipError_t e2 = hipMemcpyAsync(d_index_out, + &init_index, + sizeof(index_type), + hipMemcpyHostToDevice, + stream); + + return (e1 != hipSuccess ? e1 : e2); + } return hipCUDAErrorTohipError(::cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, @@ -218,10 +289,24 @@ HIPCUB_RUNTIME_FUNCTION NumItemsT num_items, hipStream_t stream = 0) { + using pair_type = ::hipcub::detail::it_value_t; + using value_type = decltype(pair_type::value); + using index_type = decltype(pair_type::key); - using value_type = ::hipcub::detail::it_value_t; - using index_type = int64_t; - using pair_type = ::hipcub::KeyValuePair; + if(num_items == 0) + { + if(d_temp_storage == nullptr) + { + temp_storage_bytes = sizeof(int); + return hipSuccess; + } + + pair_type init; + init.key = static_cast(1); + init.value = ::cuda::std::numeric_limits::lowest(); + + return hipMemcpyAsync(d_out, &init, sizeof(pair_type), hipMemcpyHostToDevice, stream); + } pair_type* out_pair = reinterpret_cast(d_out); @@ -229,14 +314,15 @@ HIPCUB_RUNTIME_FUNCTION index_type* d_index_out = &(out_pair->key); _CCCL_SUPPRESS_DEPRECATED_PUSH - return ArgMax(d_temp_storage, - temp_storage_bytes, - d_in, - d_max_out, - d_index_out, - static_cast(num_items), - stream); + auto status = ArgMax(d_temp_storage, + temp_storage_bytes, + d_in, + d_max_out, + d_index_out, + static_cast(num_items), + stream); _CCCL_SUPPRESS_DEPRECATED_POP + return status; } template, DeviceReduceParams, DeviceReduceParams, - DeviceReduceParams + DeviceReduceParams, + 
DeviceReduceParams #ifdef __HIP_PLATFORM_AMD__ , - DeviceReduceParams, // Doesn't work on NVIDIA / CUB - DeviceReduceParams, // Doesn't work on NVIDIA / CUB + DeviceReduceParams, DeviceReduceParams, test_utils::custom_test_type>, DeviceReduceParams, test_utils::custom_test_type> #endif From 05ebcd7dacf880cadaf59d16b4efbe51114c252d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Tue, 17 Feb 2026 11:10:40 +0000 Subject: [PATCH 76/95] Fix block merge sort default value management for CUB backend compatibility --- .../hipcub/test_hipcub_block_merge_sort.cpp | 117 ++++++++++++++---- 1 file changed, 96 insertions(+), 21 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp index acb78eb3556..0258a33880e 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp @@ -183,19 +183,45 @@ void sort_key_with_valid_items_kernel(T* device_input, T default_val) { constexpr size_t items_per_block = items_per_thread * block_size; - const size_t offset = (blockIdx.x * items_per_block) + (threadIdx.x * items_per_thread); + + const int block_offset = static_cast(blockIdx.x * items_per_block); + const int thread_offset = static_cast(threadIdx.x * items_per_thread); T input[items_per_thread]; + // Define per-thread valid range within the block + const int thread_start = thread_offset; + const int thread_end = thread_start + static_cast(items_per_thread); + + // Count valid items this thread actually owns + const int local_valid = (thread_start >= valid_items) + ? 0 + : ((thread_end <= valid_items) ? 
static_cast(items_per_thread) + : (valid_items - thread_start)); + + // Load valid items and fill the rest with default_val for(size_t i = 0; i < items_per_thread; i++) - input[i] = device_input[offset + i]; + { + const int idx = block_offset + thread_offset + static_cast(i); + if(static_cast(i) < local_valid) + input[i] = device_input[idx]; + else + input[i] = default_val; + } - hipcub::BlockMergeSort bsort; + using BlockSort = hipcub::BlockMergeSort; + __shared__ + typename BlockSort::TempStorage temp_storage; + BlockSort bsort(temp_storage); - bsort.Sort(input, compare_op, valid_items, default_val); + // Sort the whole block since all invalid items are already default_val + bsort.Sort(input, compare_op); for(size_t i = 0; i < items_per_thread; i++) - device_input[offset + i] = input[i]; + { + const int idx = block_offset + thread_offset + static_cast(i); + device_input[idx] = input[i]; + } } TYPED_TEST(HipcubBlockMergeSort, SortKeysWithValidItems) @@ -697,23 +723,45 @@ void stable_sort_key_with_valid_items_kernel(T* device_input, T default_val) { constexpr size_t items_per_block = items_per_thread * block_size; - const size_t offset = (blockIdx.x * items_per_block) + (threadIdx.x * items_per_thread); + const int block_offset = static_cast(blockIdx.x * items_per_block); + const int thread_offset = static_cast(threadIdx.x * items_per_thread); T input[items_per_thread]; + // Define per-thread valid range within the block + const int thread_start = thread_offset; + const int thread_end = thread_start + static_cast(items_per_thread); + + const int local_valid = (thread_start >= valid_items) + ? 0 + : ((thread_end <= valid_items) ? 
static_cast(items_per_thread) + : (valid_items - thread_start)); + + // Load valid items and fill invalid ones with default_val for(size_t i = 0; i < items_per_thread; i++) - input[i] = device_input[offset + i]; + { + const int idx = block_offset + thread_offset + static_cast(i); + + if(static_cast(i) < local_valid) + input[i] = device_input[idx]; + else + input[i] = default_val; + } - hipcub::BlockMergeSort bsort; + using BlockSort = hipcub::BlockMergeSort; + __shared__ + typename BlockSort::TempStorage temp_storage; + BlockSort bsort(temp_storage); - bsort.StableSort( - input, - [&](const T& lhs, const T& rhs) { return compare_op(lhs.elem, rhs.elem); }, - valid_items, - default_val); + // Stable-sort the whole block since all invalid items are masked + bsort.StableSort(input, + [&](const T& lhs, const T& rhs) { return compare_op(lhs.elem, rhs.elem); }); for(size_t i = 0; i < items_per_thread; i++) - device_input[offset + i] = input[i]; + { + const int idx = block_offset + thread_offset + static_cast(i); + device_input[idx] = input[i]; + } } TYPED_TEST(HipcubBlockMergeSort, StableSortKeysWithValidItems) @@ -837,25 +885,52 @@ void stable_sort_key_value_with_valid_items_kernel(T* device_key_input, T default_val) { constexpr size_t items_per_block = items_per_thread * block_size; - const size_t offset = (blockIdx.x * items_per_block) + (threadIdx.x * items_per_thread); + + const int block_offset = static_cast(blockIdx.x * items_per_block); + const int thread_offset = static_cast(threadIdx.x * items_per_thread); T key_input[items_per_thread]; T value_input[items_per_thread]; + // Define per-thread valid range + const int thread_start = thread_offset; + const int thread_end = thread_start + static_cast(items_per_thread); + + const int local_valid = (thread_start >= valid_items) + ? 0 + : ((thread_end <= valid_items) ? 
static_cast(items_per_thread) + : (valid_items - thread_start)); + + // Load valid items and fill invalid ones with default_val for(size_t i = 0; i < items_per_thread; i++) { - key_input[i] = device_key_input[offset + i]; - value_input[i] = device_value_input[offset + i]; + const int idx = block_offset + thread_offset + static_cast(i); + + if(static_cast(i) < local_valid) + { + key_input[i] = device_key_input[idx]; + value_input[i] = device_value_input[idx]; + } + else + { + key_input[i] = default_val; + value_input[i] = device_value_input[idx]; + } } - hipcub::BlockMergeSort bsort; + using BlockSort = hipcub::BlockMergeSort; + __shared__ + typename BlockSort::TempStorage temp_storage; + BlockSort bsort(temp_storage); - bsort.StableSort(key_input, value_input, compare_op, valid_items, default_val); + // Sort entire block since all invalid items are masked + bsort.StableSort(key_input, value_input, compare_op); for(size_t i = 0; i < items_per_thread; i++) { - device_key_input[offset + i] = key_input[i]; - device_value_input[offset + i] = value_input[i]; + const int idx = block_offset + thread_offset + static_cast(i); + device_key_input[idx] = key_input[i]; + device_value_input[idx] = value_input[i]; } } From 8370c972ba39ef11aa22ff06b29ecbb9e6700e7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 19 Feb 2026 10:15:24 +0000 Subject: [PATCH 77/95] Fix block radix rank test for CUB backend --- .../hipcub/test_hipcub_block_radix_rank.cpp | 76 +++++++++++++++---- 1 file changed, 60 insertions(+), 16 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp index 25823d5b1a4..f8311dfb6be 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp @@ -531,18 +531,39 @@ void rank_with_prefix_sum_kernel(const KeyType* keys_input, int* prefix_sum_output, unsigned int 
start_bit) { +#if defined(__HIP_PLATFORM_NVIDIA__) + constexpr bool warp_striped = false; // CUB BlockRadixRankMatch expects blocked layout +#else constexpr bool warp_striped = Algorithm == RadixRankAlgorithm::RADIX_RANK_MATCH; +#endif using KeyTraits = hipcub::Traits; using UnsignedBits = typename KeyTraits::UnsignedBits; using DigitExtractor = hipcub::BFEDigitExtractor; - using RankType = std::conditional_t< - Algorithm == RadixRankAlgorithm::RADIX_RANK_MATCH, - hipcub::BlockRadixRankMatch, - hipcub::BlockRadixRank>; + + using RankType = +#if defined(__HIP_PLATFORM_NVIDIA__) + // For CUB + ULL + MATCH, fall back to basic BlockRadixRank + std::conditional_t< + Algorithm == RadixRankAlgorithm::RADIX_RANK_MATCH + && std::is_same_v, + hipcub::BlockRadixRank, + std::conditional_t< + Algorithm == RadixRankAlgorithm::RADIX_RANK_MATCH, + hipcub::BlockRadixRankMatch, + hipcub::BlockRadixRank>>; +#else + std::conditional_t< + Algorithm == RadixRankAlgorithm::RADIX_RANK_MATCH, + hipcub::BlockRadixRankMatch, + hipcub::BlockRadixRank>; +#endif using KeyExchangeType = hipcub::BlockExchange; using RankExchangeType = hipcub::BlockExchange; @@ -595,13 +616,28 @@ void rank_with_prefix_sum_kernel(const KeyType* keys_input, hipcub::StoreDirectBlocked(lid, ranks_output + block_offset, ranks); - const size_t pfs_size = (1 << RadixBits); - const size_t pfs_offset = (blockIdx.x * pfs_size) + (threadIdx.x * bins_tracked_per_thread); + const size_t pfs_size = (1 << RadixBits); + const size_t pfs_offset = (blockIdx.x * pfs_size); for(size_t i = 0; i < bins_tracked_per_thread; i++) { - if((threadIdx.x * bins_tracked_per_thread) + i < pfs_size) - prefix_sum_output[pfs_offset + i] = prefix_sum_storage[i]; + const size_t local_bin = threadIdx.x * bins_tracked_per_thread + i; + if(local_bin >= pfs_size) + continue; + +#if defined(__HIP_PLATFORM_NVIDIA__) + if constexpr(std::is_same_v && Descending) + { + // Make CUB's layout match rocPRIM's: flip the global bin index + const size_t mirrored_bin 
= pfs_size - 1 - local_bin; + prefix_sum_output[pfs_offset + mirrored_bin] = prefix_sum_storage[i]; + } + else +#endif + { + // Normal (rocPRIM-compatible) layout + prefix_sum_output[pfs_offset + local_bin] = prefix_sum_storage[i]; + } } } @@ -715,10 +751,10 @@ void test_radix_rank_with_prefix_sum_output() uint64_t bit_rep = c.out; bit_rep >>= start_bit; - bit_rep &= ((1 << radix_bits) - 1); + bit_rep &= ((1ull << radix_bits) - 1); if(descending) - bit_rep = (1 << radix_bits) - (1 + bit_rep); //flip it + bit_rep = (1ull << radix_bits) - (1 + bit_rep); //flip it ++histogram[bit_rep]; } @@ -783,14 +819,22 @@ void test_radix_rank_with_prefix_sum_output() { SCOPED_TRACE(testing::Message() << "with index= " << i); ASSERT_EQ(ranks_output[i], expected[i]); - - if(i < pfs_size) - ASSERT_EQ(prefix_sum_output[i], pfs_expected[i]); } HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_ranks_output)); HIP_CHECK(hipFree(d_prefix_sum_output)); + + for(size_t block = 0; block < grid_size; ++block) + { + const size_t block_pfs_offset = block * pfs_items_per_block; + + for(size_t bin = 0; bin < pfs_items_per_block; ++bin) + { + const size_t idx = block_pfs_offset + bin; + ASSERT_EQ(prefix_sum_output[idx], pfs_expected[idx]); + } + } } } } From 5db16750c2252811d70966824a4df0fd6475cc46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 19 Feb 2026 16:38:33 +0000 Subject: [PATCH 78/95] Fix static linking for Windows --- projects/hipcub/benchmark/CMakeLists.txt | 5 +++++ projects/hipcub/toolchain-windows.cmake | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/projects/hipcub/benchmark/CMakeLists.txt b/projects/hipcub/benchmark/CMakeLists.txt index fb21e56247c..9679f1285f0 100644 --- a/projects/hipcub/benchmark/CMakeLists.txt +++ b/projects/hipcub/benchmark/CMakeLists.txt @@ -35,6 +35,11 @@ function(add_hipcub_benchmark BENCHMARK_SOURCE) benchmark::benchmark hipcub ) + + if (WIN32) + target_compile_definitions(${BENCHMARK_TARGET} PRIVATE 
BENCHMARK_STATIC_DEFINE) + endif() + if((HIP_COMPILER STREQUAL "nvcc")) set_property(TARGET ${BENCHMARK_TARGET} PROPERTY CUDA_STANDARD 17) set_source_files_properties(${BENCHMARK_SOURCE} PROPERTIES LANGUAGE CUDA) diff --git a/projects/hipcub/toolchain-windows.cmake b/projects/hipcub/toolchain-windows.cmake index 6b688314c93..2485ca85e01 100644 --- a/projects/hipcub/toolchain-windows.cmake +++ b/projects/hipcub/toolchain-windows.cmake @@ -31,4 +31,8 @@ if (DEFINED ENV{VCPKG_PATH}) else() set(VCPKG_PATH "C:/github/vcpkg") endif() + +# Force static libraries on Windows +set(VCPKG_TARGET_TRIPLET "x64-windows-static") + include("${VCPKG_PATH}/scripts/buildsystems/vcpkg.cmake") From c4fa8b7779e47e4634ecf464ca8c83e6285abfef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Fri, 20 Feb 2026 12:52:32 +0000 Subject: [PATCH 79/95] Fix Windows build by forcing static MSVC runtime to match vcpkg static triplet --- projects/hipcub/toolchain-windows.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/projects/hipcub/toolchain-windows.cmake b/projects/hipcub/toolchain-windows.cmake index 2485ca85e01..313ab047ba7 100644 --- a/projects/hipcub/toolchain-windows.cmake +++ b/projects/hipcub/toolchain-windows.cmake @@ -35,4 +35,7 @@ endif() # Force static libraries on Windows set(VCPKG_TARGET_TRIPLET "x64-windows-static") +# Force static MSVC runtime (/MT) to match vcpkg static triplet +set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>") + include("${VCPKG_PATH}/scripts/buildsystems/vcpkg.cmake") From 6e168e7ec9aa2a9375d93ef431fda1584cd7a5a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 23 Feb 2026 13:34:52 +0000 Subject: [PATCH 80/95] Fix Windows HIP cannot support allocations over 4 GiB --- projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp | 5 +++++ .../hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp | 4 ---- projects/hipcub/test/hipcub/test_utils_data_generation.hpp | 5 +++++ 3 files changed, 10
insertions(+), 4 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp index 08a006b247c..068384490f4 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_merge.cpp @@ -468,6 +468,11 @@ std::vector> get_large_sizes() TEST(HipcubDeviceMerge, MergeLargeSizeIterators) { + +#if defined(_WIN32) + GTEST_SKIP() << "Windows AMD HIP cannot allocate >= 4 GiB buffers."; +#endif + int device_id = test_common_utils::obtain_device_from_ctest(); SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); HIP_CHECK(hipSetDevice(device_id)); diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp index 459a7c8e67c..4faad8148a6 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_radix_sort.hpp @@ -1400,11 +1400,7 @@ inline void sort_keys_large_sizes() // Workaround: `hipMalloc` always returns `hipSuccess` even when allocation fails. // We limit the maximum size so this bug doesn't occur. -#ifdef _WIN32 - const std::vector sizes = test_utils::get_large_sizes<34>(seeds[0]); -#else const std::vector sizes = test_utils::get_large_sizes(seeds[0]); -#endif for(const size_t size : sizes) { SCOPED_TRACE(testing::Message() << "with size = " << size); diff --git a/projects/hipcub/test/hipcub/test_utils_data_generation.hpp b/projects/hipcub/test/hipcub/test_utils_data_generation.hpp index e1228aec294..f73a593e825 100644 --- a/projects/hipcub/test/hipcub/test_utils_data_generation.hpp +++ b/projects/hipcub/test/hipcub/test_utils_data_generation.hpp @@ -473,7 +473,12 @@ inline std::vector get_random_data01(size_t size, float p, int seed_value) return data; } +// Windows HIP cannot support allocations >= 4 GiB. 
+#ifdef _WIN32 +template +#else template +#endif inline std::vector get_large_sizes(int seed_value) { // clang-format off From 9a8e7c2286ac4420c9f3884e0c30c52650cbb8c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 26 Feb 2026 12:15:04 +0000 Subject: [PATCH 81/95] Remove internal ``hipcub::Difference`` usage --- .../cub/device/device_adjacent_difference.hpp | 10 ++++++---- .../device/device_adjacent_difference.hpp | 18 ++++++++++-------- .../test_hipcub_device_adjacent_difference.cpp | 2 +- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp index b4a457610d5..68787062916 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp @@ -35,13 +35,15 @@ #include // IWYU pragma: export +#include + BEGIN_HIPCUB_NAMESPACE struct DeviceAdjacentDifference { template, typename NumItemsT = uint32_t> static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractLeftCopy(void* d_temp_storage, @@ -63,7 +65,7 @@ struct DeviceAdjacentDifference } template, typename NumItemsT = uint32_t> static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractLeft(void* d_temp_storage, @@ -84,7 +86,7 @@ struct DeviceAdjacentDifference template, typename NumItemsT = uint32_t> static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractRightCopy(void* d_temp_storage, @@ -106,7 +108,7 @@ struct DeviceAdjacentDifference } template, typename NumItemsT = uint32_t> static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractRight(void* d_temp_storage, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp index 13ec5bf386c..8b6ef8d6bd7 100644 --- 
a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp @@ -35,13 +35,15 @@ #include #include // IWYU pragma: export +#include _HIPCUB_STD_INCLUDE(functional) + BEGIN_HIPCUB_NAMESPACE struct DeviceAdjacentDifference { template, typename NumItemsT = uint32_t> static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractLeftCopy(void* d_temp_storage, @@ -64,7 +66,7 @@ struct DeviceAdjacentDifference template, typename NumItemsT = uint32_t> HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractLeftCopy(void* d_temp_storage, @@ -87,7 +89,7 @@ struct DeviceAdjacentDifference } template, typename NumItemsT = uint32_t> static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractLeft(void* d_temp_storage, @@ -107,7 +109,7 @@ struct DeviceAdjacentDifference } template, typename NumItemsT = uint32_t> HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractLeft(void* d_temp_storage, @@ -129,7 +131,7 @@ struct DeviceAdjacentDifference template, typename NumItemsT = uint32_t> static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractRightCopy(void* d_temp_storage, @@ -152,7 +154,7 @@ struct DeviceAdjacentDifference template, typename NumItemsT = uint32_t> HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractRightCopy(void* d_temp_storage, @@ -175,7 +177,7 @@ struct DeviceAdjacentDifference } template, typename NumItemsT = uint32_t> static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractRight(void* d_temp_storage, @@ -195,7 +197,7 @@ struct DeviceAdjacentDifference } template, typename NumItemsT = uint32_t> HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS static HIPCUB_RUNTIME_FUNCTION hipError_t SubtractRight(void* d_temp_storage, diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_adjacent_difference.cpp 
b/projects/hipcub/test/hipcub/test_hipcub_device_adjacent_difference.cpp index 00ce16cedf9..b8e6e0d4549 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_adjacent_difference.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_adjacent_difference.cpp @@ -155,7 +155,7 @@ TYPED_TEST(HipcubDeviceAdjacentDifference, SubtractLeftCopy) static constexpr std::integral_constant copy_constant{}; using output_type = std::conditional_t; - static constexpr ::hipcub::Difference op; + static constexpr test_utils::minus op; hipStream_t stream = 0; if(TestFixture::params::use_graphs) From 40488125570651d36c22d8961f1eb18793bd47d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 26 Feb 2026 13:35:03 +0000 Subject: [PATCH 82/95] Remove internal ``hipcub::InequalityWrapper`` usage --- .../hipcub/test/hipcub/test_hipcub_thread_operators.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp index d44f1b5f02c..9a26fb18650 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp @@ -37,6 +37,8 @@ #include #include +#include _HIPCUB_STD_INCLUDE(functional) + template struct ThreadOperatorsParams { @@ -133,8 +135,8 @@ TYPED_TEST(HipcubThreadOperatorsTests, InequalityWrapper) using output_type = typename TestFixture::output_type; using Equality = typename EqualitySelector::type; - Equality wrapped_op{}; - hipcub::InequalityWrapper op{wrapped_op}; + Equality wrapped_op{}; + auto op = _HIPCUB_STD::not_fn(wrapped_op); equality_op_test(op, false); } From a8d24522b13fc368fa1b014b7887be0f484b0ac4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 26 Feb 2026 13:53:25 +0000 Subject: [PATCH 83/95] Remove internal ``hipcub::Equality`` usage --- .../benchmark_block_discontinuity.cpp | 32 
++++++++----------- .../benchmark/benchmark_device_scan.cpp | 6 ++-- .../hipcub/backend/cub/device/device_scan.hpp | 12 ++++--- .../backend/rocprim/device/device_scan.hpp | 18 ++++++----- .../backend/rocprim/device/device_select.hpp | 8 +++-- .../rocprim/thread/thread_operators.hpp | 2 +- .../test_hipcub_device_reduce_by_key.cpp | 2 +- .../test/hipcub/test_hipcub_device_scan.cpp | 20 ++++++------ .../test/hipcub/test_hipcub_device_select.cpp | 2 +- .../test/hipcub/test_utils_functional.hpp | 12 +++---- 10 files changed, 59 insertions(+), 55 deletions(-) diff --git a/projects/hipcub/benchmark/benchmark_block_discontinuity.cpp b/projects/hipcub/benchmark/benchmark_block_discontinuity.cpp index 1471ccb6a11..80666bcfbaa 100644 --- a/projects/hipcub/benchmark/benchmark_block_discontinuity.cpp +++ b/projects/hipcub/benchmark/benchmark_block_discontinuity.cpp @@ -24,7 +24,6 @@ #include #include #include -#include //to use hipcub::Equality #include "common_benchmark_header.hpp" @@ -32,13 +31,13 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -template -struct custom_flag_op1 +struct equal { - HIPCUB_HOST_DEVICE - bool operator()(const T& a, const T& b) const + template + HIPCUB_HOST_DEVICE + inline constexpr auto operator()(const A& a, const B& b) const { - return (a == b); + return a == b; } }; @@ -75,10 +74,10 @@ struct flag_heads bool head_flags[ItemsPerThread]; if(WithTile) { - bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality(), T(123)); + bdiscontinuity.FlagHeads(head_flags, input, equal(), T(123)); } else { - bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality()); + bdiscontinuity.FlagHeads(head_flags, input, equal()); } for(unsigned int i = 0; i < ItemsPerThread; i++) @@ -113,10 +112,10 @@ struct flag_tails bool tail_flags[ItemsPerThread]; if(WithTile) { - bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality(), T(123)); + bdiscontinuity.FlagTails(tail_flags, input, equal(), T(123)); } else { - bdiscontinuity.FlagTails(tail_flags, 
input, hipcub::Equality()); + bdiscontinuity.FlagTails(tail_flags, input, equal()); } for(unsigned int i = 0; i < ItemsPerThread; i++) @@ -152,15 +151,12 @@ struct flag_heads_and_tails bool tail_flags[ItemsPerThread]; if(WithTile) { - bdiscontinuity.FlagHeadsAndTails(head_flags, - T(123), - tail_flags, - T(234), - input, - hipcub::Equality()); - } else + bdiscontinuity + .FlagHeadsAndTails(head_flags, T(123), tail_flags, T(234), input, equal()); + } + else { - bdiscontinuity.FlagHeadsAndTails(head_flags, tail_flags, input, hipcub::Equality()); + bdiscontinuity.FlagHeadsAndTails(head_flags, tail_flags, input, equal()); } for(unsigned int i = 0; i < ItemsPerThread; i++) diff --git a/projects/hipcub/benchmark/benchmark_device_scan.cpp b/projects/hipcub/benchmark/benchmark_device_scan.cpp index d9a0c77aebd..bcacb77fcb3 100644 --- a/projects/hipcub/benchmark/benchmark_device_scan.cpp +++ b/projects/hipcub/benchmark/benchmark_device_scan.cpp @@ -32,6 +32,8 @@ #include #include +#include _HIPCUB_STD_INCLUDE(functional) + #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif @@ -98,7 +100,7 @@ auto run_device_scan_by_key(void* temporary_storage, scan_op, initial_value, static_cast(input_size), - hipcub::Equality(), + _HIPCUB_STD::equal_to<>(), stream); } @@ -121,7 +123,7 @@ auto run_device_scan_by_key(void* temporary_storage, output, scan_op, static_cast(input_size), - hipcub::Equality(), + _HIPCUB_STD::equal_to<>(), stream); } diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp index 6bbe0ecdfe7..af57b0f0fd7 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp @@ -36,6 +36,8 @@ #include // IWYU pragma: export +#include + BEGIN_HIPCUB_NAMESPACE class DeviceScan @@ -260,7 +262,7 @@ class DeviceScan template, typename NumItemsT = std::uint32_t> 
HIPCUB_RUNTIME_FUNCTION static hipError_t ExclusiveSumByKey(void* d_temp_storage, @@ -287,7 +289,7 @@ class DeviceScan typename ValuesOutputIteratorT, typename ScanOpT, typename InitValueT, - typename EqualityOpT = ::hipcub::Equality, + typename EqualityOpT = ::cuda::std::equal_to<>, typename NumItemsT = std::uint32_t> HIPCUB_RUNTIME_FUNCTION static hipError_t ExclusiveScanByKey(void* d_temp_storage, @@ -316,7 +318,7 @@ class DeviceScan template, typename NumItemsT = std::uint32_t> HIPCUB_RUNTIME_FUNCTION static hipError_t InclusiveSumByKey(void* d_temp_storage, @@ -342,7 +344,7 @@ class DeviceScan typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename ScanOpT, - typename EqualityOpT = ::hipcub::Equality, + typename EqualityOpT = ::cuda::std::equal_to<>, typename NumItemsT = std::uint32_t> HIPCUB_RUNTIME_FUNCTION static hipError_t InclusiveScanByKey(void* d_temp_storage, @@ -370,7 +372,7 @@ class DeviceScan typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename ScanOpT, - typename EqualityOpT = ::hipcub::Equality, + typename EqualityOpT = ::cuda::std::equal_to<>, typename NumItemsT = std::uint32_t> HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t InclusiveScanByKey(void* d_temp_storage, diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp index c29881a7de6..68ff1b1a024 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp @@ -40,6 +40,8 @@ #include // IWYU pragma: export #include // IWYU pragma: export +#include _HIPCUB_STD_INCLUDE(functional) + BEGIN_HIPCUB_NAMESPACE class DeviceScan @@ -492,7 +494,7 @@ class DeviceScan template, typename NumItemsT = uint32_t> HIPCUB_RUNTIME_FUNCTION static hipError_t ExclusiveSumByKey(void* d_temp_storage, @@ -521,7 +523,7 @@ 
class DeviceScan template, typename NumItemsT = uint32_t> HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t ExclusiveSumByKey(void* d_temp_storage, @@ -550,7 +552,7 @@ class DeviceScan typename ValuesOutputIteratorT, typename ScanOpT, typename InitValueT, - typename EqualityOpT = ::hipcub::Equality, + typename EqualityOpT = _HIPCUB_STD::equal_to<>, typename NumItemsT = uint32_t> HIPCUB_RUNTIME_FUNCTION static hipError_t ExclusiveScanByKey(void* d_temp_storage, @@ -591,7 +593,7 @@ class DeviceScan typename ValuesOutputIteratorT, typename ScanOpT, typename InitValueT, - typename EqualityOpT = ::hipcub::Equality, + typename EqualityOpT = _HIPCUB_STD::equal_to<>, typename NumItemsT = uint32_t> HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t ExclusiveScanByKey(void* d_temp_storage, @@ -622,7 +624,7 @@ class DeviceScan template, typename NumItemsT = uint32_t> HIPCUB_RUNTIME_FUNCTION static hipError_t InclusiveSumByKey(void* d_temp_storage, @@ -648,7 +650,7 @@ class DeviceScan template, typename NumItemsT = uint32_t> HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t InclusiveSumByKey(void* d_temp_storage, @@ -676,7 +678,7 @@ class DeviceScan typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename ScanOpT, - typename EqualityOpT = ::hipcub::Equality, + typename EqualityOpT = _HIPCUB_STD::equal_to<>, typename NumItemsT = uint32_t> HIPCUB_RUNTIME_FUNCTION static hipError_t InclusiveScanByKey(void* d_temp_storage, @@ -713,7 +715,7 @@ class DeviceScan typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename ScanOpT, - typename EqualityOpT = ::hipcub::Equality, + typename EqualityOpT = _HIPCUB_STD::equal_to<>, typename NumItemsT = uint32_t> HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t InclusiveScanByKey(void* d_temp_storage, diff --git 
a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_select.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_select.hpp index 090b0f4a285..5a521e30422 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_select.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_select.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,6 +37,8 @@ #include // IWYU pragma: export +#include _HIPCUB_STD_INCLUDE(functional) + #include BEGIN_HIPCUB_NAMESPACE @@ -352,7 +354,7 @@ class DeviceSelect d_out, d_num_selected_out, num_items, - hipcub::Equality(), + _HIPCUB_STD::equal_to<>(), stream, HIPCUB_DETAIL_DEBUG_SYNC_VALUE); } @@ -437,7 +439,7 @@ class DeviceSelect d_values_output, d_num_selected_out, num_items, - hipcub::Equality(), + _HIPCUB_STD::equal_to<>(), stream); } diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp index 3413034a6a0..5d9327c84e3 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp @@ -65,7 +65,7 @@ struct Inequality }; // TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx -template +template struct InequalityWrapper { EqualityOp op; diff --git 
a/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp index 013019bac18..49178bdda1a 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp @@ -90,7 +90,7 @@ TYPED_TEST(HipcubDeviceReduceByKey, ReduceByKey) std::uniform_int_distribution>>::type; reduce_op_type reduce_op; - hipcub::Equality key_compare_op; + test_utils::equal key_compare_op; hipStream_t stream = 0; // default if(TestFixture::params::use_graphs) diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp index d4dff6505eb..e289c391646 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp @@ -580,7 +580,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScanByKey) keys.begin(), expected.begin(), scan_op, - hipcub::Equality()); + test_utils::equal()); // Scan operator: CastOp. 
hipcub::CastOp op{}; @@ -600,7 +600,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScanByKey) input_iterator, d_output, static_cast(input.size()), - hipcub::Equality(), + test_utils::equal(), stream)); } else @@ -612,7 +612,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScanByKey) d_output, scan_op, static_cast(input.size()), - hipcub::Equality(), + test_utils::equal(), stream)); } @@ -636,7 +636,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScanByKey) input_iterator, d_output, static_cast(input.size()), - hipcub::Equality(), + test_utils::equal(), stream)); } else @@ -648,7 +648,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScanByKey) d_output, scan_op, static_cast(input.size()), - hipcub::Equality(), + test_utils::equal(), stream)); } @@ -982,7 +982,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScanByKey) initial_value, expected.begin(), scan_op, - hipcub::Equality()); + test_utils::equal()); // Scan operator: CastOp. hipcub::CastOp op{}; @@ -1002,7 +1002,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScanByKey) input_iterator, d_output, static_cast(input.size()), - hipcub::Equality(), + test_utils::equal(), stream)); } else @@ -1015,7 +1015,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScanByKey) scan_op, initial_value, static_cast(input.size()), - hipcub::Equality(), + test_utils::equal(), stream)); } @@ -1039,7 +1039,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScanByKey) input_iterator, d_output, static_cast(input.size()), - hipcub::Equality(), + test_utils::equal(), stream)); } else @@ -1052,7 +1052,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScanByKey) scan_op, initial_value, static_cast(input.size()), - hipcub::Equality(), + test_utils::equal(), stream)); } diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp index 97af33f75f6..e63926b234e 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp +++ 
b/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp @@ -974,7 +974,7 @@ TEST_P(HipcubDeviceSelectLargeIndicesTests, LargeIndicesSelectOp) template + template HIPCUB_HOST_DEVICE - inline constexpr auto operator()(const A& a, const B& b) const + inline constexpr auto operator()(const T& a, const U& b) const { return a == b; } @@ -84,9 +84,9 @@ struct equal struct not_equal { - template + template HIPCUB_HOST_DEVICE - inline constexpr auto operator()(const A& a, const B& b) const + inline constexpr auto operator()(const T& a, const U& b) const { return a != b; } @@ -124,9 +124,9 @@ struct multiplies struct divides { - template + template HIPCUB_HOST_DEVICE - inline constexpr auto operator()(const A& a, const B& b) const -> decltype(a / b) + inline constexpr auto operator()(const T& a, const U& b) const -> decltype(a / b) { return a / b; } From 83a0571dbdd79a121be160f19bc28f73c03c5ab8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 26 Feb 2026 14:50:25 +0000 Subject: [PATCH 84/95] Remove internal ``hipcub::Max`` usage --- .../benchmark/benchmark_device_reduce.cpp | 8 +++--- .../benchmark_device_reduce_by_key.cpp | 4 +-- .../benchmark/benchmark_device_scan.cpp | 3 ++- .../benchmark_device_segmented_reduce.cpp | 6 ++--- projects/hipcub/benchmark/benchmark_utils.hpp | 10 +++++++ .../backend/rocprim/device/device_reduce.hpp | 14 ++++++++-- .../device/device_segmented_reduce.hpp | 14 ++++++++-- .../test_hipcub_block_run_length_decode.cpp | 10 +++---- .../test/hipcub/test_hipcub_device_reduce.cpp | 6 ++--- .../test_hipcub_device_reduce_by_key.cpp | 12 ++++----- .../test/hipcub/test_hipcub_device_scan.cpp | 18 ++++++------- .../test_hipcub_device_segmented_reduce.cpp | 26 +++++++++---------- .../test/hipcub/test_utils_functional.hpp | 24 +++++++++++++++++ .../hipcub/test_utils_thread_operators.hpp | 16 ++++++------ 14 files changed, 113 insertions(+), 58 deletions(-) diff --git a/projects/hipcub/benchmark/benchmark_device_reduce.cpp 
b/projects/hipcub/benchmark/benchmark_device_reduce.cpp index 8dcd96861a9..16c6c835632 100644 --- a/projects/hipcub/benchmark/benchmark_device_reduce.cpp +++ b/projects/hipcub/benchmark/benchmark_device_reduce.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -100,7 +100,7 @@ struct Benchmark }; template -struct Benchmark +struct Benchmark { static void run(benchmark::State& state, size_t size, const hipStream_t stream) { @@ -168,9 +168,9 @@ int main(int argc, char* argv[]) std::vector benchmarks = { CREATE_BENCHMARKS(hipcub::Sum), CREATE_BENCHMARK(custom_double2, hipcub::Sum), - CREATE_BENCHMARKS(hipcub::Min), + CREATE_BENCHMARKS(benchmark_utils::minimum), #ifdef HIPCUB_ROCPRIM_API - CREATE_BENCHMARK(custom_double2, hipcub::Min), + CREATE_BENCHMARK(custom_double2, benchmark_utils::minimum), #endif CREATE_BENCHMARKS(hipcub::ArgMin), #ifdef HIPCUB_ROCPRIM_API diff --git a/projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp b/projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp index 6a11732e1ff..dce6ce8c6cc 100644 --- a/projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp +++ b/projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp @@ -189,9 +189,9 @@ void add_benchmarks(size_t max_length, std::vector bs = { CREATE_BENCHMARKS(hipcub::Sum), CREATE_BENCHMARK(long long, custom_double2, hipcub::Sum), - CREATE_BENCHMARKS(hipcub::Min), + CREATE_BENCHMARKS(benchmark_utils::minimum), #ifdef HIPCUB_ROCPRIM_API - CREATE_BENCHMARK(long long, custom_double2, hipcub::Min), + CREATE_BENCHMARK(long long, custom_double2, benchmark_utils::minimum), #endif }; diff --git a/projects/hipcub/benchmark/benchmark_device_scan.cpp 
b/projects/hipcub/benchmark/benchmark_device_scan.cpp index bcacb77fcb3..5364b877965 100644 --- a/projects/hipcub/benchmark/benchmark_device_scan.cpp +++ b/projects/hipcub/benchmark/benchmark_device_scan.cpp @@ -29,6 +29,7 @@ #include "common_benchmark_header.hpp" // HIP API +#include #include #include @@ -353,7 +354,7 @@ int main(int argc, char* argv[]) // Add benchmarks std::vector benchmarks = { CREATE_BENCHMARKS(hipcub::Sum), - CREATE_BENCHMARKS(hipcub::Min), + CREATE_BENCHMARKS(benchmark_utils::minimum), }; // Use manual timing diff --git a/projects/hipcub/benchmark/benchmark_device_segmented_reduce.cpp b/projects/hipcub/benchmark/benchmark_device_segmented_reduce.cpp index 815acb0078c..32fb7779552 100644 --- a/projects/hipcub/benchmark/benchmark_device_segmented_reduce.cpp +++ b/projects/hipcub/benchmark/benchmark_device_segmented_reduce.cpp @@ -166,7 +166,7 @@ struct Benchmark }; template -struct Benchmark +struct Benchmark { static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) @@ -235,9 +235,9 @@ void add_benchmarks(std::vector& benchmarks, std::vector bs = { CREATE_BENCHMARKS(hipcub::Sum), BENCHMARK_TYPE(custom_double2, hipcub::Sum), - CREATE_BENCHMARKS(hipcub::Min), + CREATE_BENCHMARKS(benchmark_utils::minimum), #ifdef HIPCUB_ROCPRIM_API - BENCHMARK_TYPE(custom_double2, hipcub::Min), + BENCHMARK_TYPE(custom_double2, benchmark_utils::minimum), #endif CREATE_BENCHMARKS(hipcub::ArgMin), #ifdef HIPCUB_ROCPRIM_API diff --git a/projects/hipcub/benchmark/benchmark_utils.hpp b/projects/hipcub/benchmark/benchmark_utils.hpp index 6af618b92b8..982dfb66e5d 100644 --- a/projects/hipcub/benchmark/benchmark_utils.hpp +++ b/projects/hipcub/benchmark/benchmark_utils.hpp @@ -485,6 +485,16 @@ inline constexpr auto ceiling_div(const T a, const U b) return a / b + (a % b > 0 ? 1 : 0); } +struct minimum +{ + template + HIPCUB_HOST_DEVICE + auto operator()(const T& a, const U& b) const + { + return a < b ? 
a : b; + } +}; + } // namespace benchmark_utils // Need for hipcub::DeviceReduce::Min/Max etc. diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp index 0c5ad32fc97..5ed9ef05ddb 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp @@ -46,6 +46,8 @@ #include // hip_bfloat16 #include // __half +#include _HIPCUB_LIBCXX_INCLUDE(functional) +#include _HIPCUB_STD_INCLUDE(functional) #include _HIPCUB_STD_INCLUDE(limits) #include @@ -271,7 +273,11 @@ class DeviceReduce d_in, d_out, num_items, - ::hipcub::Min(), +#if _HIPCUB_HAS_DEVICE_SYSTEM_STD + _HIPCUB_LIBCXX::minimum<>{}, +#else + [] (auto a, auto b) { return a > b ? b : a;}, +#endif detail::get_max_value(), stream); } @@ -388,7 +394,11 @@ class DeviceReduce d_in, d_out, num_items, - ::hipcub::Max(), +#if _HIPCUB_HAS_DEVICE_SYSTEM_STD + _HIPCUB_LIBCXX::maximum<>{}, +#else + [] (auto a, auto b) { return a > b ? 
a : b;}, +#endif detail::get_lowest_value(), stream); } diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp index d0d5c029dd5..c9b38b1108c 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp @@ -42,6 +42,8 @@ #include // IWYU pragma: export #include // IWYU pragma: export +#include _HIPCUB_LIBCXX_INCLUDE(functional) +#include _HIPCUB_STD_INCLUDE(functional) #include _HIPCUB_STD_INCLUDE(limits) #include @@ -312,7 +314,11 @@ struct DeviceSegmentedReduce num_segments, d_begin_offsets, d_end_offsets, - ::hipcub::Min(), +#if _HIPCUB_HAS_DEVICE_SYSTEM_STD + _HIPCUB_LIBCXX::minimum<>{}, +#else + [] (auto a, auto b) { return a > b ? b : a;}, +#endif _HIPCUB_STD::numeric_limits::max(), stream); } @@ -424,7 +430,11 @@ struct DeviceSegmentedReduce num_segments, d_begin_offsets, d_end_offsets, - ::hipcub::Max(), +#if _HIPCUB_HAS_DEVICE_SYSTEM_STD + _HIPCUB_LIBCXX::maximum<>{}, +#else + [] (auto a, auto b) { return a > b ? 
a : b;}, +#endif _HIPCUB_STD::numeric_limits::lowest(), stream); } diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp index 3d043d3d5c8..f881fae5fe6 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp @@ -113,11 +113,11 @@ void block_run_length_decode_kernel(const ItemT* d_run_items, ItemT decoded_items[DecodedItemsPerThread]; block_run_length_decode.RunLengthDecode(decoded_items, decoded_window_offset); - hipcub::StoreDirectBlocked( - global_thread_idx, - d_decoded_items + decoded_window_offset, - decoded_items, - hipcub::Min{}(total_decoded_size - decoded_window_offset, decoded_items_per_block)); + hipcub::StoreDirectBlocked(global_thread_idx, + d_decoded_items + decoded_window_offset, + decoded_items, + test_utils::minimum{}(total_decoded_size - decoded_window_offset, + decoded_items_per_block)); decoded_window_offset += decoded_items_per_block; } diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp index 2a5b5b8db8f..c60e8022bd7 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp @@ -263,10 +263,10 @@ TYPED_TEST(HipcubDeviceReduceTests, ReduceMinimum) // Calculate expected results on host using the same accumulator type than on device using Min = typename MinSelector::type; // For custom_type_test tests - using AccumT = hipcub::detail::accumulator_t; + using AccumT = hipcub::detail::accumulator_t; Min min_op; AccumT tmp_result = test_utils::numeric_limits< - AccumT>::max(); // hipcub::Min uses as initial type the input type + AccumT>::max(); // test_utils::minimum uses as initial type the input type for(unsigned int i = 0; i < input.size(); i++) { tmp_result = min_op(tmp_result, input[i]); @@ -377,7 
+377,7 @@ TYPED_TEST(HipcubDeviceReduceTests, ReduceMaximum) // Calculate expected results on host using the same accumulator type than on device using Max = typename MaxSelector::type; // For custom_type_test tests - using AccumT = hipcub::detail::accumulator_t; + using AccumT = hipcub::detail::accumulator_t; Max max_op; AccumT tmp_result = test_utils::numeric_limits::min(); for(unsigned int i = 0; i < input.size(); i++) diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp index 49178bdda1a..5c1f3c7b5e5 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp @@ -56,19 +56,19 @@ using Params = ::testing::Types< params, params, params, - params, - params, - params, + params, + params, + params, params, params, params, params, - params, + params, params, params, // Sum for half and bfloat will result in values too big due to limited range. 
- params, - params, + params, + params, params>; TYPED_TEST_SUITE(HipcubDeviceReduceByKey, Params); diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp index e289c391646..94e5aa438a2 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp @@ -62,15 +62,15 @@ class HipcubDeviceScanTests : public ::testing::Test static constexpr bool use_graphs = Params::use_graphs; }; -using HipcubDeviceScanTestsParams - = ::testing::Types, - DeviceScanParams, - DeviceScanParams, - DeviceScanParams, - DeviceScanParams, - DeviceScanParams, - DeviceScanParams, - DeviceScanParams>; +using HipcubDeviceScanTestsParams = ::testing::Types< + DeviceScanParams, + DeviceScanParams, + DeviceScanParams, + DeviceScanParams, + DeviceScanParams, + DeviceScanParams, + DeviceScanParams, + DeviceScanParams>; // use float for accumulation of bfloat16 and half inputs if operator is plus template diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp index c0f3b0d3556..467c2447b4b 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp @@ -56,17 +56,17 @@ class HipcubDeviceSegmentedReduceOp : public ::testing::Test using params = Params; }; -using Params1 - = ::testing::Types, - params1, - params1, - params1, - params1, - params1, - params1, - params1, - params1, - params1>; +using Params1 = ::testing::Types< + params1, + params1, + params1, + params1, + params1, + params1, + params1, + params1, + params1, + params1>; TYPED_TEST_SUITE(HipcubDeviceSegmentedReduceOp, Params1); @@ -435,7 +435,7 @@ TYPED_TEST(HipcubDeviceSegmentedReduce, Min) using input_type = typename TestFixture::params::input_type; using output_type = typename TestFixture::params::output_type; - 
using reduce_op_type = typename hipcub::Min; + using reduce_op_type = typename test_utils::minimum; using result_type = output_type; using offset_type = unsigned int; @@ -592,7 +592,7 @@ TYPED_TEST(HipcubDeviceSegmentedReduce, Max) using input_type = typename TestFixture::params::input_type; using output_type = typename TestFixture::params::output_type; - using reduce_op_type = typename hipcub::Max; + using reduce_op_type = typename test_utils::maximum; using result_type = output_type; using offset_type = unsigned int; diff --git a/projects/hipcub/test/hipcub/test_utils_functional.hpp b/projects/hipcub/test/hipcub/test_utils_functional.hpp index 56c22664da8..25776b8c1f5 100644 --- a/projects/hipcub/test/hipcub/test_utils_functional.hpp +++ b/projects/hipcub/test/hipcub/test_utils_functional.hpp @@ -132,6 +132,30 @@ struct divides } }; +struct maximum +{ + template + HIPCUB_HOST_DEVICE + auto operator()(const T& a, const U& b) const + { + using result_type = ::std::common_type_t; + result_type ra = a, rb = b; + return ra < rb ? rb : ra; + } +}; + +struct minimum +{ + template + HIPCUB_HOST_DEVICE + auto operator()(const T& a, const U& b) const + { + using result_type = ::std::common_type_t; + result_type ra = a, rb = b; + return ra < rb ? 
ra : rb; + } +}; + // HALF template<> HIPCUB_HOST_DEVICE inline bool diff --git a/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp b/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp index 1ae57cbbd47..630c79b2b77 100644 --- a/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp +++ b/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp @@ -279,50 +279,50 @@ struct AlgebraicSelector template struct MaxSelector { - using type = hipcub::Max; + using type = test_utils::maximum; }; template struct MaxSelector, test_utils::custom_test_type> { - using type = CustomTestOp; + using type = CustomTestOp; }; template struct MaxSelector { - using type = ExtendedFloatBinOp; + using type = ExtendedFloatBinOp; }; template struct MaxSelector { - using type = ExtendedFloatBinOp; + using type = ExtendedFloatBinOp; }; // Min functor selector. template struct MinSelector { - using type = hipcub::Min; + using type = test_utils::minimum; }; template struct MinSelector, test_utils::custom_test_type> { - using type = CustomTestOp; + using type = CustomTestOp; }; template struct MinSelector { - using type = ExtendedFloatBinOp; + using type = ExtendedFloatBinOp; }; template struct MinSelector { - using type = ExtendedFloatBinOp; + using type = ExtendedFloatBinOp; }; // ArgMax functor selector From b35571e6589f0b6dd15bcc1b84d66ff1001f02ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 2 Mar 2026 16:08:45 +0000 Subject: [PATCH 85/95] Remove internal usage of ``hipcub::Sum`` --- .../benchmark/benchmark_block_reduce.cpp | 2 +- .../hipcub/benchmark/benchmark_block_scan.cpp | 4 +- .../benchmark_device_adjacent_difference.cpp | 2 +- .../benchmark/benchmark_device_memory.cpp | 2 +- .../benchmark/benchmark_device_select.cpp | 4 +- projects/hipcub/benchmark/benchmark_utils.hpp | 10 ++++ .../benchmark/benchmark_warp_reduce.cpp | 2 +- .../hipcub/benchmark/benchmark_warp_scan.cpp | 4 +- .../backend/rocprim/block/block_scan.hpp | 58 
+++++++++++-------- .../backend/rocprim/device/device_reduce.hpp | 2 +- .../backend/rocprim/device/device_scan.hpp | 8 +-- .../device/device_segmented_reduce.hpp | 2 +- .../test_hipcub_block_adjacent_difference.cpp | 10 ++-- .../test/hipcub/test_hipcub_block_reduce.cpp | 14 +++-- .../test/hipcub/test_hipcub_block_scan.cpp | 28 ++++----- .../test/hipcub/test_hipcub_device_copy.cpp | 6 +- .../test/hipcub/test_hipcub_device_memcpy.cpp | 4 +- .../test/hipcub/test_hipcub_device_reduce.cpp | 17 +++--- .../test_hipcub_device_reduce_by_key.cpp | 20 +++---- .../test/hipcub/test_hipcub_device_scan.cpp | 28 ++++----- .../test_hipcub_device_segmented_reduce.cpp | 16 ++--- .../test/hipcub/test_hipcub_device_select.cpp | 4 +- .../hipcub/test/hipcub/test_hipcub_grid.cpp | 5 +- ...test_hipcub_single_pass_scan_operators.cpp | 8 +-- .../hipcub/test_hipcub_thread_operators.cpp | 14 ++--- .../test/hipcub/test_hipcub_warp_reduce.cpp | 12 +++- .../test/hipcub/test_hipcub_warp_scan.cpp | 19 +++--- .../test/hipcub/test_utils_functional.hpp | 12 ++-- .../hipcub/test_utils_thread_operators.hpp | 2 +- 29 files changed, 178 insertions(+), 141 deletions(-) diff --git a/projects/hipcub/benchmark/benchmark_block_reduce.cpp b/projects/hipcub/benchmark/benchmark_block_reduce.cpp index 7c448693d17..c74be6d25d6 100644 --- a/projects/hipcub/benchmark/benchmark_block_reduce.cpp +++ b/projects/hipcub/benchmark/benchmark_block_reduce.cpp @@ -61,7 +61,7 @@ struct reduce _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { - reduced_value = breduce_t(storage).Reduce(values, hipcub::Sum()); + reduced_value = breduce_t(storage).Reduce(values, benchmark_utils::plus{}); values[0] = reduced_value; } diff --git a/projects/hipcub/benchmark/benchmark_block_scan.cpp b/projects/hipcub/benchmark/benchmark_block_scan.cpp index 3ef35356bf1..9eaeeec098c 100644 --- a/projects/hipcub/benchmark/benchmark_block_scan.cpp +++ b/projects/hipcub/benchmark/benchmark_block_scan.cpp @@ -61,7 +61,7 @@ 
struct inclusive_scan _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { - bscan_t(storage).InclusiveScan(values, values, hipcub::Sum()); + bscan_t(storage).InclusiveScan(values, values, benchmark_utils::plus{}); } for(unsigned int k = 0; k < ItemsPerThread; k++) @@ -91,7 +91,7 @@ struct exclusive_scan _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { - bscan_t(storage).ExclusiveScan(values, values, init, hipcub::Sum()); + bscan_t(storage).ExclusiveScan(values, values, init, benchmark_utils::plus{}); } for(unsigned int k = 0; k < ItemsPerThread; k++) diff --git a/projects/hipcub/benchmark/benchmark_device_adjacent_difference.cpp b/projects/hipcub/benchmark/benchmark_device_adjacent_difference.cpp index d6a55ba2aef..d3d57da67d1 100644 --- a/projects/hipcub/benchmark/benchmark_device_adjacent_difference.cpp +++ b/projects/hipcub/benchmark/benchmark_device_adjacent_difference.cpp @@ -146,7 +146,7 @@ void run_benchmark(benchmark::State& state, const size_t size, const hipStream_t d_input, d_output, size, - hipcub::Sum{}, + benchmark_utils::plus{}, stream); }; HIP_CHECK(launch()); diff --git a/projects/hipcub/benchmark/benchmark_device_memory.cpp b/projects/hipcub/benchmark/benchmark_device_memory.cpp index fed54b86270..d00007f4e32 100644 --- a/projects/hipcub/benchmark/benchmark_device_memory.cpp +++ b/projects/hipcub/benchmark/benchmark_device_memory.cpp @@ -113,7 +113,7 @@ struct operation // sync before re-using shared memory from load __syncthreads(); - block_scan_type(storage).InclusiveScan(input, input, hipcub::Sum()); + block_scan_type(storage).InclusiveScan(input, input, benchmark_utils::plus{}); } }; diff --git a/projects/hipcub/benchmark/benchmark_device_select.cpp b/projects/hipcub/benchmark/benchmark_device_select.cpp index d52c4dcbde2..cbf963ea000 100644 --- a/projects/hipcub/benchmark/benchmark_device_select.cpp +++ b/projects/hipcub/benchmark/benchmark_device_select.cpp @@ -321,7 +321,7 @@ void 
run_unique_benchmark(benchmark::State& state, const hipStream_t stream, float discontinuity_probability) { - hipcub::Sum op; + benchmark_utils::plus op{}; std::vector input(size); { @@ -410,7 +410,7 @@ void run_unique_by_key_benchmark(benchmark::State& state, const hipStream_t stream, float discontinuity_probability) { - hipcub::Sum op; + benchmark_utils::plus op{}; std::vector input_keys(size); { diff --git a/projects/hipcub/benchmark/benchmark_utils.hpp b/projects/hipcub/benchmark/benchmark_utils.hpp index 982dfb66e5d..da6db74963a 100644 --- a/projects/hipcub/benchmark/benchmark_utils.hpp +++ b/projects/hipcub/benchmark/benchmark_utils.hpp @@ -495,6 +495,16 @@ struct minimum } }; +struct plus +{ + template + HIPCUB_HOST_DEVICE + constexpr auto operator()(const A& a, const B& b) const -> decltype(a + b) + { + return a + b; + } +}; + } // namespace benchmark_utils // Need for hipcub::DeviceReduce::Min/Max etc. diff --git a/projects/hipcub/benchmark/benchmark_warp_reduce.cpp b/projects/hipcub/benchmark/benchmark_warp_reduce.cpp index 9847ff80d71..aa830d7360c 100644 --- a/projects/hipcub/benchmark/benchmark_warp_reduce.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_reduce.cpp @@ -40,7 +40,7 @@ __device__ auto warp_reduce_benchmark(const T* d_input, T* d_output) using wreduce_t = hipcub::WarpReduce; __shared__ typename wreduce_t::TempStorage storage; - auto reduce_op = hipcub::Sum(); + auto reduce_op = benchmark_utils::plus{}; _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { diff --git a/projects/hipcub/benchmark/benchmark_warp_scan.cpp b/projects/hipcub/benchmark/benchmark_warp_scan.cpp index ef63a4ee9bd..450df772a90 100644 --- a/projects/hipcub/benchmark/benchmark_warp_scan.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_scan.cpp @@ -57,7 +57,7 @@ struct inclusive_scan using wscan_t = hipcub::WarpScan; __shared__ typename wscan_t::TempStorage storage; - auto scan_op = hipcub::Sum(); + auto scan_op = benchmark_utils::plus{}; 
_CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { @@ -86,7 +86,7 @@ struct exclusive_scan using wscan_t = hipcub::WarpScan; __shared__ typename wscan_t::TempStorage storage; - auto scan_op = hipcub::Sum(); + auto scan_op = benchmark_utils::plus{}; _CCCL_PRAGMA_NOUNROLL() for(unsigned int trial = 0; trial < Trials; trial++) { diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_scan.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_scan.hpp index 3111e8224f2..462b32c0580 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_scan.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_scan.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,6 +36,8 @@ #include // IWYU pragma: export +#include _HIPCUB_STD_INCLUDE(functional) + #include BEGIN_HIPCUB_NAMESPACE @@ -121,12 +123,14 @@ class BlockScan } template - HIPCUB_DEVICE inline - void InclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op) + HIPCUB_DEVICE + inline void InclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op) { - base_type::inclusive_scan( - input, output, temp_storage_, block_prefix_callback_op, ::hipcub::Sum() - ); + base_type::inclusive_scan(input, + output, + temp_storage_, + block_prefix_callback_op, + _HIPCUB_STD::plus<>{}); } template @@ -145,13 +149,16 @@ class BlockScan } template - HIPCUB_DEVICE inline - void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], - BlockPrefixCallbackOp& block_prefix_callback_op) + HIPCUB_DEVICE + inline void InclusiveSum(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + BlockPrefixCallbackOp& block_prefix_callback_op) { - base_type::inclusive_scan( - input, output, temp_storage_, block_prefix_callback_op, ::hipcub::Sum() - ); + base_type::inclusive_scan(input, + output, + temp_storage_, + block_prefix_callback_op, + _HIPCUB_STD::plus<>{}); } template @@ -241,12 +248,14 @@ class BlockScan } template - HIPCUB_DEVICE inline - void ExclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op) + HIPCUB_DEVICE + inline void ExclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op) { - base_type::exclusive_scan( - input, output, temp_storage_, block_prefix_callback_op, ::hipcub::Sum() - ); + base_type::exclusive_scan(input, + output, + temp_storage_, + block_prefix_callback_op, + _HIPCUB_STD::plus<>{}); } template @@ -265,13 +274,16 @@ class BlockScan } template - HIPCUB_DEVICE inline - void 
ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], - BlockPrefixCallbackOp& block_prefix_callback_op) + HIPCUB_DEVICE + inline void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + BlockPrefixCallbackOp& block_prefix_callback_op) { - base_type::exclusive_scan( - input, output, temp_storage_, block_prefix_callback_op, ::hipcub::Sum() - ); + base_type::exclusive_scan(input, + output, + temp_storage_, + block_prefix_callback_op, + _HIPCUB_STD::plus<>{}); } template diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp index 5ed9ef05ddb..41b697eb3a5 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp @@ -240,7 +240,7 @@ class DeviceReduce d_in, d_out, num_items, - ::hipcub::Sum(), + _HIPCUB_STD::plus<>{}, InitT(0), stream); } diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp index 68ff1b1a024..de2be6e0859 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp @@ -60,7 +60,7 @@ class DeviceScan temp_storage_bytes, d_in, d_out, - ::hipcub::Sum(), + _HIPCUB_STD::plus<>{}, num_items, stream); } @@ -237,7 +237,7 @@ class DeviceScan temp_storage_bytes, d_in, d_out, - ::hipcub::Sum(), + _HIPCUB_STD::plus<>{}, T(0), num_items, stream); @@ -513,7 +513,7 @@ class DeviceScan d_keys_in, d_values_in, d_values_out, - ::hipcub::Sum(), + _HIPCUB_STD::plus<>{}, static_cast(0), num_items, equality_op, @@ -641,7 +641,7 @@ class DeviceScan d_keys_in, d_values_in, d_values_out, - ::hipcub::Sum(), + _HIPCUB_STD::plus<>{}, num_items, equality_op, stream); diff --git 
a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp index c9b38b1108c..1a05059891d 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp @@ -266,7 +266,7 @@ struct DeviceSegmentedReduce num_segments, d_begin_offsets, d_end_offsets, - ::hipcub::Sum(), + _HIPCUB_STD::plus<>{}, input_type(), stream); } diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp index 6ca858bbcc5..9d7c8f1664a 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp @@ -72,25 +72,25 @@ struct custom_op2 }; using ParamsSubtract - = ::testing::Types, + = ::testing::Types, params_subtract, params_subtract, params_subtract, params_subtract, params_subtract, - params_subtract, + params_subtract, params_subtract, params_subtract, - params_subtract, + params_subtract, params_subtract, params_subtract, - params_subtract, + params_subtract, params_subtract, params_subtract, - params_subtract, + params_subtract, params_subtract, params_subtract>; diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp index ee1be7338df..fb871a75135 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2026 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -129,8 +129,9 @@ void reduce_kernel(T* device_output, T* device_output_reductions) const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; T value = device_output[index]; using breduce_t = hipcub::BlockReduce; - __shared__ typename breduce_t::TempStorage temp_storage; - value = breduce_t(temp_storage).Reduce(value, hipcub::Sum()); + __shared__ + typename breduce_t::TempStorage temp_storage; + value = breduce_t(temp_storage).Reduce(value, test_utils::plus{}); if(hipThreadIdx_x == 0) { device_output_reductions[hipBlockIdx_x] = value; @@ -337,8 +338,9 @@ void reduce_valid_kernel(T* device_output, const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; T value = device_output[index]; using breduce_t = hipcub::BlockReduce; - __shared__ typename breduce_t::TempStorage temp_storage; - value = breduce_t(temp_storage).Reduce(value, hipcub::Sum(), valid_items); + __shared__ + typename breduce_t::TempStorage temp_storage; + value = breduce_t(temp_storage).Reduce(value, test_utils::plus{}, valid_items); if(hipThreadIdx_x == 0) { device_output_reductions[hipBlockIdx_x] = value; @@ -606,7 +608,7 @@ void reduce_array_kernel(T* device_output, T* device_output_reductions) T reduction; using breduce_t = hipcub::BlockReduce; __shared__ typename breduce_t::TempStorage temp_storage; - reduction = breduce_t(temp_storage).Reduce(in_out, hipcub::Sum()); + reduction = breduce_t(temp_storage).Reduce(in_out, test_utils::plus{}); if(hipThreadIdx_x == 0) { diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp index 8add3ce9a99..ea35605377f 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp @@ -97,7 +97,7 @@ void inclusive_scan_kernel(T* 
device_output) using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - bscan_t(temp_storage).InclusiveScan(value, value, hipcub::Sum()); + bscan_t(temp_storage).InclusiveScan(value, value, test_utils::plus{}); device_output[index] = value; } @@ -202,7 +202,7 @@ void inclusive_scan_initial_value_kernel(T* device_output, T initial_value) using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - bscan_t(temp_storage).InclusiveScan(input, output, initial_value, hipcub::Sum()); + bscan_t(temp_storage).InclusiveScan(input, output, initial_value, test_utils::plus{}); for(unsigned int i = 0; i < ItemsPerThread; ++i) { @@ -305,7 +305,7 @@ void inclusive_scan_reduce_kernel(T* device_output, T* device_output_reductions) T reduction; using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - bscan_t(temp_storage).InclusiveScan(value, value, hipcub::Sum(), reduction); + bscan_t(temp_storage).InclusiveScan(value, value, test_utils::plus{}, reduction); device_output[index] = value; if(hipThreadIdx_x == 0) { @@ -439,7 +439,7 @@ void inclusive_scan_reduce_initial_value_kernel(T* device_output, using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - bscan_t(temp_storage).InclusiveScan(input, output, initial_value, hipcub::Sum(), reduction); + bscan_t(temp_storage).InclusiveScan(input, output, initial_value, test_utils::plus{}, reduction); for(unsigned int i = 0; i < ItemsPerThread; ++i) { @@ -580,7 +580,7 @@ void inclusive_scan_prefix_callback_kernel(T* device_output, T* device_output_bp using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - bscan_t(temp_storage).InclusiveScan(value, value, hipcub::Sum(), prefix_callback); + bscan_t(temp_storage).InclusiveScan(value, value, test_utils::plus{}, prefix_callback); device_output[index] = value; if(hipThreadIdx_x == 0) @@ -699,7 +699,7 @@ void 
exclusive_scan_kernel(T* device_output, T init) T value = device_output[index]; using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - bscan_t(temp_storage).ExclusiveScan(value, value, init, hipcub::Sum()); + bscan_t(temp_storage).ExclusiveScan(value, value, init, test_utils::plus{}); device_output[index] = value; } @@ -796,7 +796,7 @@ void exclusive_scan_reduce_kernel(T* device_output, T* device_output_reductions, T reduction; using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - bscan_t(temp_storage).ExclusiveScan(value, value, init, hipcub::Sum(), reduction); + bscan_t(temp_storage).ExclusiveScan(value, value, init, test_utils::plus{}, reduction); device_output[index] = value; if(hipThreadIdx_x == 0) { @@ -933,7 +933,7 @@ void exclusive_scan_prefix_callback_kernel(T* device_output, T* device_output_bp using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - bscan_t(temp_storage).ExclusiveScan(value, value, hipcub::Sum(), prefix_callback); + bscan_t(temp_storage).ExclusiveScan(value, value, test_utils::plus{}, prefix_callback); device_output[index] = value; if(hipThreadIdx_x == 0) @@ -1908,7 +1908,7 @@ void inclusive_scan_array_kernel(T* device_output) using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - bscan_t(temp_storage).InclusiveScan(in_out, in_out, hipcub::Sum()); + bscan_t(temp_storage).InclusiveScan(in_out, in_out, test_utils::plus{}); // store for(unsigned int j = 0; j < ItemsPerThread; j++) @@ -2021,7 +2021,7 @@ void inclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reduc using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; T reduction; - bscan_t(temp_storage).InclusiveScan(in_out, in_out, hipcub::Sum(), reduction); + bscan_t(temp_storage).InclusiveScan(in_out, in_out, test_utils::plus{}, reduction); // store for(unsigned int j = 0; j < 
ItemsPerThread; j++) @@ -2172,7 +2172,7 @@ void inclusive_scan_array_prefix_callback_kernel(T* device_output, using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - bscan_t(temp_storage).InclusiveScan(in_out, in_out, hipcub::Sum(), prefix_callback); + bscan_t(temp_storage).InclusiveScan(in_out, in_out, test_utils::plus{}, prefix_callback); // store for(unsigned int j = 0; j < ItemsPerThread; j++) @@ -2320,7 +2320,7 @@ void exclusive_scan_array_kernel(T* device_output, T init) using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - bscan_t(temp_storage).ExclusiveScan(in_out, in_out, init, hipcub::Sum()); + bscan_t(temp_storage).ExclusiveScan(in_out, in_out, init, test_utils::plus{}); // store for(unsigned int j = 0; j < ItemsPerThread; j++) @@ -2441,7 +2441,7 @@ void exclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reduc using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; T reduction; - bscan_t(temp_storage).ExclusiveScan(in_out, in_out, init, hipcub::Sum(), reduction); + bscan_t(temp_storage).ExclusiveScan(in_out, in_out, init, test_utils::plus{}, reduction); // store for(unsigned int j = 0; j < ItemsPerThread; j++) @@ -2609,7 +2609,7 @@ void exclusive_scan_prefix_callback_array_kernel(T* device_output, using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - bscan_t(temp_storage).ExclusiveScan(in_out, in_out, hipcub::Sum(), prefix_callback); + bscan_t(temp_storage).ExclusiveScan(in_out, in_out, test_utils::plus{}, prefix_callback); // store for(unsigned int j = 0; j < ItemsPerThread; j++) diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_copy.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_copy.cpp index e446d77af0b..4fc2eabe36a 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_copy.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_copy.cpp @@ -1,6 +1,6 @@ // MIT 
License // -// Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -268,12 +268,12 @@ TYPED_TEST(DeviceBatchCopyTests, SizeAndTypeVariation) h_buffer_num_elements.end(), 0, src_offsets.begin(), - hipcub::Sum{}); + test_utils::plus{}); test_utils::host_exclusive_scan(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), 0, dst_offsets.begin(), - hipcub::Sum{}); + test_utils::plus{}); } // Generate the source and destination pointers. diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp index ec7b3f62fbd..a36066d6bb6 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp @@ -272,12 +272,12 @@ TYPED_TEST(DeviceBatchMemcpyTests, SizeAndTypeVariation) h_buffer_num_elements.end(), 0, src_offsets.begin(), - hipcub::Sum{}); + test_utils::plus{}); test_utils::host_exclusive_scan(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), 0, dst_offsets.begin(), - hipcub::Sum{}); + test_utils::plus{}); } // Generate the source and destination pointers. 
diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp index c60e8022bd7..411a969b8f2 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp @@ -120,11 +120,12 @@ TYPED_TEST(HipcubDeviceReduceTests, ReduceSum) HIP_CHECK(hipDeviceSynchronize()); // Calculate expected results on host using the same accumulator type than on device - using Sum = - typename AlgebraicSelector::type; // For custom_type_test tests + using Sum = typename AlgebraicSelector:: + type; // For custom_type_test tests using AccumT = hipcub::detail::accumulator_t; Sum sum_op; - AccumT tmp_result = AccumT(0.0f); // hipcub::Sum uses as initial type the output type + AccumT tmp_result + = AccumT(0.0f); // test_utils::plus uses as initial type the output type for(unsigned int i = 0; i < input.size(); i++) { tmp_result = sum_op(tmp_result, input[i]); @@ -143,7 +144,7 @@ TYPED_TEST(HipcubDeviceReduceTests, ReduceSum) d_input, d_output, input.size(), - ExtendedFloatBinOp(), + ExtendedFloatBinOp(), U(0.f), stream)); } @@ -177,7 +178,7 @@ TYPED_TEST(HipcubDeviceReduceTests, ReduceSum) d_input, d_output, input.size(), - ExtendedFloatBinOp(), + ExtendedFloatBinOp(), U(0.f), stream)); } @@ -1051,14 +1052,14 @@ TYPED_TEST(HipcubDeviceReduceTests, TransformReduce) hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Calculate expected results on host using the same accumulator type than on device - using Sum = - typename AlgebraicSelector::type; // For custom_type_test tests + using Sum = typename AlgebraicSelector:: + type; // For custom_type_test tests using AccumT = hipcub::detail::accumulator_t; Sum reduction_op; TestTransformOp transform_op; const U init(10); - AccumT tmp_result = init; // hipcub::Sum uses as initial type the output type + AccumT tmp_result = init; // test_utils::plus uses as initial type the output type 
for(size_t i = 0; i < input.size(); ++i) { tmp_result = reduction_op(tmp_result, transform_op(input[i])); diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp index 5c1f3c7b5e5..87355be27de 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp @@ -53,23 +53,23 @@ class HipcubDeviceReduceByKey : public ::testing::Test { }; using Params = ::testing::Types< - params, - params, - params, + params, + params, + params, params, params, params, - params, - params, - params, - params, + params, + params, + params, + params, params, - params, - params, + params, + params, // Sum for half and bfloat will result in values too big due to limited range. params, params, - params>; + params>; TYPED_TEST_SUITE(HipcubDeviceReduceByKey, Params); diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp index 94e5aa438a2..53d0417d1eb 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp @@ -33,7 +33,7 @@ // Params for tests template struct DeviceScanParams @@ -70,7 +70,7 @@ using HipcubDeviceScanTestsParams = ::testing::Types< DeviceScanParams, DeviceScanParams, DeviceScanParams, - DeviceScanParams>; + DeviceScanParams>; // use float for accumulation of bfloat16 and half inputs if operator is plus template @@ -208,7 +208,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScan) auto call = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) { if constexpr(inplace) { @@ -592,7 +592,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScanByKey) size_t temp_storage_size_bytes{}; void* d_temp_storage = nullptr; // Get size of d_temp_storage - if(std::is_same_v) + if(std::is_same_v) { 
HIP_CHECK(hipcub::DeviceScan::InclusiveSumByKey(d_temp_storage, temp_storage_size_bytes, @@ -628,7 +628,7 @@ TYPED_TEST(HipcubDeviceScanTests, InclusiveScanByKey) gHelper.startStreamCapture(stream); // Run - if(std::is_same_v) + if(std::is_same_v) { HIP_CHECK(hipcub::DeviceScan::InclusiveSumByKey(d_temp_storage, temp_storage_size_bytes, @@ -760,7 +760,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScan) // Calculate expected results on host std::vector expected(input.size()); const T initial_value - = std::is_same_v + = std::is_same_v ? test_utils::convert_to_device(0) : test_utils::get_random_value(test_utils::convert_to_device(1), test_utils::convert_to_device(100), @@ -779,7 +779,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScan) auto call = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) { if constexpr(inplace) { @@ -954,7 +954,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScanByKey) test_utils::convert_to_device(10), seed_value); T initial_value = initial_value_vector.front(); - if(std::is_same_v) + if(std::is_same_v) { initial_value = test_utils::convert_to_device(0); } @@ -994,7 +994,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScanByKey) size_t temp_storage_size_bytes; void* d_temp_storage = nullptr; // Get size of d_temp_storage - if(std::is_same_v) + if(std::is_same_v) { HIP_CHECK(hipcub::DeviceScan::ExclusiveSumByKey(d_temp_storage, temp_storage_size_bytes, @@ -1031,7 +1031,7 @@ TYPED_TEST(HipcubDeviceScanTests, ExclusiveScanByKey) gHelper.startStreamCapture(stream); // Run - if(std::is_same_v) + if(std::is_same_v) { HIP_CHECK(hipcub::DeviceScan::ExclusiveSumByKey(d_temp_storage, temp_storage_size_bytes, @@ -1117,7 +1117,7 @@ TEST(HipcubDeviceScanTests, LargeIndicesInclusiveScan) temp_storage_size_bytes, input_begin, output_it, - ::hipcub::Sum(), + test_utils::plus{}, size, stream)); @@ -1133,7 +1133,7 @@ TEST(HipcubDeviceScanTests, LargeIndicesInclusiveScan) 
temp_storage_size_bytes, input_begin, output_it, - ::hipcub::Sum(), + test_utils::plus{}, size, stream)); HIP_CHECK(hipGetLastError()); @@ -1188,7 +1188,7 @@ TEST(HipcubDeviceScanTests, LargeIndicesExclusiveScan) temp_storage_size_bytes, input_begin, output_it, - ::hipcub::Sum(), + test_utils::plus{}, initial_value, size, stream)); @@ -1205,7 +1205,7 @@ TEST(HipcubDeviceScanTests, LargeIndicesExclusiveScan) temp_storage_size_bytes, input_begin, output_it, - ::hipcub::Sum(), + test_utils::plus{}, initial_value, size, stream)); diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp index 467c2447b4b..e1e80ffcef3 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_segmented_reduce.cpp @@ -32,7 +32,7 @@ template, - params1, + params1, + params1, params1, params1, - params1, + params1, params1, - params1, + params1, params1, params1, - params1>; + params1>; TYPED_TEST_SUITE(HipcubDeviceSegmentedReduceOp, Params1); @@ -278,7 +278,7 @@ TYPED_TEST(HipcubDeviceSegmentedReduce, Sum) using input_type = typename TestFixture::params::input_type; using output_type = typename TestFixture::params::output_type; - using reduce_op_type = typename hipcub::Sum; + using reduce_op_type = typename test_utils::plus; using result_type = output_type; using offset_type = unsigned int; @@ -1145,7 +1145,7 @@ TEST(HipcubDeviceSegmentedReduceLargeIndicesTests, LargeIndices) using input_type = T; using output_type = T; using IteratorType = test_utils::counting_iterator; - using reduce_op_type = typename hipcub::Sum; + using reduce_op_type = typename test_utils::plus; using offset_type = T; const input_type init = input_type(0); diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp index e63926b234e..cdccddab541 100644 --- 
a/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp @@ -690,7 +690,7 @@ TYPED_TEST(HipcubDeviceSelectTests, Unique) test_utils::host_inclusive_scan(input01.begin(), input01.end(), input.begin(), - hipcub::Sum()); + test_utils::plus{}); } // Allocate and copy to device @@ -1069,7 +1069,7 @@ TYPED_TEST(HipcubDeviceUniqueByKeyTests, UniqueByKey) test_utils::host_inclusive_scan(input01.begin(), input01.end(), input_keys.begin(), - hipcub::Sum()); + test_utils::plus{}); } const auto input_values diff --git a/projects/hipcub/test/hipcub/test_hipcub_grid.cpp b/projects/hipcub/test/hipcub/test_hipcub_grid.cpp index be9c116ee56..62783e2f8e1 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_grid.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_grid.cpp @@ -58,7 +58,7 @@ __global__ void KernelGridEvenShare( T value = device_output[index]; - value = breduce_t(temp_storage).Reduce(value, hipcub::Sum()); + value = breduce_t(temp_storage).Reduce(value, test_utils::plus{}); if(hipThreadIdx_x == 0) { device_output_reductions[hipBlockIdx_x] = value; @@ -179,7 +179,8 @@ __global__ void KernelGridQueue( int32_t index = block_tile_index * BlockSize + hipThreadIdx_x; T value = device_output[index]; - value = breduce_t(temp_storage).Reduce(value, hipcub::Sum()); + + value = breduce_t(temp_storage).Reduce(value, test_utils::plus{}); if(hipThreadIdx_x == 0) { diff --git a/projects/hipcub/test/hipcub/test_hipcub_single_pass_scan_operators.cpp b/projects/hipcub/test/hipcub/test_hipcub_single_pass_scan_operators.cpp index 9aaf38cc002..4b876305da1 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_single_pass_scan_operators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_single_pass_scan_operators.cpp @@ -34,7 +34,7 @@ #include #include -template +template struct custom_key_value_pair_op { using type = hipcub::KeyValuePair; @@ -135,7 +135,7 @@ static void PrefixKernel(TileState tile_state, T* d_input, T* d_output) 
template, int BlockSize = 64, - typename ScanOp = hipcub::Sum> + typename ScanOp = test_utils::plus> struct SinglePassScanRunner { void run(int num_items, T* d_input, T* d_output) @@ -166,7 +166,7 @@ struct custom_scan_tile_state : hipcub::ScanTileState template, - typename ScanOp = hipcub::Sum> + typename ScanOp = test_utils::plus> struct SinglePassScanParams { using type = T; @@ -281,7 +281,7 @@ static void RunningPrefixKernel(T* d_input, T* d_output) } } -template +template struct RunningPrefixRunner { void run(T* d_input, T* d_output) diff --git a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp index 9a26fb18650..b6cb3438a35 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp @@ -162,7 +162,7 @@ TYPED_TEST(HipcubThreadOperatorsTests, Sum) { using input_type = typename TestFixture::input_type; using output_type = typename TestFixture::output_type; - using Sum = typename AlgebraicSelector::type; + using Sum = typename AlgebraicSelector::type; for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { @@ -462,8 +462,8 @@ TYPED_TEST(HipcubNCThreadOperatorsTests, SwizzleScanOp) std::iota(h_input.begin(), h_input.end(), static_cast(1)); // Scan function: SwizzleScanOp. - hipcub::Sum sum_op{}; - hipcub::SwizzleScanOp scan_op(sum_op); + test_utils::plus sum_op{}; + hipcub::SwizzleScanOp scan_op(sum_op); // Calculate expected results on host. std::vector h_expected(input_size); @@ -507,8 +507,8 @@ TYPED_TEST(HipcubNCThreadOperatorsTests, ReduceBySegmentOp) } // Reduce and scan operators. - hipcub::Sum sum_op{}; - hipcub::ReduceBySegmentOp op(sum_op); + test_utils::plus sum_op{}; + hipcub::ReduceBySegmentOp op(sum_op); // Calculate expected results on host. std::vector expected{}; @@ -588,8 +588,8 @@ TYPED_TEST(HipcubNCThreadOperatorsTests, ReduceByKeyOp) } // Reduce operators. 
- hipcub::Sum sum_op; - hipcub::ReduceByKeyOp op{}; + test_utils::plus sum_op; + hipcub::ReduceByKeyOp op{}; // Calculate output on host. std::vector h_output(h_unique_keys); diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_reduce.cpp index ea1f69aff3e..c449d91b6fc 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_reduce.cpp @@ -123,7 +123,9 @@ auto warp_reduce_kernel(T* device_input, T* device_output) -> using wreduce_t = hipcub::WarpReduce; __shared__ typename wreduce_t::TempStorage storage[warps_no]; - auto reduce_op = hipcub::Sum(); + + auto reduce_op = test_utils::plus{}; + value = wreduce_t(storage[warp_id]).Reduce(value, reduce_op); if (hipThreadIdx_x % LogicalWarpSize == 0) @@ -281,7 +283,9 @@ auto warp_reduce_valid_kernel(T* device_input, T* device_output, const int valid using wreduce_t = hipcub::WarpReduce; __shared__ typename wreduce_t::TempStorage storage[warps_no]; - auto reduce_op = hipcub::Sum(); + + auto reduce_op = test_utils::plus{}; + value = wreduce_t(storage[warp_id]).Reduce(value, reduce_op, valid); if (hipThreadIdx_x % LogicalWarpSize == 0) @@ -650,7 +654,9 @@ auto tail_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) -> using wreduce_t = hipcub::WarpReduce; __shared__ typename wreduce_t::TempStorage storage[warps_no]; - auto reduce_op = hipcub::Sum(); + + auto reduce_op = test_utils::plus{}; + value = wreduce_t(storage[warp_id]).TailSegmentedReduce(value, flag, reduce_op); output[index] = value; diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_scan.cpp index 55014c51a66..7ada07a406e 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_scan.cpp @@ -122,7 +122,8 @@ auto warp_inclusive_scan_kernel(T* device_input, T* device_output) using wscan_t = hipcub::WarpScan; 
__shared__ typename wscan_t::TempStorage storage[warps_no]; - auto scan_op = hipcub::Sum(); + + auto scan_op = test_utils::plus{}; wscan_t(storage[warp_id]).InclusiveScan(value, value, scan_op); device_output[index] = value; @@ -273,7 +274,7 @@ auto warp_inclusive_scan_initial_value_kernel(T* device_input, T* device_output, using wscan_t = hipcub::WarpScan; __shared__ typename wscan_t::TempStorage storage[warps_no]; - auto scan_op = hipcub::Sum(); + auto scan_op = test_utils::plus{}; wscan_t(storage[warp_id]).InclusiveScan(value, value, initial_value, scan_op); device_output[index] = value; @@ -446,7 +447,7 @@ auto warp_inclusive_scan_reduce_kernel(T* device_input, __shared__ typename wscan_t::TempStorage storage[warps_no]; if(hipBlockIdx_x%2 == 0) { - auto scan_op = hipcub::Sum(); + auto scan_op = test_utils::plus{}; wscan_t(storage[warp_id]).InclusiveScan(value, value, scan_op, reduction); } else @@ -631,7 +632,8 @@ auto warp_inclusive_scan_reduce_initial_value_kernel(T* device_input, using wscan_t = hipcub::WarpScan; __shared__ typename wscan_t::TempStorage storage[warps_no]; - wscan_t(storage[warp_id]).InclusiveScan(value, value, initial_value, hipcub::Sum(), reduction); + wscan_t(storage[warp_id]) + .InclusiveScan(value, value, initial_value, test_utils::plus{}, reduction); device_output[index] = value; if((hipThreadIdx_x % LogicalWarpSize) == 0) @@ -818,7 +820,8 @@ auto warp_exclusive_scan_kernel(T* device_input, T* device_output, T init) using wscan_t = hipcub::WarpScan; __shared__ typename wscan_t::TempStorage storage[warps_no]; - auto scan_op = hipcub::Sum(); + + auto scan_op = test_utils::plus{}; wscan_t(storage[warp_id]).ExclusiveScan(value, value, init, scan_op); device_output[index] = value; @@ -976,7 +979,8 @@ auto warp_exclusive_scan_reduce_kernel(T* device_input, using wscan_t = hipcub::WarpScan; __shared__ typename wscan_t::TempStorage storage[warps_no]; - auto scan_op = hipcub::Sum(); + + auto scan_op = test_utils::plus{}; 
wscan_t(storage[warp_id]).ExclusiveScan(value, value, init, scan_op, reduction); device_output[index] = value; @@ -1170,7 +1174,8 @@ auto warp_scan_kernel(T* device_input, using wscan_t = hipcub::WarpScan; __shared__ typename wscan_t::TempStorage storage[warps_no]; - auto scan_op = hipcub::Sum(); + + auto scan_op = test_utils::plus{}; wscan_t(storage[warp_id]).Scan(input, inclusive_output, exclusive_output, init, scan_op); device_inclusive_output[index] = inclusive_output; diff --git a/projects/hipcub/test/hipcub/test_utils_functional.hpp b/projects/hipcub/test/hipcub/test_utils_functional.hpp index 25776b8c1f5..d74214479dd 100644 --- a/projects/hipcub/test/hipcub/test_utils_functional.hpp +++ b/projects/hipcub/test/hipcub/test_utils_functional.hpp @@ -94,9 +94,9 @@ struct not_equal struct plus { - template + template HIPCUB_HOST_DEVICE - inline constexpr T operator()(const T& a, const T& b) const + inline constexpr auto operator()(const T& a, const U& b) const -> decltype(a + b) { return a + b; } @@ -104,9 +104,9 @@ struct plus struct minus { - template + template HIPCUB_HOST_DEVICE - inline constexpr T operator()(const T& a, const T& b) const + inline constexpr auto operator()(const T& a, const U& b) const -> decltype(a - b) { return a - b; } @@ -114,9 +114,9 @@ struct minus struct multiplies { - template + template HIPCUB_HOST_DEVICE - inline constexpr T operator()(const T& a, const T& b) const + inline constexpr auto operator()(const T& a, const U& b) const -> decltype(a * b) { return a * b; } diff --git a/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp b/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp index 630c79b2b77..dce0155b29c 100644 --- a/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp +++ b/projects/hipcub/test/hipcub/test_utils_thread_operators.hpp @@ -73,7 +73,7 @@ struct ExtendedFloatBoolOp }; /** - * \brief ExtendedFloatBinOp general functor - Because hipcub::Sum(), Difference(), Division(), + * \brief 
ExtendedFloatBinOp general functor - Because test_utils::plus{}, Difference(), Division(), * Max() and Min() don't work with input types , * and * and . From 06b97617b46d4ce28527412dd749fabde4828aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 5 Mar 2026 09:19:38 +0000 Subject: [PATCH 86/95] Remove headers that include internally used thread operators --- projects/hipcub/benchmark/benchmark_block_reduce.cpp | 1 - projects/hipcub/benchmark/benchmark_block_scan.cpp | 1 - projects/hipcub/benchmark/benchmark_device_reduce.cpp | 6 +++--- .../hipcub/benchmark/benchmark_device_reduce_by_key.cpp | 4 ++-- projects/hipcub/benchmark/benchmark_device_scan.cpp | 3 +-- .../hipcub/benchmark/benchmark_device_segmented_reduce.cpp | 6 +++--- projects/hipcub/benchmark/benchmark_device_select.cpp | 1 - projects/hipcub/benchmark/benchmark_warp_reduce.cpp | 1 - projects/hipcub/benchmark/benchmark_warp_scan.cpp | 1 - projects/hipcub/benchmark/common_benchmark_header.hpp | 1 - .../backend/cub/device/device_adjacent_difference.hpp | 1 - .../include/hipcub/backend/cub/device/device_scan.hpp | 2 +- .../include/hipcub/backend/rocprim/block/block_scan.hpp | 2 -- .../include/hipcub/backend/rocprim/block/block_shuffle.hpp | 4 +--- .../backend/rocprim/device/device_adjacent_difference.hpp | 1 - .../include/hipcub/backend/rocprim/device/device_for.hpp | 1 - .../include/hipcub/backend/rocprim/device/device_select.hpp | 2 -- .../hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp | 1 - .../include/hipcub/backend/rocprim/warp/warp_reduce.hpp | 3 +-- .../include/hipcub/backend/rocprim/warp/warp_scan.hpp | 3 +-- .../test/hipcub/test_hipcub_block_adjacent_difference.cpp | 3 +-- .../hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp | 1 - projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp | 1 - .../test/hipcub/test_hipcub_block_run_length_decode.cpp | 1 - projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp | 1 - 
.../test/hipcub/test_hipcub_device_adjacent_difference.cpp | 1 - projects/hipcub/test/hipcub/test_hipcub_device_copy.cpp | 1 - projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp | 1 - .../hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp | 1 - projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp | 1 - projects/hipcub/test/hipcub/test_hipcub_device_select.cpp | 1 - projects/hipcub/test/hipcub/test_hipcub_grid.cpp | 1 - .../test/hipcub/test_hipcub_single_pass_scan_operators.cpp | 1 - projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp | 1 - projects/hipcub/test/hipcub/test_hipcub_warp_reduce.cpp | 1 - projects/hipcub/test/hipcub/test_hipcub_warp_scan.cpp | 1 - 36 files changed, 14 insertions(+), 49 deletions(-) diff --git a/projects/hipcub/benchmark/benchmark_block_reduce.cpp b/projects/hipcub/benchmark/benchmark_block_reduce.cpp index c74be6d25d6..53b27b32c32 100644 --- a/projects/hipcub/benchmark/benchmark_block_reduce.cpp +++ b/projects/hipcub/benchmark/benchmark_block_reduce.cpp @@ -24,7 +24,6 @@ // HIP API #include -#include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; diff --git a/projects/hipcub/benchmark/benchmark_block_scan.cpp b/projects/hipcub/benchmark/benchmark_block_scan.cpp index 9eaeeec098c..7732e61c0fd 100644 --- a/projects/hipcub/benchmark/benchmark_block_scan.cpp +++ b/projects/hipcub/benchmark/benchmark_block_scan.cpp @@ -24,7 +24,6 @@ // hipCUB API #include -#include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; diff --git a/projects/hipcub/benchmark/benchmark_device_reduce.cpp b/projects/hipcub/benchmark/benchmark_device_reduce.cpp index 16c6c835632..22f38c7d0e6 100644 --- a/projects/hipcub/benchmark/benchmark_device_reduce.cpp +++ b/projects/hipcub/benchmark/benchmark_device_reduce.cpp @@ -89,7 +89,7 @@ template struct Benchmark; template -struct Benchmark +struct Benchmark { static void run(benchmark::State& state, size_t size, const hipStream_t stream) { @@ -166,8 +166,8 @@ int main(int 
argc, char* argv[]) // Add benchmarks std::vector benchmarks = { - CREATE_BENCHMARKS(hipcub::Sum), - CREATE_BENCHMARK(custom_double2, hipcub::Sum), + CREATE_BENCHMARKS(benchmark_utils::plus), + CREATE_BENCHMARK(custom_double2, benchmark_utils::plus), CREATE_BENCHMARKS(benchmark_utils::minimum), #ifdef HIPCUB_ROCPRIM_API CREATE_BENCHMARK(custom_double2, benchmark_utils::minimum), diff --git a/projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp b/projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp index dce6ce8c6cc..d57d6249392 100644 --- a/projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp +++ b/projects/hipcub/benchmark/benchmark_device_reduce_by_key.cpp @@ -187,8 +187,8 @@ void add_benchmarks(size_t max_length, using custom_double2 = benchmark_utils::custom_type; std::vector bs = { - CREATE_BENCHMARKS(hipcub::Sum), - CREATE_BENCHMARK(long long, custom_double2, hipcub::Sum), + CREATE_BENCHMARKS(benchmark_utils::plus), + CREATE_BENCHMARK(long long, custom_double2, benchmark_utils::plus), CREATE_BENCHMARKS(benchmark_utils::minimum), #ifdef HIPCUB_ROCPRIM_API CREATE_BENCHMARK(long long, custom_double2, benchmark_utils::minimum), diff --git a/projects/hipcub/benchmark/benchmark_device_scan.cpp b/projects/hipcub/benchmark/benchmark_device_scan.cpp index 5364b877965..54706c2ae4b 100644 --- a/projects/hipcub/benchmark/benchmark_device_scan.cpp +++ b/projects/hipcub/benchmark/benchmark_device_scan.cpp @@ -31,7 +31,6 @@ // HIP API #include #include -#include #include _HIPCUB_STD_INCLUDE(functional) @@ -353,7 +352,7 @@ int main(int argc, char* argv[]) // Add benchmarks std::vector benchmarks = { - CREATE_BENCHMARKS(hipcub::Sum), + CREATE_BENCHMARKS(benchmark_utils::plus), CREATE_BENCHMARKS(benchmark_utils::minimum), }; diff --git a/projects/hipcub/benchmark/benchmark_device_segmented_reduce.cpp b/projects/hipcub/benchmark/benchmark_device_segmented_reduce.cpp index 32fb7779552..7159de1e486 100644 --- 
a/projects/hipcub/benchmark/benchmark_device_segmented_reduce.cpp +++ b/projects/hipcub/benchmark/benchmark_device_segmented_reduce.cpp @@ -147,7 +147,7 @@ template struct Benchmark; template -struct Benchmark +struct Benchmark { static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) @@ -233,8 +233,8 @@ void add_benchmarks(std::vector& benchmarks, using custom_double2 = benchmark_utils::custom_type; std::vector bs = { - CREATE_BENCHMARKS(hipcub::Sum), - BENCHMARK_TYPE(custom_double2, hipcub::Sum), + CREATE_BENCHMARKS(benchmark_utils::plus), + BENCHMARK_TYPE(custom_double2, benchmark_utils::plus), CREATE_BENCHMARKS(benchmark_utils::minimum), #ifdef HIPCUB_ROCPRIM_API BENCHMARK_TYPE(custom_double2, benchmark_utils::minimum), diff --git a/projects/hipcub/benchmark/benchmark_device_select.cpp b/projects/hipcub/benchmark/benchmark_device_select.cpp index cbf963ea000..835789b6ffe 100644 --- a/projects/hipcub/benchmark/benchmark_device_select.cpp +++ b/projects/hipcub/benchmark/benchmark_device_select.cpp @@ -24,7 +24,6 @@ // HIP API #include -#include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; diff --git a/projects/hipcub/benchmark/benchmark_warp_reduce.cpp b/projects/hipcub/benchmark/benchmark_warp_reduce.cpp index aa830d7360c..d9391c7f85a 100644 --- a/projects/hipcub/benchmark/benchmark_warp_reduce.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_reduce.cpp @@ -23,7 +23,6 @@ #include "common_benchmark_header.hpp" // HIP API -#include #include #ifndef DEFAULT_N diff --git a/projects/hipcub/benchmark/benchmark_warp_scan.cpp b/projects/hipcub/benchmark/benchmark_warp_scan.cpp index 450df772a90..f9f80baac70 100644 --- a/projects/hipcub/benchmark/benchmark_warp_scan.cpp +++ b/projects/hipcub/benchmark/benchmark_warp_scan.cpp @@ -23,7 +23,6 @@ #include "common_benchmark_header.hpp" // HIP -#include #include #ifndef DEFAULT_N diff --git a/projects/hipcub/benchmark/common_benchmark_header.hpp 
b/projects/hipcub/benchmark/common_benchmark_header.hpp index 28693abcec6..093a0079ef1 100644 --- a/projects/hipcub/benchmark/common_benchmark_header.hpp +++ b/projects/hipcub/benchmark/common_benchmark_header.hpp @@ -42,7 +42,6 @@ #include #include -#include #include _HIPCUB_LIBCXX_INCLUDE(cmath) #include _HIPCUB_STD_INCLUDE(limits) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp index 68787062916..fb40da8575e 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_adjacent_difference.hpp @@ -31,7 +31,6 @@ #include "../../../config.hpp" #include "../../../util_deprecated.hpp" -#include "../thread/thread_operators.hpp" // for Difference #include // IWYU pragma: export diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp index af57b0f0fd7..660a6942d4f 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp @@ -32,7 +32,7 @@ #include "../../../config.hpp" #include "../../../util_deprecated.hpp" -#include "../thread/thread_operators.hpp" // for Equality +#include "../thread/thread_operators.hpp" #include // IWYU pragma: export diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_scan.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_scan.hpp index 462b32c0580..a4f40625229 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_scan.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_scan.hpp @@ -32,8 +32,6 @@ #include "../../../config.hpp" -#include "../thread/thread_operators.hpp" - #include // IWYU pragma: 
export #include _HIPCUB_STD_INCLUDE(functional) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_shuffle.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_shuffle.hpp index 8698d576754..2c591898dd0 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_shuffle.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/block_shuffle.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,8 +32,6 @@ #include "../../../config.hpp" -#include "../thread/thread_operators.hpp" - #include // IWYU pragma: export #include diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp index 8b6ef8d6bd7..5bf19740a62 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_adjacent_difference.hpp @@ -32,7 +32,6 @@ #include "../../../config.hpp" #include "../../../util_deprecated.hpp" -#include #include // IWYU pragma: export #include _HIPCUB_STD_INCLUDE(functional) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_for.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_for.hpp index fe9a1f554eb..32a672b5d4c 100644 --- 
a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_for.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_for.hpp @@ -31,7 +31,6 @@ #include "../../../config.hpp" -#include "../thread/thread_operators.hpp" #include "../util_mdspan.hpp" #include // IWYU pragma: export diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_select.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_select.hpp index 5a521e30422..5360ed4220f 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_select.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/device/device_select.hpp @@ -33,8 +33,6 @@ #include "../../../config.hpp" #include "../../../util_deprecated.hpp" -#include "../thread/thread_operators.hpp" - #include // IWYU pragma: export #include _HIPCUB_STD_INCLUDE(functional) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp index d7ea5726334..3124a142c2f 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/hipcub.hpp @@ -82,7 +82,6 @@ // Thread #include "thread/thread_load.hpp" -#include "thread/thread_operators.hpp" #include "thread/thread_reduce.hpp" #include "thread/thread_scan.hpp" #include "thread/thread_search.hpp" diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/warp/warp_reduce.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/warp/warp_reduce.hpp index 3d6e938b733..c17896227c4 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/warp/warp_reduce.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/warp/warp_reduce.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. 
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,7 +33,6 @@ #include "../../../config.hpp" #include "../util_ptx.hpp" -#include "../thread/thread_operators.hpp" #include // IWYU pragma: export diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/warp/warp_scan.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/warp/warp_scan.hpp index 60a20e6ecb9..b5c6071d6c7 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/warp/warp_scan.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/warp/warp_scan.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2025, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2026, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,7 +33,6 @@ #include "../../../config.hpp" #include "../util_ptx.hpp" -#include "../thread/thread_operators.hpp" #include // IWYU pragma: export diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp index 9d7c8f1664a..65a43392fe7 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_adjacent_difference.cpp @@ -23,11 +23,10 @@ #include "common_test_header.hpp" // required rocprim headers -#include #include #include #include -#include +#include template< class T, diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp index 35d341e1789..61e44a99023 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp @@ -26,7 +26,6 @@ #include #include #include -#include template struct params diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp index fb871a75135..11e078842c1 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp @@ -24,7 +24,6 @@ // hipcub API #include -#include // Params for tests template #include #include -#include template -#include // Params for tests template -#include #include "test_utils.hpp" #include "test_utils_data_generation.hpp" diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_copy.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_copy.cpp index 4fc2eabe36a..f9a3cf1d4f2 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_copy.cpp +++ 
b/projects/hipcub/test/hipcub/test_hipcub_device_copy.cpp @@ -27,7 +27,6 @@ #include "test_utils_types.hpp" #include -#include #include #include diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp index a36066d6bb6..342214e995c 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp @@ -27,7 +27,6 @@ #include "test_utils_types.hpp" #include -#include #include #include diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp index 87355be27de..901d8f034b7 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce_by_key.cpp @@ -24,7 +24,6 @@ // hipcub API #include -#include #include "test_utils_data_generation.hpp" diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp index 53d0417d1eb..92883f15c02 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_scan.cpp @@ -24,7 +24,6 @@ // hipcub API #include -#include #include "single_index_iterator.hpp" #include "test_utils_bfloat16.hpp" diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp index cdccddab541..4e346c6522b 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_select.cpp @@ -24,7 +24,6 @@ // hipcub API #include -#include #include "single_index_iterator.hpp" #include "test_utils_bfloat16.hpp" diff --git a/projects/hipcub/test/hipcub/test_hipcub_grid.cpp b/projects/hipcub/test/hipcub/test_hipcub_grid.cpp index 62783e2f8e1..e9f38a58509 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_grid.cpp +++ 
b/projects/hipcub/test/hipcub/test_hipcub_grid.cpp @@ -30,7 +30,6 @@ #include "common_test_header.hpp" #include -#include #include #include diff --git a/projects/hipcub/test/hipcub/test_hipcub_single_pass_scan_operators.cpp b/projects/hipcub/test/hipcub/test_hipcub_single_pass_scan_operators.cpp index 4b876305da1..7bb09a7ba3b 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_single_pass_scan_operators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_single_pass_scan_operators.cpp @@ -25,7 +25,6 @@ #include #include -#include #include diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp index d0325d40d2a..25bd35263d5 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_exchange.cpp @@ -24,7 +24,6 @@ #include "test_utils_data_generation.hpp" #include "test_utils_half.hpp" -#include #include #include diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_reduce.cpp index c449d91b6fc..0f36220cc98 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_reduce.cpp @@ -22,7 +22,6 @@ #include "common_test_header.hpp" -#include #include #include diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_scan.cpp index 7ada07a406e..4d801be1bf6 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_scan.cpp @@ -22,7 +22,6 @@ #include "common_test_header.hpp" -#include #include #include From 8a1d5bd9806b2638a9151abef8b1e8427a5a6de8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Fri, 6 Mar 2026 17:58:00 +0000 Subject: [PATCH 87/95] Deprecate thread operators --- projects/hipcub/CHANGELOG.md | 40 ++++++++++++------- .../backend/cub/thread/thread_operators.hpp | 29 
+++++++------- .../rocprim/thread/thread_operators.hpp | 34 ++++++++-------- 3 files changed, 56 insertions(+), 47 deletions(-) diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index b7984eb8d05..c1ea75272db 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -2,26 +2,14 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/projects/hipCUB/en/latest/](https://rocm.docs.amd.com/projects/hipCUB/en/latest/). -## Since last release ROCm 7.12 - -### Optimizations - -* Reduced build times for unit tests. - -### Resolved issues - -* Fixed more memory leak issues with some unit tests. - -## hipCUB-4.3.0 for ROCm 7.12 +## hipCUB-4.5.0 for ROCm 7.14 ### Added -* Added `generate_resource_spec.cpp` to the test directory. It is now built as a new target by CMake. It generates the resource spec file required by CTest when running tests in parallel. * Added `::hip::std` support. ### Changed -* Updated the documentation on how to run hipCUB tests on multiple GPUs in parallel. * Changed `CCCL_MINIMUM_VERSION` to `3.0.0` to align with CUB. * Add support for large num_items `DeviceMerge` and `DeviceSegmentedSort`. * Replace `#pragma unroll` by `_CCCL_PRAGMA_UNROLL_FULL()` and `_CCCL_PRAGMA_NOUNROLL()` by `_CCCL_PRAGMA_NOUNROLL()`. @@ -33,13 +21,37 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project * Removed `hipcub::BaseTraits::CATEGORY`, `hipcub::BaseTraits::nullptr_TYPE` and `hipcub::BaseTraits::PRIMITIVE`. * Removed `ConstantInputIterator`, `CountingInputIterator`, `DiscardOutputIterator` and `TransformInputIterator` which were deprecated in hipCUB-4.1.0. * Removed `DeviceSpmv`, which was removed from CUB after CCCL's 2.8.0 release. Use `hipSPARSE` or `rocSPARSE` libraries instead. -* Removed the `GenerateResourceSpec.cmake` script - it is replaced by the added `generate_resource_spec.cpp` code mentioned above. * Removed `GridBarrier`. 
* Removed `HIPCUB_MIN`, `HIPCUB_MAX`, `HIPCUB_QUOTIENT_FLOOR`, `HIPCUB_QUOTIENT_CEILING`, `HIPCUB_ROUND_UP_NEAREST` and `HIPCUB_ROUND_DOWN_NEAREST` which were deprecated in hipCUB-4.1.0. * Removed `LEGACY_PTX_ARCH`. * Removed `hipcub:max` and `hipcub:min`, which were deprecated. Use `hip::std::max` and `hip::std::min` instead. * Deprecated `hipcub::Swap`, use `rocprim::swap` instead. * Deprecated `HIPCUB_IS_INT128_ENABLED`, use `_CCCL_HAS_INT128()` instead. +* Deprecated `hipcub::Equality`, `hipcub::Inequality`, `hipcub::InequalityWrapper`, `hipcub::Sum`, `hipcub::Difference`, `hipcub::Division`, `hipcub::Max` and `hipcub::Min` operators. Use `hip::std::equal_to`, `hip::std::not_equal_to`, `hip::std::plus`, `hip::std::minus`, `hip::std::divides`, `hip::maximum` and `hip::minimum` operators instead. + +## Since last release ROCm 7.12 + +### Optimizations + +* Reduced build times for unit tests. + +### Resolved issues + +* Fixed more memory leak issues with some unit tests. + +## hipCUB-4.3.0 for ROCm 7.12 + +### Added + +* Added `generate_resource_spec.cpp` to the test directory. It is now built as a new target by CMake. It generates the resource spec file required by CTest when running tests in parallel. + +### Changed + +* Updated the documentation on how to run hipCUB tests on multiple GPUs in parallel. + +### Removed + +* Removed the `GenerateResourceSpec.cmake` script - it is replaced by the added `generate_resource_spec.cpp` code mentioned above. 
## hipCUB-4.2.0 for ROCm 7.2 diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/thread/thread_operators.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/thread/thread_operators.hpp index 7ce5f71061d..0918d3ff049 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/cub/thread/thread_operators.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/thread/thread_operators.hpp @@ -39,8 +39,8 @@ using accumulator_t = ::cuda::std::__accumulator_t; } // namespace detail -// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx -struct Equality +//! deprecated [Since 5.0] +struct HIPCUB_DEPRECATED_BECAUSE("Use hip::std::equal_to instead.") Equality { template HIPCUB_HOST_DEVICE @@ -50,8 +50,8 @@ struct Equality } }; -// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx -struct Inequality +//! deprecated [Since 5.0] +struct HIPCUB_DEPRECATED_BECAUSE("Use hip::std::not_equal_to instead.") Inequality { template HIPCUB_HOST_DEVICE @@ -61,8 +61,8 @@ struct Inequality } }; -// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx -struct Sum +//! deprecated [Since 5.0] +struct HIPCUB_DEPRECATED_BECAUSE("Use hip::std::plus instead.") Sum { template HIPCUB_HOST_DEVICE @@ -72,8 +72,8 @@ struct Sum } }; -// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx -struct Difference +//! deprecated [Since 5.0] +struct HIPCUB_DEPRECATED_BECAUSE("Use hip::std::minus instead.") Difference { template HIPCUB_HOST_DEVICE @@ -83,8 +83,8 @@ struct Difference } }; -// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx -struct Division +//! 
deprecated [Since 5.0] +struct HIPCUB_DEPRECATED_BECAUSE("Use hip::std::divides instead") Division { template HIPCUB_HOST_DEVICE @@ -94,8 +94,8 @@ struct Division } }; -// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx -struct Max +//! deprecated [Since 5.0] +struct HIPCUB_DEPRECATED_BECAUSE("Use hip::maximum instead.") Max { template HIPCUB_HOST_DEVICE @@ -107,7 +107,8 @@ struct Max } }; -struct Min +//! deprecated [Since 5.0] +struct HIPCUB_DEPRECATED_BECAUSE("Use hip::minimum instead") Min { template HIPCUB_HOST_DEVICE @@ -119,7 +120,6 @@ struct Min } }; -// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx struct ArgMax { template @@ -132,7 +132,6 @@ struct ArgMax } }; -// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx struct ArgMin { template diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp index 5d9327c84e3..68be944b031 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp @@ -44,8 +44,8 @@ BEGIN_HIPCUB_NAMESPACE -// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx -struct Equality +//! deprecated [Since 5.0] +struct HIPCUB_DEPRECATED_BECAUSE("Use hip::std::equal_to instead.") Equality { template HIPCUB_HOST_DEVICE inline constexpr bool operator()(T&& t, U&& u) const @@ -54,8 +54,8 @@ struct Equality } }; -// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx -struct Inequality +//! 
deprecated [Since 5.0] +struct HIPCUB_DEPRECATED_BECAUSE("Use hip::std::not_equal_to instead.") Inequality { template HIPCUB_HOST_DEVICE inline constexpr bool operator()(T&& t, U&& u) const @@ -64,9 +64,9 @@ struct Inequality } }; -// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx +//! deprecated [Since 5.0] template -struct InequalityWrapper +struct HIPCUB_DEPRECATED_BECAUSE("Use hip::std::not_equal_to instead.") InequalityWrapper { EqualityOp op; @@ -80,8 +80,8 @@ struct InequalityWrapper } }; -// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx -struct Sum +//! deprecated [Since 5.0] +struct HIPCUB_DEPRECATED_BECAUSE("Use hip::std::plus instead.") Sum { template HIPCUB_HOST_DEVICE inline constexpr auto operator()(T&& t, U&& u) const -> decltype(auto) @@ -90,8 +90,8 @@ struct Sum } }; -// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx -struct Difference +//! deprecated [Since 5.0] +struct HIPCUB_DEPRECATED_BECAUSE("Use hip::std::minus instead") Difference { template HIPCUB_HOST_DEVICE inline constexpr auto operator()(T&& t, U&& u) const -> decltype(auto) @@ -100,8 +100,8 @@ struct Difference } }; -// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx -struct Division +//! deprecated [Since 5.0] +struct HIPCUB_DEPRECATED_BECAUSE("Use hip::std::divides instead") Division { template HIPCUB_HOST_DEVICE inline constexpr auto operator()(T&& t, U&& u) const -> decltype(auto) @@ -110,26 +110,24 @@ struct Division } }; -// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx -struct Max +//! 
deprecated [Since 5.0] +struct HIPCUB_DEPRECATED_BECAUSE("Use hip::maximum instead.") Max { template HIPCUB_HOST_DEVICE inline constexpr typename std::common_type::type operator()(T&& t, U&& u) const { - // TODO: change to use hip::std::max after libhipcxx is ready return (((u) > (t)) ? (u) : (t)); } }; -// TODO: this is deprecated in cub, we should also mark this as deprecated when we have libhipcxx -struct Min +//! deprecated [Since 5.0] +struct HIPCUB_DEPRECATED_BECAUSE("Use hip::minimum instead.") Min { template HIPCUB_HOST_DEVICE inline constexpr typename std::common_type::type operator()(T&& t, U&& u) const { - // TODO: change to use hip::std::min after libhipcxx is ready return (((u) < (t)) ? (u) : (t)); } }; From 2b6783a492f1971399e3978bf1af22b6b4eda881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 9 Mar 2026 13:38:11 +0000 Subject: [PATCH 88/95] Remove tests of deprecated thread operators --- .../hipcub/test_hipcub_thread_operators.cpp | 212 ------------------ 1 file changed, 212 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp index b6cb3438a35..db3b16407e3 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp @@ -81,218 +81,6 @@ TYPED_TEST_SUITE(HipcubThreadOperatorsTests, ThreadOperatorsParameters); // Commutative operators tests. -/// \brief Shared code for equality/inequality operators. -template -void equality_op_test(ScanOpT op, bool equality) -{ - for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) - { - // Generate random input value. - unsigned int seed_value - = seed_index < random_seeds_count ? 
rand() : seeds[seed_index - random_seeds_count]; - SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); - const InputT input_val - = test_utils::get_random_data(1, 1.0f, 100.0f, seed_value)[0]; - - OutputT output_val{}; - - ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(op(input_val, input_val), equality)); - ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(op(output_val, output_val), equality)); - ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(op(output_val, input_val), !equality)); - - output_val = OutputT(input_val); - - ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(op(output_val, input_val), equality)); - } -} - -TYPED_TEST(HipcubThreadOperatorsTests, Equality) -{ - using input_type = typename TestFixture::input_type; - using output_type = typename TestFixture::output_type; - - using Equality = typename EqualitySelector::type; - Equality op{}; - - equality_op_test(op, true); -} - -TYPED_TEST(HipcubThreadOperatorsTests, Inequality) -{ - using input_type = typename TestFixture::input_type; - using output_type = typename TestFixture::output_type; - - using Inequality = - typename EqualitySelector::type; - Inequality op{}; - - equality_op_test(op, false); -} - -TYPED_TEST(HipcubThreadOperatorsTests, InequalityWrapper) -{ - using input_type = typename TestFixture::input_type; - using output_type = typename TestFixture::output_type; - - using Equality = typename EqualitySelector::type; - Equality wrapped_op{}; - auto op = _HIPCUB_STD::not_fn(wrapped_op); - - equality_op_test(op, false); -} - -/// \brief Shared code for algebraic operators. -template -void algebraic_op_test(const InputT input_val, OutputT init_val) -{ - using accum_type = hipcub::detail::accumulator_t; - - ScanOpT op{}; - - accum_type output_val = init_val; - - // Check result. - ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(op(init_val, input_val), output_val)); - - // Check return type. 
- ASSERT_NO_FATAL_FAILURE(test_utils::assert_type(op(init_val, input_val), output_val)); -} - -TYPED_TEST(HipcubThreadOperatorsTests, Sum) -{ - using input_type = typename TestFixture::input_type; - using output_type = typename TestFixture::output_type; - using Sum = typename AlgebraicSelector::type; - - for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) - { - // Generate random initial value. - unsigned int seed_value - = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; - SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); - output_type init_val - = test_utils::get_random_data(1, 1.0f, 100.0f, seed_value)[0]; - - algebraic_op_test(input_type{}, init_val); - } -} - -TYPED_TEST(HipcubThreadOperatorsTests, Difference) -{ - using input_type = typename TestFixture::input_type; - using output_type = typename TestFixture::output_type; - using Difference = - typename AlgebraicSelector::type; - - for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) - { - // Generate random initial value. - unsigned int seed_value - = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; - SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); - output_type init_val - = test_utils::get_random_data(1, 1.0f, 100.0f, seed_value)[0]; - - algebraic_op_test(input_type{}, init_val); - } -} - -// Division operator is not defined for custom_test_type. 
-template -class HipcubDivisionOperatorTests : public ::testing::Test -{ -public: - using input_type = typename Params::input_type; - using output_type = typename Params::output_type; -}; - -using DivisionOperatorParameters = ::testing::Types< - ThreadOperatorsParams, - ThreadOperatorsParams, - ThreadOperatorsParams, - ThreadOperatorsParams, - ThreadOperatorsParams, - ThreadOperatorsParams, - ThreadOperatorsParams, - ThreadOperatorsParams, - ThreadOperatorsParams, - ThreadOperatorsParams, - ThreadOperatorsParams, - ThreadOperatorsParams, - ThreadOperatorsParams -#ifdef __HIP_PLATFORM_AMD__ - , - ThreadOperatorsParams, // Doesn't work on NVIDIA / CUB - ThreadOperatorsParams // Doesn't work on NVIDIA / CUB -#endif - >; -TYPED_TEST_SUITE(HipcubDivisionOperatorTests, DivisionOperatorParameters); - -TYPED_TEST(HipcubDivisionOperatorTests, Division) -{ - using input_type = typename TestFixture::input_type; - using output_type = typename TestFixture::output_type; - using Division = typename AlgebraicSelector::type; - - for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) - { - // Generate random input value. - unsigned int seed_value - = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; - SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); - input_type input_val - = test_utils::get_random_data(1, 1.0f, 100.0f, seed_value)[0]; - - algebraic_op_test(input_val, output_type{}); - } -} - -/// \brief Shared code for min/max operators. -template -void minmax_op_test(bool is_max) -{ - ScanOpT op{}; - - for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) - { - // Generate random initial and input values. - unsigned int seed_value - = seed_index < random_seeds_count ? 
rand() : seeds[seed_index - random_seeds_count]; - SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); - OutputT init_val = test_utils::get_random_data(1, 1.0f, 100.0f, seed_value)[0]; - InputT input_val = test_utils::get_random_data(1, 1.0f, 100.0f, seed_value)[0]; - - AccumT output_val - = is_max ? test_utils::max(init_val, input_val) : test_utils::min(init_val, input_val); - - // Check result. - ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(op(init_val, input_val), output_val)); - - // Check return type. - ASSERT_NO_FATAL_FAILURE(test_utils::assert_type(op(init_val, input_val), output_val)); - } -} - -TYPED_TEST(HipcubThreadOperatorsTests, Max) -{ - using input_type = typename TestFixture::input_type; - using output_type = typename TestFixture::output_type; - using accum_type = typename std::common_type::type; - using Max = typename MaxSelector::type; - - minmax_op_test(true); -} - -TYPED_TEST(HipcubThreadOperatorsTests, Min) -{ - using input_type = typename TestFixture::input_type; - using output_type = typename TestFixture::output_type; - using accum_type = typename std::common_type::type; - using Min = typename MinSelector::type; - - minmax_op_test(false); -} - /// \brief Shared code for ArgMin/ArgMax operators. 
template void arg_op_test(bool is_max) From c6f8e0ac6cce7f09279941fcbd4db4df4ce4a24f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 9 Mar 2026 16:00:34 +0000 Subject: [PATCH 89/95] Deprecate and replace `hipcub::BFE` --- .../rocprim/block/radix_rank_sort_operations.hpp | 16 +++++++++++++++- .../include/hipcub/backend/rocprim/util_ptx.hpp | 9 ++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp index 74c0fd9df64..4cb0033ff10 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/block/radix_rank_sort_operations.hpp @@ -43,6 +43,8 @@ #include // IWYU pragma: export #include // IWYU pragma: export +#include _HIPCUB_LIBCXX_INCLUDE(bit) + #include BEGIN_HIPCUB_NAMESPACE @@ -128,7 +130,19 @@ struct RadixSortTwiddle __device__ __forceinline__ uint32_t Digit(UnsignedBits key) { - return BFE(this->ProcessFloatMinusZero(key), bit_start, num_bits); + HIPCUB_CLANG_SUPPRESS_DEPRECATED_PUSH + + uint32_t result = +#if _HIPCUB_HAS_DEVICE_SYSTEM_STD + _HIPCUB_LIBCXX::bitfield_extract +#else + BFE +#endif + (this->ProcessFloatMinusZero(key), bit_start, num_bits); + + HIPCUB_CLANG_SUPPRESS_DEPRECATED_POP + + return result; } }; diff --git a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp index 7675c74e9fd..b4484569c53 100644 --- a/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp +++ b/projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp @@ -155,9 +155,11 @@ HIPCUB_FORCEINLINE auto // Extracts \p num_bits from \p source starting at bit-offset \p bit_start. 
// The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type. template +//! deprecated [Since 5.0] +HIPCUB_DEPRECATED_BECAUSE("Use hip::bitfield_extract()") HIPCUB_DEVICE -HIPCUB_FORCEINLINE unsigned int - BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits) +HIPCUB_FORCEINLINE + unsigned int BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits) { static_assert(std::is_unsigned::value, "UnsignedBits must be unsigned"); return detail::unsigned_bit_extract(source, bit_start, num_bits); @@ -168,7 +170,8 @@ HIPCUB_FORCEINLINE unsigned int * Bitfield-extract for 128-bit types. */ template -HIPCUB_DEVICE +//! deprecated [Since 5.0] +HIPCUB_DEPRECATED_BECAUSE("Use hip::bitfield_extract()") HIPCUB_DEVICE HIPCUB_FORCEINLINE unsigned int BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits, From eff957060a5c2f7dba29fad4884a4b2c4e7dd6fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Mon, 9 Mar 2026 14:19:32 +0000 Subject: [PATCH 90/95] Fix libhipcxx version check --- .../hipcub/hipcub/include/hipcub/libcxx.hpp | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/projects/hipcub/hipcub/include/hipcub/libcxx.hpp b/projects/hipcub/hipcub/include/hipcub/libcxx.hpp index 65ee721b5d0..7ab49d3969f 100644 --- a/projects/hipcub/hipcub/include/hipcub/libcxx.hpp +++ b/projects/hipcub/hipcub/include/hipcub/libcxx.hpp @@ -20,6 +20,9 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#ifndef HIPCUB_LIBCXX_HPP_ +#define HIPCUB_LIBCXX_HPP_ + #pragma once // This is a utility file that helps managing which @@ -31,11 +34,14 @@ // #include _HIPCUB_STD_INCLUDE(optional) // using optional_int = _HIPCUB_STD::optional; -// Version that we depend on. We can ignore patch for now -// since we're only interested in breaking (major) and -// features (minor). 
-#define _HIPCUB_REQUIRED_LIBCXX_VERSION_MAJOR 2 -#define _HIPCUB_REQUIRED_LIBCXX_VERSION_MINOR 8 +// Minimum version that we depend on. +#define _HIPCUB_REQUIRED_LIBCXX_VERSION_MAJOR 3 +#define _HIPCUB_REQUIRED_LIBCXX_VERSION_MINOR 0 +#define _HIPCUB_REQUIRED_LIBCXX_VERSION_PATCH 0 + +#define _HIPCUB_REQUIRED_LIBCXX_VERSION \ + _HIPCUB_REQUIRED_LIBCXX_VERSION_MAJOR * 1000000 + _HIPCUB_REQUIRED_LIBCXX_VERSION_MINOR * 1000 \ + + _HIPCUB_REQUIRED_LIBCXX_VERSION_PATCH #ifdef __has_include #define HIPCUB_HAS_INCLUDE(_X) __has_include(_X) @@ -52,9 +58,7 @@ #if HIPCUB_HAS_INCLUDE() #include // If version matches and '_CUDA_VSTD' is available. - #if _LIBCUDACXX_CUDA_API_VERSION_MAJOR == _HIPCUB_REQUIRED_LIBCXX_VERSION_MAJOR \ - && _LIBCUDACXX_CUDA_API_VERSION_MINOR >= _HIPCUB_REQUIRED_LIBCXX_VERSION_MINOR \ - && defined(_CUDA_VSTD) + #if defined(_LIBCUDACXX_CUDA_API_VERSION) && (_LIBCUDACXX_CUDA_API_VERSION >= _HIPCUB_REQUIRED_LIBCXX_VERSION) && defined(_CUDA_VSTD) #define _HIPCUB_LIBCXX_INCLUDE(LIB) _HIPCUB_STRINGIFY(cuda/LIB) #define _HIPCUB_STD_INCLUDE(LIB) _HIPCUB_STRINGIFY(cuda/std/LIB) #define _HIPCUB_LIBCXX ::cuda @@ -63,14 +67,12 @@ #define _HIPCUB_STD_NAMESPACE_BEGIN _LIBCUDACXX_BEGIN_NAMESPACE_STD #define _HIPCUB_STD_NAMESPACE_END _LIBCUDACXX_END_NAMESPACE_STD #endif - +#endif // Otherwise, if the '::hip::std' namespace from 'libhipcxx' is available. -#elif HIPCUB_HAS_INCLUDE() +#if !defined(_HIPCUB_HAS_DEVICE_SYSTEM_STD) && HIPCUB_HAS_INCLUDE() #include // If version matches and '_CUDA_VSTD' is available. 
- #if _LIBCUDACXX_CUDA_API_VERSION_MAJOR == _HIPCUB_REQUIRED_LIBCXX_VERSION_MINOR \ - && _LIBCUDACXX_CUDA_API_VERSION_MINOR >= _HIPCUB_REQUIRED_LIBCXX_VERSION_MINOR \ - && defined(_CUDA_VSTD) + #if defined(_LIBCUDACXX_CUDA_API_VERSION) && (_LIBCUDACXX_CUDA_API_VERSION >= _HIPCUB_REQUIRED_LIBCXX_VERSION) && defined(_CUDA_VSTD) #define _HIPCUB_LIBCXX_INCLUDE(LIB) _HIPCUB_STRINGIFY(hip/LIB) #define _HIPCUB_STD_INCLUDE(LIB) _HIPCUB_STRINGIFY(hip/std/LIB) // In 'libhipcxx' the '::hip' namespace is synonymous with '::cuda'. @@ -97,3 +99,5 @@ #endif // clang-format on + +#endif // HIPCUB_LIBCXX_HPP_ From 02eaac04d051c466349ac8632af07adeadfc28a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Tue, 7 Apr 2026 15:09:28 +0000 Subject: [PATCH 91/95] Add sort_last for custom test type --- .../test/hipcub/test_hipcub_warp_merge_sort.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp b/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp index 1d0b7a888e7..c5aab18cf11 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_warp_merge_sort.cpp @@ -209,6 +209,20 @@ struct sort_last { static constexpr T value = _HIPCUB_STD::numeric_limits::lowest(); }; +template +struct sort_last> +{ + static constexpr test_utils::custom_test_type value = test_utils::custom_test_type( + _HIPCUB_STD::numeric_limits::max(), _HIPCUB_STD::numeric_limits::max()); +}; + +template +struct sort_last> +{ + static constexpr test_utils::custom_test_type value = test_utils::custom_test_type( + _HIPCUB_STD::numeric_limits::lowest(), _HIPCUB_STD::numeric_limits::lowest()); +}; + template Date: Wed, 8 Apr 2026 19:36:26 +0000 Subject: [PATCH 92/95] Update device reduce test cases --- projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git 
a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp index 411a969b8f2..3a15333e1ee 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp @@ -62,14 +62,12 @@ using HipcubDeviceReduceTestsParams = ::testing::Types< DeviceReduceParams, DeviceReduceParams, DeviceReduceParams, - DeviceReduceParams + DeviceReduceParams, #ifdef __HIP_PLATFORM_AMD__ - , DeviceReduceParams, - DeviceReduceParams, test_utils::custom_test_type>, - DeviceReduceParams, test_utils::custom_test_type> #endif - >; + DeviceReduceParams, test_utils::custom_test_type>, + DeviceReduceParams, test_utils::custom_test_type>>; TYPED_TEST_SUITE(HipcubDeviceReduceTests, HipcubDeviceReduceTestsParams); From 789b2426786a9723c849a24b82a7f166937d80dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Wed, 8 Apr 2026 19:43:14 +0000 Subject: [PATCH 93/95] Add numeric_limits support for custom test types --- .../test/hipcub/test_hipcub_device_reduce.cpp | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp index 3a15333e1ee..41e3b64d51b 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_reduce.cpp @@ -69,6 +69,32 @@ using HipcubDeviceReduceTestsParams = ::testing::Types< DeviceReduceParams, test_utils::custom_test_type>, DeviceReduceParams, test_utils::custom_test_type>>; +// Device numeric_limits for custom_test_type +_HIPCUB_STD_NAMESPACE_BEGIN + +template +struct numeric_limits> +{ + static constexpr bool is_specialized = true; + + static constexpr test_utils::custom_test_type min() noexcept + { + return {_HIPCUB_STD::numeric_limits::min(), _HIPCUB_STD::numeric_limits::min()}; + } + + static constexpr test_utils::custom_test_type lowest() 
noexcept + { + return {_HIPCUB_STD::numeric_limits::lowest(), _HIPCUB_STD::numeric_limits::lowest()}; + } + + static constexpr test_utils::custom_test_type max() noexcept + { + return {_HIPCUB_STD::numeric_limits::max(), _HIPCUB_STD::numeric_limits::max()}; + } +}; + +_HIPCUB_STD_NAMESPACE_END + TYPED_TEST_SUITE(HipcubDeviceReduceTests, HipcubDeviceReduceTestsParams); TYPED_TEST(HipcubDeviceReduceTests, ReduceSum) From 1441fba21f7cfe485d4d4c6114b01efec61ce44b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= Date: Thu, 16 Apr 2026 15:23:05 +0000 Subject: [PATCH 94/95] Skip unsupported bfloat16/half histogram tests --- .../test/hipcub/test_hipcub_device_histogram.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp index 55bb120c22f..48ed8092f9f 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_histogram.cpp @@ -473,7 +473,9 @@ using Params2 = ::testing::Types< params2, params2, params2, +#if defined(__HIP_PLATFORM_AMD__) params2, +#endif params2, params2, params2, @@ -771,6 +773,14 @@ TYPED_TEST(HipcubDeviceHistogramMultiEven, MultiEven) upper_level[channel] = test_utils::convert_to_device(n_upper_level[channel]); } + // accuracy problems with bfloat and half + // nvidia cub also doesn't work + // TODO: check if nvidia works with only sample type bfloat/half + if(test_utils::is_half::value || test_utils::is_bfloat16::value) + { + GTEST_SKIP(); + } + hipStream_t stream = 0; // default if(TestFixture::params::use_graphs) { @@ -1030,7 +1040,9 @@ using Params4 = ::testing::Types< params4, params4, params4, +#if defined(__HIP_PLATFORM_AMD__) params4, +#endif params4, params4, params4, From 9b42cecb32e3264dd266283750335b6aead6f045 Mon Sep 17 00:00:00 2001 From: Nara Prasetya Date: Wed, 20 May 2026 09:34:08 +0000 Subject: [PATCH 95/95] 
feat(hipcub): add infrastructure to expose the compatible cccl version --- projects/hipcub/CMakeLists.txt | 7 ++++++- .../include/hipcub/hipcub_version.hpp.in | 19 ++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/projects/hipcub/CMakeLists.txt b/projects/hipcub/CMakeLists.txt index 2b3cef1f1d6..c0e9b5cdc57 100644 --- a/projects/hipcub/CMakeLists.txt +++ b/projects/hipcub/CMakeLists.txt @@ -28,6 +28,10 @@ cmake_policy(VERSION 3.18...3.25) # # Set the library version set(VERSION_STRING "4.4.0") +# Set the CCCL-compatible version. +set(HIPCUB_CCCL_VERSION_MAJOR 3) +set(HIPCUB_CCCL_VERSION_MINOR 0) +set(HIPCUB_CCCL_VERSION_PATCH 3) # Set the minimum required rocPRIM version set(MIN_ROCPRIM_PACKAGE_VERSION "4.1.0" CACHE STRING "Minimum version of rocPRIM to search for when ROCPRIM_FETCH_METHOD is set to PACKAGE.") # Set download branch for dependency rocPRIM @@ -167,6 +171,7 @@ endif() # Setup the library version rocm_setup_version(VERSION ${VERSION_STRING}) math(EXPR hipcub_VERSION_NUMBER "${hipcub_VERSION_MAJOR} * 100000 + ${hipcub_VERSION_MINOR} * 100 + ${hipcub_VERSION_PATCH}") +math(EXPR HIPCUB_CCCL_VERSION_NUMBER "${HIPCUB_CCCL_VERSION_MAJOR} * 100000 + ${HIPCUB_CCCL_VERSION_MINOR} * 100 + ${HIPCUB_CCCL_VERSION_PATCH}") # Find and verify HIP. include(VerifyCompiler) @@ -182,7 +187,7 @@ endif() include(CheckCXXCompilerFlag) if(BUILD_OFFLOAD_COMPRESS) - # We need to pass '-x hip' since check_cxx_compiler_flag assumes c++ and not HIP. + # We need to pass '-x hip' since check_cxx_compiler_flag assumes c++ and not HIP. 
check_cxx_compiler_flag("--offload-compress -x hip" CXX_COMPILER_SUPPORTS_OFFLOAD_COMPRESS) if(CXX_COMPILER_SUPPORTS_OFFLOAD_COMPRESS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --offload-compress") diff --git a/projects/hipcub/hipcub/include/hipcub/hipcub_version.hpp.in b/projects/hipcub/hipcub/include/hipcub/hipcub_version.hpp.in index 2f6612c8ac3..86790382e50 100644 --- a/projects/hipcub/hipcub/include/hipcub/hipcub_version.hpp.in +++ b/projects/hipcub/hipcub/include/hipcub/hipcub_version.hpp.in @@ -22,7 +22,7 @@ #define HIPCUB_VERSION_HPP_ /// \def HIPCUB_VERSION -/// \brief hipCUB library version +/// \brief The hipCUB library version. /// /// Version number may not be visible in the documentation. /// @@ -38,4 +38,21 @@ #define HIPCUB_VERSION_MINOR @hipcub_VERSION_MINOR@ #define HIPCUB_VERSION_PATCH @hipcub_VERSION_PATCH@ +/// \def HIPCUB_CCCL_VERSION +/// \brief The CCCL version hipCUB is compatible with. +/// +/// Version number may not be visible in the documentation. +/// +/// HIPCUB_CCCL_VERSION % 100 is the patch level, +/// HIPCUB_CCCL_VERSION / 100 % 1000 is the minor version, +/// HIPCUB_CCCL_VERSION / 100000 is the major version. +/// +/// For example, if HIPCUB_CCCL_VERSION is 300003, then this library +/// is compatible with CCCL 3.0.3. +#define HIPCUB_CCCL_VERSION @HIPCUB_CCCL_VERSION_NUMBER@ + +#define HIPCUB_CCCL_VERSION_MAJOR @HIPCUB_CCCL_VERSION_MAJOR@ +#define HIPCUB_CCCL_VERSION_MINOR @HIPCUB_CCCL_VERSION_MINOR@ +#define HIPCUB_CCCL_VERSION_PATCH @HIPCUB_CCCL_VERSION_PATCH@ + #endif // HIPCUB_VERSION_HPP_