diff --git a/.gitignore b/.gitignore index afc7a1adc..f7d8b7602 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ - ### Build dirs ### build/ @@ -56,4 +55,7 @@ compile_commands.json CTestTestfile.cmake build +### Gtilab CI ### +.gitlab-ci-gputest.yml + # End of https://www.gitignore.io/api/c++,cmake diff --git a/.gitlab-ci-gputest.yml b/.gitlab-ci-gputest.yml new file mode 100644 index 000000000..c3d00113c --- /dev/null +++ b/.gitlab-ci-gputest.yml @@ -0,0 +1,14 @@ +test:rocm241: + extends: .unittest + tags: + - tag241 + +test:rocm243: + extends: .unittest + tags: + - tag243 + +test:rocm244: + extends: .unittest + tags: + - tag244 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4f14eaa4b..7c9ed50c2 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -47,6 +47,8 @@ before_script: build:rocm: stage: build + tags: + - rocm variables: SUDO_CMD: "sudo -E" script: @@ -71,9 +73,7 @@ build:rocm: - build/rocprim*.zip expire_in: 2 weeks -test:rocm243: - tags: - - tag243 +.unittest: stage: test variables: SUDO_CMD: "sudo -E" @@ -83,20 +83,12 @@ test:rocm243: - cd build - $SUDO_CMD ctest --output-on-failure --repeat-until-fail 2 -test:rocm244: - tags: - - tag244 - stage: test - variables: - SUDO_CMD: "sudo -E" - dependencies: - - build:rocm - script: - - cd build - - $SUDO_CMD ctest --output-on-failure --repeat-until-fail 2 +include: '.gitlab-ci-gputest.yml' test:rocm_package: stage: test + tags: + - rocm variables: SUDO_CMD: "sudo -E" dependencies: @@ -112,6 +104,8 @@ test:rocm_package: test:rocm_install: stage: test + tags: + - rocm variables: SUDO_CMD: "sudo -E" script: diff --git a/CMakeLists.txt b/CMakeLists.txt index bdefb3fde..100bf1301 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,7 +26,7 @@ cmake_minimum_required(VERSION 3.5.1 FATAL_ERROR) set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories") # rocPRIM project -project(rocprim VERSION 1.0.1.0 LANGUAGES CXX) +project(rocprim VERSION 1.0.2.0 LANGUAGES CXX) # CMake modules list(APPEND CMAKE_MODULE_PATH @@ -62,13 +62,13 @@ option(ONLY_INSTALL "Only install" OFF) # Get dependencies include(cmake/Dependencies.cmake) +# AMD targets +set(AMDGPU_TARGETS gfx803;gfx900;gfx906 CACHE STRING "List of specific machine types for library to target") + # Print configuration summary include(cmake/Summary.cmake) print_configuration_summary() -# AMD targets -set(AMDGPU_TARGETS gfx803;gfx900 CACHE STRING "List of specific machine types for library to target") - # rocPRIM works only on hcc if(HIP_PLATFORM STREQUAL "hcc") # rocPRIM library @@ -118,12 +118,12 @@ if(HIP_PLATFORM STREQUAL "hcc") rocm_create_package( NAME rocprim DESCRIPTION "Radeon Open Compute Parallel Primitives Libary" - MAINTAINER "Jakub Szuppe " + MAINTAINER "Stream HPC Maintainers " ) else() rocm_create_package( NAME rocprim-hipcub DESCRIPTION "Radeon Open Compute Parallel Primitives Libary (hipCUB only)" - MAINTAINER "Jakub Szuppe " + MAINTAINER "Stream HPC Maintainers " ) endif() diff --git a/cmake/SetupNVCC.cmake b/cmake/SetupNVCC.cmake index 7ac8f011d..aea66331a 100644 --- a/cmake/SetupNVCC.cmake +++ b/cmake/SetupNVCC.cmake @@ -98,4 +98,5 @@ string(REPLACE " " ";" HIP_CPP_CONFIG_FLAGS ${HIP_CPP_CONFIG_FLAGS}) list(APPEND CUDA_NVCC_FLAGS "-std=c++11 ${HIP_CPP_CONFIG_FLAGS} ${HIP_NVCC_FLAGS}") # Ignore warnings about #pragma unroll -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas") +# and about deprecated CUDA function(s) used in hip/nvcc_detail/hip_runtime_api.h +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas -Wno-deprecated-declarations") diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 5751c1b00..45442f39d 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -31,6 +31,7 @@ function(print_configuration_summary) message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS_STRIP}") message(STATUS " Build type : ${CMAKE_BUILD_TYPE}") message(STATUS " Install prefix : ${CMAKE_INSTALL_PREFIX}") + message(STATUS " Device targets : ${AMDGPU_TARGETS}") message(STATUS "") message(STATUS " BUILD_TEST : ${BUILD_TEST}") message(STATUS " BUILD_BENCHMARK : ${BUILD_BENCHMARK}") diff --git a/hipcub/include/hipcub/rocprim/block/block_scan.hpp b/hipcub/include/hipcub/rocprim/block/block_scan.hpp index 3767397f2..3555308f5 100644 --- a/hipcub/include/hipcub/rocprim/block/block_scan.hpp +++ b/hipcub/include/hipcub/rocprim/block/block_scan.hpp @@ -190,13 +190,13 @@ class BlockScan HIPCUB_DEVICE inline void ExclusiveSum(T input, T& output) { - base_type::exclusive_scan(input, output, temp_storage_); + base_type::exclusive_scan(input, output, T(0), temp_storage_); } HIPCUB_DEVICE inline void ExclusiveSum(T input, T& output, T& block_aggregate) { - base_type::exclusive_scan(input, output, block_aggregate, temp_storage_); + base_type::exclusive_scan(input, output, T(0), block_aggregate, temp_storage_); } template @@ -212,7 +212,7 @@ class BlockScan HIPCUB_DEVICE inline void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD]) { - base_type::exclusive_scan(input, output, temp_storage_); + base_type::exclusive_scan(input, output, T(0), temp_storage_); } template @@ -220,7 +220,7 @@ class BlockScan void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], T& block_aggregate) { - base_type::exclusive_scan(input, output, block_aggregate, temp_storage_); + base_type::exclusive_scan(input, output, T(0), block_aggregate, temp_storage_); } template diff --git a/rocprim/include/rocprim/device/detail/device_merge_sort.hpp b/rocprim/include/rocprim/device/detail/device_merge_sort.hpp index 1f6fa1dca..b97b26f4d 100644 --- a/rocprim/include/rocprim/device/detail/device_merge_sort.hpp +++ b/rocprim/include/rocprim/device/detail/device_merge_sort.hpp @@ -359,11 +359,11 @@ void block_sort_kernel_impl(KeysInputIterator keys_input, ); block_sort_impl( - key[0], - value[0], - valid_in_last_block, - last_block, - compare_function + key[0], + value[0], + valid_in_last_block, + last_block, + compare_function ); block_store_impl( diff --git a/rocprim/include/rocprim/device/device_merge_sort_hc.hpp b/rocprim/include/rocprim/device/device_merge_sort_hc.hpp index 97705c5fa..772557389 100644 --- a/rocprim/include/rocprim/device/device_merge_sort_hc.hpp +++ b/rocprim/include/rocprim/device/device_merge_sort_hc.hpp @@ -132,7 +132,7 @@ void merge_sort_impl(void * temporary_storage, ROCPRIM_DETAIL_HC_SYNC("block_sort_kernel", size, start) bool temporary_store = false; - for(unsigned int block = block_size ; block < size; block *= 2) + for(unsigned int block = block_size; block < size; block *= 2) { temporary_store = !temporary_store; if(temporary_store) @@ -149,7 +149,7 @@ void merge_sort_impl(void * temporary_storage, ); } ); - ROCPRIM_DETAIL_HC_SYNC("block_merge_buffer_kernel", size, start) + ROCPRIM_DETAIL_HC_SYNC("block_merge_kernel", size, start) } else { diff --git a/rocprim/include/rocprim/device/device_merge_sort_hip.hpp b/rocprim/include/rocprim/device/device_merge_sort_hip.hpp index 0db57b985..978c44aed 100644 --- a/rocprim/include/rocprim/device/device_merge_sort_hip.hpp +++ b/rocprim/include/rocprim/device/device_merge_sort_hip.hpp @@ -176,7 +176,7 @@ hipError_t merge_sort_impl(void * temporary_storage, const unsigned int grid_size = number_of_blocks; hipLaunchKernelGGL( - HIP_KERNEL_NAME(detail::block_sort_kernel), + HIP_KERNEL_NAME(block_sort_kernel), dim3(grid_size), dim3(block_size), 0, stream, keys_input, keys_output, values_input, values_output, size, compare_function @@ -184,25 +184,25 @@ hipError_t merge_sort_impl(void * temporary_storage, ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_sort_kernel", size, start); bool temporary_store = false; - for(unsigned int block = block_size ; block < size; block *= 2) + for(unsigned int block = block_size; block < size; block *= 2) { temporary_store = !temporary_store; if(temporary_store) { if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL( - HIP_KERNEL_NAME(detail::block_merge_kernel), + HIP_KERNEL_NAME(block_merge_kernel), dim3(grid_size), dim3(block_size), 0, stream, keys_output, keys_buffer, values_output, values_buffer, size, block, compare_function ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_merge_buffer_kernel", size, start); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_merge_kernel", size, start); } else { if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL( - HIP_KERNEL_NAME(detail::block_merge_kernel), + HIP_KERNEL_NAME(block_merge_kernel), dim3(grid_size), dim3(block_size), 0, stream, keys_buffer, keys_output, values_buffer, values_output, size, block, compare_function diff --git a/rocprim/include/rocprim/device/device_scan_hc.hpp b/rocprim/include/rocprim/device/device_scan_hc.hpp index c9205d276..98752dd20 100644 --- a/rocprim/include/rocprim/device/device_scan_hc.hpp +++ b/rocprim/include/rocprim/device/device_scan_hc.hpp @@ -56,6 +56,7 @@ namespace detail template< bool Exclusive, + bool UseLoopback, class Config, class InputIterator, class OutputIterator, @@ -63,7 +64,7 @@ template< class BinaryFunction > inline -void scan_impl(void * temporary_storage, +auto scan_impl(void * temporary_storage, size_t& storage_size, InputIterator input, OutputIterator output, @@ -72,6 +73,7 @@ void scan_impl(void * temporary_storage, BinaryFunction scan_op, hc::accelerator_view acc_view, const bool debug_synchronous) + -> typename std::enable_if::type { using input_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; @@ -140,7 +142,7 @@ void scan_impl(void * temporary_storage, auto nested_temp_storage_size = storage_size - (number_of_blocks * sizeof(result_type)); if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - scan_impl( + scan_impl( nested_temp_storage, nested_temp_storage_size, block_prefixes, // input @@ -188,6 +190,7 @@ void scan_impl(void * temporary_storage, template< bool Exclusive, + bool UseLoopback, class Config, class InputIterator, class OutputIterator, @@ -195,15 +198,16 @@ template< class BinaryFunction > inline -void lookback_scan_impl(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - const InitValueType initial_value, - const size_t size, - BinaryFunction scan_op, - hc::accelerator_view acc_view, - const bool debug_synchronous) +auto scan_impl(void * temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + const InitValueType initial_value, + const size_t size, + BinaryFunction scan_op, + hc::accelerator_view acc_view, + const bool debug_synchronous) + -> typename std::enable_if::type { using input_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; @@ -404,24 +408,14 @@ void inclusive_scan(void * temporary_storage, >::type; // Lookback scan has problems with types that are not arithmetic - if(::rocprim::is_arithmetic::value) - { - return detail::lookback_scan_impl( - temporary_storage, storage_size, - // result_type() is a dummy initial value (not used) - input, output, result_type(), size, - scan_op, acc_view, debug_synchronous - ); - } - else - { - return detail::scan_impl( - temporary_storage, storage_size, - // result_type() is a dummy initial value (not used) - input, output, result_type(), size, - scan_op, acc_view, debug_synchronous - ); - } + // TODO: Investigate why the compiler never finishes linking if half is used + // Workaround: rocprim::is_arithmetic is replaced by std::is_arithmetic + detail::scan_impl::value, Config>( + temporary_storage, storage_size, + // result_type() is a dummy initial value (not used) + input, output, result_type(), size, + scan_op, acc_view, debug_synchronous + ); } /// \brief HC parallel exclusive scan primitive for device level. @@ -533,22 +527,13 @@ void exclusive_scan(void * temporary_storage, >::type; // Lookback scan has problems with types that are not arithmetic - if(::rocprim::is_arithmetic::value) - { - return detail::lookback_scan_impl( - temporary_storage, storage_size, - input, output, initial_value, size, - scan_op, acc_view, debug_synchronous - ); - } - else - { - return detail::scan_impl( - temporary_storage, storage_size, - input, output, initial_value, size, - scan_op, acc_view, debug_synchronous - ); - } + // TODO: Investigate why the compiler never finishes linking if half is used + // Workaround: rocprim::is_arithmetic is replaced by std::is_arithmetic + detail::scan_impl::value, Config>( + temporary_storage, storage_size, + input, output, initial_value, size, + scan_op, acc_view, debug_synchronous + ); } /// @} diff --git a/rocprim/include/rocprim/device/device_scan_hip.hpp b/rocprim/include/rocprim/device/device_scan_hip.hpp index b01c9f97b..d518c6703 100644 --- a/rocprim/include/rocprim/device/device_scan_hip.hpp +++ b/rocprim/include/rocprim/device/device_scan_hip.hpp @@ -170,6 +170,7 @@ void init_lookback_scan_state_kernel(LookBackScanState lookback_scan_state, template< bool Exclusive, + bool UseLoopback, class Config, class InputIterator, class OutputIterator, @@ -177,15 +178,16 @@ template< class BinaryFunction > inline -hipError_t scan_impl(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - const InitValueType initial_value, - const size_t size, - BinaryFunction scan_op, - const hipStream_t stream, - bool debug_synchronous) +auto scan_impl(void * temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + const InitValueType initial_value, + const size_t size, + BinaryFunction scan_op, + const hipStream_t stream, + bool debug_synchronous) + -> typename std::enable_if::type { using input_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; @@ -250,7 +252,7 @@ hipError_t scan_impl(void * temporary_storage, auto nested_temp_storage_size = storage_size - (number_of_blocks * sizeof(result_type)); if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - auto error = scan_impl( + auto error = scan_impl( nested_temp_storage, nested_temp_storage_size, block_prefixes, // input @@ -303,6 +305,7 @@ hipError_t scan_impl(void * temporary_storage, template< bool Exclusive, + bool UseLoopback, class Config, class InputIterator, class OutputIterator, @@ -310,15 +313,16 @@ template< class BinaryFunction > inline -hipError_t lookback_scan_impl(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - const InitValueType initial_value, - const size_t size, - BinaryFunction scan_op, - const hipStream_t stream, - bool debug_synchronous) +auto scan_impl(void * temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + const InitValueType initial_value, + const size_t size, + BinaryFunction scan_op, + const hipStream_t stream, + bool debug_synchronous) + -> typename std::enable_if::type { using input_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; @@ -515,24 +519,14 @@ hipError_t inclusive_scan(void * temporary_storage, >::type; // Lookback scan has problems with types that are not arithmetic - if(::rocprim::is_arithmetic::value) - { - return detail::lookback_scan_impl( - temporary_storage, storage_size, - // result_type() is a dummy initial value (not used) - input, output, result_type(), size, - scan_op, stream, debug_synchronous - ); - } - else - { - return detail::scan_impl( - temporary_storage, storage_size, - // result_type() is a dummy initial value (not used) - input, output, result_type(), size, - scan_op, stream, debug_synchronous - ); - } + // TODO: Investigate why the compiler never finishes linking if half is used + // Workaround: rocprim::is_arithmetic is replaced by std::is_arithmetic + return detail::scan_impl::value, Config>( + temporary_storage, storage_size, + // result_type() is a dummy initial value (not used) + input, output, result_type(), size, + scan_op, stream, debug_synchronous + ); } /// \brief HIP parallel exclusive scan primitive for device level. @@ -644,22 +638,13 @@ hipError_t exclusive_scan(void * temporary_storage, >::type; // Lookback scan has problems with types that are not arithmetic - if(::rocprim::is_arithmetic::value) - { - return detail::lookback_scan_impl( - temporary_storage, storage_size, - input, output, initial_value, size, - scan_op, stream, debug_synchronous - ); - } - else - { - return detail::scan_impl( - temporary_storage, storage_size, - input, output, initial_value, size, - scan_op, stream, debug_synchronous - ); - } + // TODO: Investigate why the compiler never finishes linking if half is used + // Workaround: rocprim::is_arithmetic is replaced by std::is_arithmetic + return detail::scan_impl::value, Config>( + temporary_storage, storage_size, + input, output, initial_value, size, + scan_op, stream, debug_synchronous + ); } /// @} diff --git a/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp b/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp index a9afb9e36..b048dc792 100644 --- a/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp +++ b/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp @@ -40,16 +40,19 @@ T warp_shuffle_op(T input, ShuffleOp&& op) { constexpr int words_no = (sizeof(T) + sizeof(int) - 1) / sizeof(int); - int * shfl_input = reinterpret_cast(&input); - int shfl_output_words[words_no]; - T * shfl_output = reinterpret_cast(shfl_output_words); + int words[words_no]; + __builtin_memcpy(words, &input, sizeof(T)); #pragma unroll for(int i = 0; i < words_no; i++) { - shfl_output_words[i] = op(shfl_input[i]); + words[i] = op(words[i]); } - return *shfl_output; + + T output; + __builtin_memcpy(&output, words, sizeof(T)); + + return output; } ROCPRIM_DEVICE @@ -63,19 +66,22 @@ T warp_move_dpp(T input, int dpp_ctrl, { constexpr int words_no = (sizeof(T) + sizeof(int) - 1) / sizeof(int); - int * int_input = reinterpret_cast(&input); - int int_output_words[words_no]; - T * int_output = reinterpret_cast(int_output_words); + int words[words_no]; + __builtin_memcpy(words, &input, sizeof(T)); #pragma unroll for(int i = 0; i < words_no; i++) { - int_output_words[i] = __amdgcn_update_dpp( - 0, int_input[i], + words[i] = __amdgcn_update_dpp( + 0, words[i], dpp_ctrl, row_mask, bank_mask, bound_ctrl ); } - return *int_output; + + T output; + __builtin_memcpy(&output, words, sizeof(T)); + + return output; } } // end namespace detail @@ -102,7 +108,7 @@ T warp_shuffle(T input, const int src_lane, const int width = warp_size()) input, [=](int v) -> int { - #if defined(ROCPRIM_HC_API) || defined(__HIP_PLATFORM_HCC__) + #if defined(ROCPRIM_HC_API) || (defined(__HIP_PLATFORM_HCC__) && !defined(__HIP__)) return hc::__shfl(v, src_lane, width); #else return __shfl(v, src_lane, width); @@ -131,7 +137,7 @@ T warp_shuffle_up(T input, const unsigned int delta, const int width = warp_size input, [=](int v) -> int { - #if defined(ROCPRIM_HC_API) || defined(__HIP_PLATFORM_HCC__) + #if defined(ROCPRIM_HC_API) || (defined(__HIP_PLATFORM_HCC__) && !defined(__HIP__)) return hc::__shfl_up(v, delta, width); #else return __shfl_up(v, delta, width); @@ -160,7 +166,7 @@ T warp_shuffle_down(T input, const unsigned int delta, const int width = warp_si input, [=](int v) -> int { - #if defined(ROCPRIM_HC_API) || defined(__HIP_PLATFORM_HCC__) + #if defined(ROCPRIM_HC_API) || (defined(__HIP_PLATFORM_HCC__) && !defined(__HIP__)) return hc::__shfl_down(v, delta, width); #else return __shfl_down(v, delta, width); @@ -188,7 +194,7 @@ T warp_shuffle_xor(T input, const int lane_mask, const int width = warp_size()) input, [=](int v) -> int { - #if defined(ROCPRIM_HC_API) || defined(__HIP_PLATFORM_HCC__) + #if defined(ROCPRIM_HC_API) || (defined(__HIP_PLATFORM_HCC__) && !defined(__HIP__)) return hc::__shfl_xor(v, lane_mask, width); #else return __shfl_xor(v, lane_mask, width); diff --git a/test/extra/CMakeLists.txt b/test/extra/CMakeLists.txt index 125a6f93d..0af44f594 100644 --- a/test/extra/CMakeLists.txt +++ b/test/extra/CMakeLists.txt @@ -79,7 +79,7 @@ set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror") # AMD targets -set(AMDGPU_TARGETS gfx803;gfx900 CACHE STRING "List of specific machine types for library to target") +set(AMDGPU_TARGETS gfx803;gfx900;gfx906 CACHE STRING "List of specific machine types for library to target") # Enable testing (ctest) enable_testing() @@ -148,4 +148,4 @@ if(HIP_PLATFORM STREQUAL "hcc") add_rocprim_test("test_rocprim_package" test_rocprim_package.cpp) endif() # hipCUB package test -add_hipcub_test("test_hipcub_package" test_hipcub_package.cpp) \ No newline at end of file +add_hipcub_test("test_hipcub_package" test_hipcub_package.cpp) diff --git a/test/hipcub/test_utils.hpp b/test/hipcub/test_utils.hpp index 6eb17c6df..b93a2161e 100644 --- a/test/hipcub/test_utils.hpp +++ b/test/hipcub/test_utils.hpp @@ -323,13 +323,13 @@ struct custom_test_type HIPCUB_HOST_DEVICE inline bool operator<(const custom_test_type& other) const { - return (x < other.x && y < other.y); + return (x < other.x || (x == other.x && y < other.y)); } HIPCUB_HOST_DEVICE inline bool operator>(const custom_test_type& other) const { - return (x > other.x && y > other.y); + return (x > other.x || (x == other.x && y > other.y)); } HIPCUB_HOST_DEVICE inline @@ -366,7 +366,7 @@ inline auto get_random_data(size_t size, typename T::value_type min, typename T: std::default_random_engine gen(rd()); std::uniform_int_distribution distribution(min, max); std::vector data(size); - std::generate(data.begin(), data.end(), [&]() { return distribution(gen); }); + std::generate(data.begin(), data.end(), [&]() { return T(distribution(gen), distribution(gen)); }); return data; } @@ -381,7 +381,7 @@ inline auto get_random_data(size_t size, typename T::value_type min, typename T: std::default_random_engine gen(rd()); std::uniform_real_distribution distribution(min, max); std::vector data(size); - std::generate(data.begin(), data.end(), [&]() { return T(distribution(gen)); }); + std::generate(data.begin(), data.end(), [&]() { return T(distribution(gen), distribution(gen)); }); return data; } diff --git a/test/rocprim/test_hc_block_discontinuity.cpp b/test/rocprim/test_hc_block_discontinuity.cpp index 1f7c379e9..08a218b2a 100644 --- a/test/rocprim/test_hc_block_discontinuity.cpp +++ b/test/rocprim/test_hc_block_discontinuity.cpp @@ -122,6 +122,7 @@ typedef ::testing::Types< params >, params >, params >, + params, // Non-power of 2 BlockSize params >, @@ -135,12 +136,14 @@ typedef ::testing::Types< params >, params, params >, + params, // Non-power of 2 BlockSize and ItemsPerThread > 1 params, params >, params >, - params > + params >, + params > Params; TYPED_TEST_CASE(RocprimBlockDiscontinuity, Params); diff --git a/test/rocprim/test_hc_block_exchange.cpp b/test/rocprim/test_hc_block_exchange.cpp index 66432517d..e96bd2448 100644 --- a/test/rocprim/test_hc_block_exchange.cpp +++ b/test/rocprim/test_hc_block_exchange.cpp @@ -68,20 +68,23 @@ typedef ::testing::Types< params, params, params, + params, // Power of 2 BlockSize and ItemsPerThread > 1 params, params, params, params, - params, + params, params, + params, // Non-power of 2 BlockSize and ItemsPerThread > 1 params, params, params, - params + params, + params > Params; TYPED_TEST_CASE(RocprimBlockExchangeTests, Params); @@ -147,10 +150,7 @@ TYPED_TEST(RocprimBlockExchangeTests, BlockedToStriped) ); d_output.synchronize(); - for(size_t i = 0; i < size; i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); } TYPED_TEST(RocprimBlockExchangeTests, StripedToBlocked) @@ -214,10 +214,7 @@ TYPED_TEST(RocprimBlockExchangeTests, StripedToBlocked) ); d_output.synchronize(); - for(size_t i = 0; i < size; i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); } TYPED_TEST(RocprimBlockExchangeTests, BlockedToWarpStriped) @@ -292,10 +289,7 @@ TYPED_TEST(RocprimBlockExchangeTests, BlockedToWarpStriped) ); d_output.synchronize(); - for(size_t i = 0; i < size; i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); } TYPED_TEST(RocprimBlockExchangeTests, WarpStripedToBlocked) @@ -370,10 +364,7 @@ TYPED_TEST(RocprimBlockExchangeTests, WarpStripedToBlocked) ); d_output.synchronize(); - for(size_t i = 0; i < size; i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); } TYPED_TEST(RocprimBlockExchangeTests, ScatterToBlocked) @@ -447,10 +438,7 @@ TYPED_TEST(RocprimBlockExchangeTests, ScatterToBlocked) ); d_output.synchronize(); - for(size_t i = 0; i < size; i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); } TYPED_TEST(RocprimBlockExchangeTests, ScatterToStriped) @@ -526,8 +514,5 @@ TYPED_TEST(RocprimBlockExchangeTests, ScatterToStriped) ); d_output.synchronize(); - for(size_t i = 0; i < size; i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); } diff --git a/test/rocprim/test_hc_block_load_store.cpp b/test/rocprim/test_hc_block_load_store.cpp index 6b89c41ac..c930baff0 100644 --- a/test/rocprim/test_hc_block_load_store.cpp +++ b/test/rocprim/test_hc_block_load_store.cpp @@ -82,34 +82,34 @@ typedef ::testing::Types< // block_load_direct class_params, - class_params, + class_params, class_params, - class_params, class_params, class_params, + rp::block_store_method::block_store_direct, 512U, 3>, class_params, - class_params, + class_params, class_params, - class_params, + class_params, class_params, class_params, + rp::block_store_method::block_store_direct, 512U, 2>, class_params, rp::block_load_method::block_load_direct, rp::block_store_method::block_store_direct, 64U, 1>, class_params, rp::block_load_method::block_load_direct, - rp::block_store_method::block_store_direct, 64U, 4>, + rp::block_store_method::block_store_direct, 64U, 5>, class_params, rp::block_load_method::block_load_direct, rp::block_store_method::block_store_direct, 256U, 1>, class_params, rp::block_load_method::block_load_direct, @@ -119,12 +119,12 @@ typedef ::testing::Types< class_params, class_params, - class_params, + class_params, class_params, - class_params, class_params, @@ -136,11 +136,11 @@ typedef ::testing::Types< class_params, class_params, + rp::block_store_method::block_store_vectorize, 256U, 8>, class_params, class_params, + rp::block_store_method::block_store_vectorize, 512U, 2>, class_params, rp::block_load_method::block_load_vectorize, rp::block_store_method::block_store_vectorize, 64U, 1>, @@ -155,20 +155,20 @@ typedef ::testing::Types< class_params, class_params, + rp::block_store_method::block_store_transpose, 64U, 9>, class_params, - class_params, class_params, - class_params, class_params, class_params, + rp::block_store_method::block_store_transpose, 64U, 7>, class_params, class_params, class_params, + rp::block_store_method::block_store_transpose, 512U, 3>, class_params, rp::block_load_method::block_load_transpose, rp::block_store_method::block_store_transpose, 64U, 1>, class_params, rp::block_load_method::block_load_transpose, - rp::block_store_method::block_store_transpose, 64U, 4>, + rp::block_store_method::block_store_transpose, 64U, 5>, class_params, rp::block_load_method::block_load_transpose, rp::block_store_method::block_store_transpose, 256U, 1>, class_params, rp::block_load_method::block_load_transpose, @@ -294,10 +294,7 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClass) d_input.synchronize(); d_output.synchronize(); - for(size_t i = 0; i < output.size(); i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); } TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassValid) @@ -355,10 +352,7 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassValid) d_input.synchronize(); d_output.synchronize(); - for(size_t i = 0; i < output.size(); i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); } TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassDefault) @@ -417,10 +411,7 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassDefault) d_input.synchronize(); d_output.synchronize(); - for(size_t i = 0; i < output.size(); i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); } TYPED_TEST(RocprimVectorizationTests, IsVectorizable) @@ -441,4 +432,3 @@ TYPED_TEST(RocprimVectorizationTests, MatchVectorType) bool input = std::is_same::value; ASSERT_TRUE(input); } - diff --git a/test/rocprim/test_hc_block_reduce.cpp b/test/rocprim/test_hc_block_reduce.cpp index d1f0e62c0..b2aabcf65 100644 --- a/test/rocprim/test_hc_block_reduce.cpp +++ b/test/rocprim/test_hc_block_reduce.cpp @@ -34,16 +34,24 @@ namespace rp = rocprim; +template +T apply(BinaryOp binary_op, const T& a, const T& b) +{ + return binary_op(a, b); +} + // Params for tests template< class T, unsigned int BlockSize = 256U, unsigned int ItemsPerThread = 1U, - rocprim::block_reduce_algorithm Algorithm = rocprim::block_reduce_algorithm::using_warp_reduce + rocprim::block_reduce_algorithm Algorithm = rocprim::block_reduce_algorithm::using_warp_reduce, + class BinaryOp = rocprim::plus > struct params { using type = T; + using binary_op_type = BinaryOp; static constexpr rocprim::block_reduce_algorithm algorithm = Algorithm; static constexpr unsigned int block_size = BlockSize; static constexpr unsigned int items_per_thread = ItemsPerThread; @@ -58,6 +66,7 @@ class RocprimBlockReduceSingleValueTests : public ::testing::Test { public: using type = typename Params::type; + using binary_op_type = typename Params::binary_op_type; static constexpr rocprim::block_reduce_algorithm algorithm = Params::algorithm; static constexpr unsigned int block_size = Params::block_size; }; @@ -77,6 +86,14 @@ typedef ::testing::Types< params, params, params, + // char tests + params, + params, + params, + // half tests + params, + params, + params, // uint tests params, params, @@ -94,6 +111,14 @@ typedef ::testing::Types< params, params, params, + params, + params, + params, + params, + params, + params, + params, + params, params, params, params, @@ -107,6 +132,7 @@ TYPED_TEST_CASE(RocprimBlockReduceSingleValueTests, SingleValueTestParams); TYPED_TEST(RocprimBlockReduceSingleValueTests, Reduce) { using T = typename TestFixture::type; + using binary_op_type = typename TestFixture::binary_op_type; constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; @@ -119,18 +145,19 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, Reduce) const size_t size = block_size * 113; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 50); std::vector output_reductions(size / block_size); // Calculate expected results on host std::vector expected_reductions(output_reductions.size(), 0); + binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { T value = 0; for(size_t j = 0; j < block_size; j++) { auto idx = i * block_size + j; - value += output[idx]; + value = apply(binary_op, value, output[idx]); } expected_reductions[i] = value; } @@ -147,7 +174,7 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, Reduce) T value = d_output[i]; //T reduction; rp::block_reduce breduce; - breduce.reduce(value, value); + breduce.reduce(value, value, binary_op_type()); //d_output[i] = value; if(i.local[0] == 0) { @@ -158,10 +185,7 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, Reduce) d_output.synchronize(); d_output_r.synchronize(); - for(size_t i = 0; i < output_reductions.size(); i++) - { - ASSERT_EQ(output_reductions[i], expected_reductions[i]); - } + test_utils::assert_eq(output_reductions, expected_reductions); } TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceMultiplies) @@ -170,6 +194,12 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceMultiplies) constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; + // Half not tested here + if(std::is_same::value) + { + return; + } + hc::accelerator acc; // Given block size not supported if(block_size > test_utils::get_max_tile_size(acc)) @@ -223,15 +253,13 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceMultiplies) d_output.synchronize(); d_output_r.synchronize(); - for(size_t i = 0; i < output_reductions.size(); i++) - { - ASSERT_EQ(output_reductions[i], expected_reductions[i]); - } + test_utils::assert_eq(output_reductions, expected_reductions); } TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) { using T = typename TestFixture::type; + using binary_op_type = typename TestFixture::binary_op_type; constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; const unsigned int valid_items = test_utils::get_random_value(block_size - 10, block_size); @@ -245,18 +273,19 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) const size_t size = block_size * 113; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 50); std::vector output_reductions(size / block_size); // Calculate expected results on host std::vector expected_reductions(output_reductions.size(), 0); + binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { T value = 0; for(size_t j = 0; j < valid_items; j++) { auto idx = i * block_size + j; - value += output[idx]; + value = apply(binary_op, value, output[idx]); } expected_reductions[i] = value; } @@ -273,7 +302,7 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) T value = d_output[i]; //T reduction; rp::block_reduce breduce; - breduce.reduce(value, value, valid_items); + breduce.reduce(value, value, valid_items, binary_op_type()); //d_output[i] = value; if(i.local[0] == 0) { @@ -284,10 +313,7 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) d_output.synchronize(); d_output_r.synchronize(); - for(size_t i = 0; i < output_reductions.size(); i++) - { - ASSERT_EQ(output_reductions[i], expected_reductions[i]); - } + test_utils::assert_eq(output_reductions, expected_reductions); } template @@ -295,6 +321,7 @@ class RocprimBlockReduceInputArrayTests : public ::testing::Test { public: using type = typename Params::type; + using binary_op_type = typename Params::binary_op_type; static constexpr unsigned int block_size = Params::block_size; static constexpr rocprim::block_reduce_algorithm algorithm = Params::algorithm; static constexpr unsigned int items_per_thread = Params::items_per_thread; @@ -313,6 +340,12 @@ typedef ::testing::Types< params, params, params, + params, + params, + params, + params, + params, + params, // ----------------------------------------------------------------------- // rocprim::block_reduce_algorithm::raking_reduce // ----------------------------------------------------------------------- @@ -324,7 +357,13 @@ typedef ::testing::Types< params, params, params, - params + params, + params, + params, + params, + params, + params, + params > InputArrayTestParams; TYPED_TEST_CASE(RocprimBlockReduceInputArrayTests, InputArrayTestParams); @@ -332,6 +371,7 @@ TYPED_TEST_CASE(RocprimBlockReduceInputArrayTests, InputArrayTestParams); TYPED_TEST(RocprimBlockReduceInputArrayTests, Reduce) { using T = typename TestFixture::type; + using binary_op_type = typename TestFixture::binary_op_type; constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; constexpr size_t items_per_thread = TestFixture::items_per_thread; @@ -346,20 +386,21 @@ TYPED_TEST(RocprimBlockReduceInputArrayTests, Reduce) const size_t items_per_block = block_size * items_per_thread; const size_t size = items_per_block * 37; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 50); // Output reduce results std::vector output_reductions(size / block_size); // Calculate expected results on host std::vector expected_reductions(output_reductions.size(), 0); + binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { T value = 0; for(size_t j = 0; j < items_per_block; j++) { auto idx = i * items_per_block + j; - value += output[idx]; + value = apply(binary_op, value, output[idx]); } expected_reductions[i] = value; } @@ -386,7 +427,7 @@ TYPED_TEST(RocprimBlockReduceInputArrayTests, Reduce) rp::block_reduce breduce; T reduction; - breduce.reduce(in_out, reduction); + breduce.reduce(in_out, reduction, binary_op_type()); if(i.local[0] == 0) { @@ -397,11 +438,5 @@ TYPED_TEST(RocprimBlockReduceInputArrayTests, Reduce) d_output.synchronize(); d_output_r.synchronize(); - for(size_t i = 0; i < output_reductions.size(); i++) - { - ASSERT_NEAR( - output_reductions[i], expected_reductions[i], - static_cast(0.05) * expected_reductions[i] - ); - } + test_utils::assert_near(output_reductions, expected_reductions, 0.05); } diff --git a/test/rocprim/test_hc_device_merge_sort.cpp b/test/rocprim/test_hc_device_merge_sort.cpp index e8a6757fb..26fc4752f 100644 --- a/test/rocprim/test_hc_device_merge_sort.cpp +++ b/test/rocprim/test_hc_device_merge_sort.cpp @@ -62,9 +62,13 @@ class RocprimDeviceSortTests : public ::testing::Test typedef ::testing::Types< DeviceSortParams, + DeviceSortParams>, DeviceSortParams, - DeviceSortParams, - DeviceSortParams + DeviceSortParams, + DeviceSortParams, + DeviceSortParams>, + DeviceSortParams>, + DeviceSortParams> > RocprimDeviceSortTestsParams; std::vector get_sizes() @@ -74,7 +78,7 @@ std::vector get_sizes() 1024, 2048, 5096, 34567, (1 << 17) - 1220 }; - const std::vector random_sizes = test_utils::get_random_data(2, 1, 16384); + const std::vector random_sizes = test_utils::get_random_data(5, 1, 100000); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); std::sort(sizes.begin(), sizes.end()); return sizes; @@ -138,9 +142,7 @@ TYPED_TEST(RocprimDeviceSortTests, SortKey) std::vector output = d_output; for(size_t i = 0; i < output.size(); i++) { - auto diff = std::max(std::abs(0.01f * expected[i]), key_type(0.01f)); - if(std::is_integral::value) diff = 0; - ASSERT_NEAR(output[i], expected[i], diff); + ASSERT_EQ(output[i], expected[i]) << "where index = " << i; } } } @@ -220,8 +222,8 @@ TYPED_TEST(RocprimDeviceSortTests, SortKeyValue) std::vector values_output = d_values_output; for(size_t i = 0; i < keys_output.size(); i++) { - ASSERT_EQ(keys_output[i], expected[i].first); - ASSERT_EQ(values_output[i], expected[i].second); + ASSERT_EQ(keys_output[i], expected[i].first) << "where index = " << i; + ASSERT_EQ(values_output[i], expected[i].second) << "where index = " << i; } } } diff --git a/test/rocprim/test_hc_device_reduce_by_key.cpp b/test/rocprim/test_hc_device_reduce_by_key.cpp index b99ae1bff..beecfbc7e 100644 --- a/test/rocprim/test_hc_device_reduce_by_key.cpp +++ b/test/rocprim/test_hc_device_reduce_by_key.cpp @@ -95,10 +95,12 @@ typedef ::testing::Types< params, 1, 10000>, params, 1, 10>, params, 1, 30>, + params, params, 20, 100>, params, 100, 400, long long, custom_key_compare_op1>, - params, 200, 600>, + params, 200, 600>, params, 100, 2000, double, custom_key_compare_op1>, + params, params, 1000, 5000>, params, 2048, 2048>, params, 1000, 10000, long long>, @@ -238,14 +240,7 @@ TYPED_TEST(RocprimDeviceReduceByKey, ReduceByKey) ASSERT_EQ(unique_count_output[0], unique_count_expected); - for(size_t i = 0; i < unique_count_expected; i++) - { - ASSERT_EQ(unique_output[i], unique_expected[i]); - } - - for(size_t i = 0; i < unique_count_expected; i++) - { - ASSERT_EQ(aggregates_output[i], aggregates_expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(unique_output, unique_expected)); + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(aggregates_output, aggregates_expected)); } } diff --git a/test/rocprim/test_hc_device_scan.cpp b/test/rocprim/test_hc_device_scan.cpp index f05cd876d..90bc74018 100644 --- a/test/rocprim/test_hc_device_scan.cpp +++ b/test/rocprim/test_hc_device_scan.cpp @@ -39,12 +39,14 @@ namespace rp = rocprim; // Params for tests template< class InputType, - class OutputType = InputType + class OutputType = InputType, + class ScanOp = ::rocprim::plus > struct DeviceScanParams { using input_type = InputType; using output_type = OutputType; + using scan_op_type = ScanOp; }; // --------------------------------------------------------- @@ -57,24 +59,33 @@ class RocprimDeviceScanTests : public ::testing::Test public: using input_type = typename Params::input_type; using output_type = typename Params::output_type; + using scan_op_type = typename Params::scan_op_type; const bool debug_synchronous = false; }; typedef ::testing::Types< - DeviceScanParams, - DeviceScanParams, - DeviceScanParams, test_utils::custom_test_type> - // DeviceScanParams + DeviceScanParams, + DeviceScanParams, + DeviceScanParams >, + DeviceScanParams, + DeviceScanParams >, + DeviceScanParams >, + DeviceScanParams< + test_utils::custom_test_type, test_utils::custom_test_type, + rp::plus > + >, + DeviceScanParams, + DeviceScanParams > RocprimDeviceScanTestsParams; std::vector get_sizes() { std::vector sizes = { - 2, 32, 32, 32, 65, 378, - 1512, 3048, 4096, - 27845, (1 << 18) + 1111 + 1, 10, 53, 211, + 1024, 2048, 5096, + 34567, (1 << 18) }; - const std::vector random_sizes = test_utils::get_random_data(2, 1, 16384); + const std::vector random_sizes = test_utils::get_random_data(3, 1, 100000); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); std::sort(sizes.begin(), sizes.end()); return sizes; @@ -82,10 +93,11 @@ std::vector get_sizes() TYPED_TEST_CASE(RocprimDeviceScanTests, RocprimDeviceScanTestsParams); -TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSumEmptyInput) +TYPED_TEST(RocprimDeviceScanTests, InclusiveScanEmptyInput) { using T = typename TestFixture::input_type; using U = typename TestFixture::output_type; + using scan_op_type = typename TestFixture::scan_op_type; const bool debug_synchronous = TestFixture::debug_synchronous; hc::accelerator acc; @@ -107,7 +119,7 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSumEmptyInput) nullptr, temp_storage_size_bytes, rocprim::make_constant_iterator(345), d_checking_output, - 0, ::rocprim::plus(), acc_view, debug_synchronous + 0, scan_op_type(), acc_view, debug_synchronous ); // allocate temporary storage @@ -119,17 +131,18 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSumEmptyInput) d_temp_storage.accelerator_pointer(), temp_storage_size_bytes, rocprim::make_constant_iterator(345), d_checking_output, - 0, ::rocprim::plus(), acc_view, debug_synchronous + 0, scan_op_type(), acc_view, debug_synchronous ); acc_view.wait(); ASSERT_FALSE(out_of_bounds.get()); } -TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSum) +TYPED_TEST(RocprimDeviceScanTests, InclusiveScan) { using T = typename TestFixture::input_type; using U = typename TestFixture::output_type; + using scan_op_type = typename TestFixture::scan_op_type; const bool debug_synchronous = TestFixture::debug_synchronous; hc::accelerator acc; @@ -148,12 +161,12 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSum) acc_view.wait(); // scan function - ::rocprim::plus plus_op; + scan_op_type scan_op; // Calculate expected results on host std::vector expected(input.size()); test_utils::host_inclusive_scan( - input.begin(), input.end(), expected.begin(), plus_op + input.begin(), input.end(), expected.begin(), scan_op ); // temp storage @@ -165,7 +178,7 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSum) d_input.accelerator_pointer(), d_output.accelerator_pointer(), input.size(), - plus_op, + scan_op, acc_view, debug_synchronous ); @@ -185,7 +198,7 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSum) d_input.accelerator_pointer(), d_output.accelerator_pointer(), input.size(), - plus_op, + scan_op, acc_view, debug_synchronous ); @@ -197,10 +210,11 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSum) } } -TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanSum) +TYPED_TEST(RocprimDeviceScanTests, ExclusiveScan) { using T = typename TestFixture::input_type; using U = typename TestFixture::output_type; + using scan_op_type = typename TestFixture::scan_op_type; const bool debug_synchronous = TestFixture::debug_synchronous; hc::accelerator acc; @@ -219,14 +233,14 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanSum) acc_view.wait(); // scan function - ::rocprim::plus plus_op; + scan_op_type scan_op; // Calculate expected results on host std::vector expected(input.size(), 0); T initial_value = test_utils::get_random_value(1, 100); test_utils::host_exclusive_scan( input.begin(), input.end(), - initial_value, expected.begin(), plus_op + initial_value, expected.begin(), scan_op ); // temp storage @@ -238,7 +252,7 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanSum) d_output.accelerator_pointer(), initial_value, input.size(), - plus_op, + scan_op, acc_view, debug_synchronous ); @@ -259,7 +273,7 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanSum) d_output.accelerator_pointer(), initial_value, input.size(), - plus_op, + scan_op, acc_view, debug_synchronous ); @@ -276,6 +290,7 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanByKey) using T = typename TestFixture::input_type; using K = unsigned int; // key type using U = typename TestFixture::output_type; + using scan_op_type = typename TestFixture::scan_op_type; const bool debug_synchronous = TestFixture::debug_synchronous; hc::accelerator acc; @@ -297,7 +312,7 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanByKey) acc_view.wait(); // scan function - rocprim::plus scan_op; + scan_op_type scan_op; // key compare function rocprim::equal_to keys_compare_op; @@ -378,6 +393,7 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanByKey) using T = typename TestFixture::input_type; using K = unsigned int; // key type using U = typename TestFixture::output_type; + using scan_op_type = typename TestFixture::scan_op_type; const bool debug_synchronous = TestFixture::debug_synchronous; hc::accelerator acc; @@ -400,7 +416,7 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanByKey) acc_view.wait(); // scan function - rocprim::plus scan_op; + scan_op_type scan_op; // key compare function rocprim::equal_to keys_compare_op; diff --git a/test/rocprim/test_hc_device_segmented_reduce.cpp b/test/rocprim/test_hc_device_segmented_reduce.cpp index bb3d9f980..50c1d44b4 100644 --- a/test/rocprim/test_hc_device_segmented_reduce.cpp +++ b/test/rocprim/test_hc_device_segmented_reduce.cpp @@ -71,9 +71,13 @@ typedef ::testing::Types< params>, params, -100, 0, 10000>, params, 1000, 0, 10000>, + params, 0, 0, 2000>, params, 10, 1000, 10000>, params, 50, 2, 10>, - params, 123, 100, 200> + params, 123, 100, 200>, + params, 10, 3000, 4000>, + params, 0, 10, 300>, + params > Params; TYPED_TEST_CASE(RocprimDeviceSegmentedReduce, Params); diff --git a/test/rocprim/test_hc_device_segmented_scan.cpp b/test/rocprim/test_hc_device_segmented_scan.cpp index 5ec6f1958..697e3b425 100644 --- a/test/rocprim/test_hc_device_segmented_scan.cpp +++ b/test/rocprim/test_hc_device_segmented_scan.cpp @@ -37,6 +37,8 @@ #include "test_utils.hpp" +namespace rp = rocprim; + template< class Input, class Output, @@ -71,7 +73,10 @@ typedef ::testing::Types< params, 1000, 0, 10000>, params, 10, 1000, 10000>, params, 50, 2, 10>, - params, 123, 100, 200> + params, 123, 100, 200>, + params, 10, 3000, 4000>, + params, 0, 10, 300>, + params > Params; TYPED_TEST_CASE(RocprimDeviceSegmentedScan, Params); diff --git a/test/rocprim/test_hip_block_discontinuity.cpp b/test/rocprim/test_hip_block_discontinuity.cpp index a5457bf08..414599d5c 100644 --- a/test/rocprim/test_hip_block_discontinuity.cpp +++ b/test/rocprim/test_hip_block_discontinuity.cpp @@ -123,6 +123,7 @@ typedef ::testing::Types< params >, params >, params >, + params, // Non-power of 2 BlockSize params >, @@ -136,12 +137,14 @@ typedef ::testing::Types< params >, params, params >, + params, // Non-power of 2 BlockSize and ItemsPerThread > 1 params, params >, params >, - params > + params >, + params > Params; TYPED_TEST_CASE(RocprimBlockDiscontinuity, Params); @@ -569,4 +572,3 @@ TYPED_TEST(RocprimBlockDiscontinuity, FlagHeadsAndTails) HIP_CHECK(hipFree(device_heads)); HIP_CHECK(hipFree(device_tails)); } - diff --git a/test/rocprim/test_hip_block_exchange.cpp b/test/rocprim/test_hip_block_exchange.cpp index eb4ef6e76..07f844f3d 100644 --- a/test/rocprim/test_hip_block_exchange.cpp +++ b/test/rocprim/test_hip_block_exchange.cpp @@ -69,20 +69,23 @@ typedef ::testing::Types< params, params, params, + params, // Power of 2 BlockSize and ItemsPerThread > 1 params, params, params, params, - params, + params, params, + params, // Non-power of 2 BlockSize and ItemsPerThread > 1 params, params, params, - params + params, + params > Params; TYPED_TEST_CASE(RocprimBlockExchangeTests, Params); @@ -180,10 +183,7 @@ TYPED_TEST(RocprimBlockExchangeTests, BlockedToStriped) ) ); - for(size_t i = 0; i < size; i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); HIP_CHECK(hipFree(device_input)); HIP_CHECK(hipFree(device_output)); @@ -282,10 +282,7 @@ TYPED_TEST(RocprimBlockExchangeTests, StripedToBlocked) ) ); - for(size_t i = 0; i < size; i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); HIP_CHECK(hipFree(device_input)); HIP_CHECK(hipFree(device_output)); @@ -397,10 +394,7 @@ TYPED_TEST(RocprimBlockExchangeTests, BlockedToWarpStriped) ) ); - for(size_t i = 0; i < size; i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); HIP_CHECK(hipFree(device_input)); HIP_CHECK(hipFree(device_output)); @@ -510,10 +504,7 @@ TYPED_TEST(RocprimBlockExchangeTests, WarpStripedToBlocked) ) ); - for(size_t i = 0; i < size; i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); HIP_CHECK(hipFree(device_input)); HIP_CHECK(hipFree(device_output)); @@ -631,10 +622,7 @@ TYPED_TEST(RocprimBlockExchangeTests, ScatterToBlocked) ) ); - for(size_t i = 0; i < size; i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); HIP_CHECK(hipFree(device_input)); HIP_CHECK(hipFree(device_output)); @@ -755,14 +743,9 @@ TYPED_TEST(RocprimBlockExchangeTests, ScatterToStriped) ) ); - for(size_t i = 0; i < size; i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); HIP_CHECK(hipFree(device_input)); HIP_CHECK(hipFree(device_output)); HIP_CHECK(hipFree(device_ranks)); - } - diff --git a/test/rocprim/test_hip_block_load_store.cpp b/test/rocprim/test_hip_block_load_store.cpp index 6dd1c5e3b..4a16d1171 100644 --- a/test/rocprim/test_hip_block_load_store.cpp +++ b/test/rocprim/test_hip_block_load_store.cpp @@ -85,34 +85,34 @@ typedef ::testing::Types< // block_load_direct class_params, - class_params, + class_params, class_params, - class_params, class_params, class_params, + rp::block_store_method::block_store_direct, 512U, 3>, class_params, - class_params, + class_params, class_params, - class_params, + class_params, class_params, class_params, + rp::block_store_method::block_store_direct, 512U, 2>, class_params, rp::block_load_method::block_load_direct, rp::block_store_method::block_store_direct, 64U, 1>, class_params, rp::block_load_method::block_load_direct, - rp::block_store_method::block_store_direct, 64U, 4>, + rp::block_store_method::block_store_direct, 64U, 5>, class_params, rp::block_load_method::block_load_direct, rp::block_store_method::block_store_direct, 256U, 1>, class_params, rp::block_load_method::block_load_direct, @@ -122,12 +122,12 @@ typedef ::testing::Types< class_params, class_params, - class_params, + class_params, class_params, - class_params, class_params, @@ -139,11 +139,11 @@ typedef ::testing::Types< class_params, class_params, + rp::block_store_method::block_store_vectorize, 256U, 8>, class_params, class_params, + rp::block_store_method::block_store_vectorize, 512U, 2>, class_params, rp::block_load_method::block_load_vectorize, rp::block_store_method::block_store_vectorize, 64U, 1>, @@ -158,20 +158,20 @@ typedef ::testing::Types< class_params, class_params, + rp::block_store_method::block_store_transpose, 64U, 9>, class_params, - class_params, class_params, - class_params, class_params, class_params, + rp::block_store_method::block_store_transpose, 64U, 7>, class_params, class_params, class_params, + rp::block_store_method::block_store_transpose, 512U, 3>, class_params, rp::block_load_method::block_load_transpose, rp::block_store_method::block_store_transpose, 64U, 1>, class_params, rp::block_load_method::block_load_transpose, - rp::block_store_method::block_store_transpose, 64U, 4>, + rp::block_store_method::block_store_transpose, 64U, 5>, class_params, rp::block_load_method::block_load_transpose, rp::block_store_method::block_store_transpose, 256U, 1>, class_params, rp::block_load_method::block_load_transpose, @@ -331,10 +331,7 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClass) ); // Validating results - for(size_t i = 0; i < output.size(); i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); HIP_CHECK(hipFree(device_input)); HIP_CHECK(hipFree(device_output)); @@ -438,10 +435,7 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassValid) ); // Validating results - for(size_t i = 0; i < output.size(); i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); HIP_CHECK(hipFree(device_input)); HIP_CHECK(hipFree(device_output)); @@ -537,10 +531,7 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassDefault) ); // Validating results - for(size_t i = 0; i < output.size(); i++) - { - ASSERT_EQ(output[i], expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); HIP_CHECK(hipFree(device_input)); HIP_CHECK(hipFree(device_output)); @@ -564,5 +555,3 @@ TYPED_TEST(RocprimVectorizationTests, MatchVectorType) bool input = std::is_same::value; ASSERT_TRUE(input); } - - diff --git a/test/rocprim/test_hip_block_reduce.cpp b/test/rocprim/test_hip_block_reduce.cpp index 83ae7e363..596685241 100644 --- a/test/rocprim/test_hip_block_reduce.cpp +++ b/test/rocprim/test_hip_block_reduce.cpp @@ -34,17 +34,25 @@ namespace rp = rocprim; +template +T apply(BinaryOp binary_op, const T& a, const T& b) +{ + return binary_op(a, b); +} + // Params for tests template< class T, unsigned int BlockSize = 256U, unsigned int ItemsPerThread = 1U, - rocprim::block_reduce_algorithm Algorithm = rocprim::block_reduce_algorithm::using_warp_reduce + rp::block_reduce_algorithm Algorithm = rp::block_reduce_algorithm::using_warp_reduce, + class BinaryOp = rocprim::plus > struct params { using type = T; - static constexpr rocprim::block_reduce_algorithm algorithm = Algorithm; + using binary_op_type = BinaryOp; + static constexpr rp::block_reduce_algorithm algorithm = Algorithm; static constexpr unsigned int block_size = BlockSize; static constexpr unsigned int items_per_thread = ItemsPerThread; }; @@ -58,7 +66,8 @@ class RocprimBlockReduceSingleValueTests : public ::testing::Test { public: using type = typename Params::type; - static constexpr rocprim::block_reduce_algorithm algorithm = Params::algorithm; + using binary_op_type = typename Params::binary_op_type; + static constexpr rp::block_reduce_algorithm algorithm = Params::algorithm; static constexpr unsigned int block_size = Params::block_size; }; @@ -81,6 +90,14 @@ typedef ::testing::Types< params, params, params, + // char tests + params, + params, + params, + // half tests + params, + params, + params, // long tests params, params, @@ -88,18 +105,26 @@ typedef ::testing::Types< // ----------------------------------------------------------------------- // rocprim::block_reduce_algorithm::raking_reduce // ----------------------------------------------------------------------- - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params > SingleValueTestParams; TYPED_TEST_CASE(RocprimBlockReduceSingleValueTests, SingleValueTestParams); @@ -107,7 +132,8 @@ TYPED_TEST_CASE(RocprimBlockReduceSingleValueTests, SingleValueTestParams); template< unsigned int BlockSize, rocprim::block_reduce_algorithm Algorithm, - class T + class T, + class BinaryOp > __global__ void reduce_kernel(T* device_output, T* device_output_reductions) @@ -115,7 +141,7 @@ void reduce_kernel(T* device_output, T* device_output_reductions) const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; T value = device_output[index]; rp::block_reduce breduce; - breduce.reduce(value, value); + breduce.reduce(value, value, BinaryOp()); if(hipThreadIdx_x == 0) { device_output_reductions[hipBlockIdx_x] = value; @@ -125,6 +151,7 @@ void reduce_kernel(T* device_output, T* device_output_reductions) TYPED_TEST(RocprimBlockReduceSingleValueTests, Reduce) { using T = typename TestFixture::type; + using binary_op_type = typename TestFixture::binary_op_type; constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; @@ -137,18 +164,19 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, Reduce) const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 50); std::vector output_reductions(size / block_size); // Calculate expected results on host std::vector expected_reductions(output_reductions.size(), 0); + binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { T value = 0; for(size_t j = 0; j < block_size; j++) { auto idx = i * block_size + j; - value += output[idx]; + value = apply(binary_op, value, output[idx]); } expected_reductions[i] = value; } @@ -169,7 +197,7 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, Reduce) // Running kernel hipLaunchKernelGGL( - HIP_KERNEL_NAME(reduce_kernel), + HIP_KERNEL_NAME(reduce_kernel), dim3(grid_size), dim3(block_size), 0, 0, device_output, device_output_reductions ); @@ -184,17 +212,12 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, Reduce) ); // Verifying results - for(size_t i = 0; i < output_reductions.size(); i++) - { - ASSERT_EQ(output_reductions[i], expected_reductions[i]); - } + test_utils::assert_eq(output_reductions, expected_reductions); HIP_CHECK(hipFree(device_output)); HIP_CHECK(hipFree(device_output_reductions)); } -TYPED_TEST_CASE(RocprimBlockReduceSingleValueTests, SingleValueTestParams); - template< unsigned int BlockSize, rocprim::block_reduce_algorithm Algorithm, @@ -219,6 +242,12 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceMultiplies) constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; + // Half not tested here + if(std::is_same::value) + { + return; + } + // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -280,19 +309,19 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceMultiplies) ); // Verifying results - for(size_t i = 0; i < output_reductions.size(); i++) - { - ASSERT_EQ(output_reductions[i], expected_reductions[i]); - } + test_utils::assert_eq(output_reductions, expected_reductions); HIP_CHECK(hipFree(device_output)); HIP_CHECK(hipFree(device_output_reductions)); } +TYPED_TEST_CASE(RocprimBlockReduceSingleValueTests, SingleValueTestParams); + template< unsigned int BlockSize, rocprim::block_reduce_algorithm Algorithm, - class T + class T, + class BinaryOp > __global__ void reduce_valid_kernel(T* device_output, T* device_output_reductions, const unsigned int valid_items) @@ -300,7 +329,7 @@ void reduce_valid_kernel(T* device_output, T* device_output_reductions, const un const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; T value = device_output[index]; rp::block_reduce breduce; - breduce.reduce(value, value, valid_items); + breduce.reduce(value, value, valid_items, BinaryOp()); if(hipThreadIdx_x == 0) { device_output_reductions[hipBlockIdx_x] = value; @@ -310,6 +339,7 @@ void reduce_valid_kernel(T* device_output, T* device_output_reductions, const un TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) { using T = typename TestFixture::type; + using binary_op_type = typename TestFixture::binary_op_type; constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; const unsigned int valid_items = test_utils::get_random_value(block_size - 10, block_size); @@ -323,18 +353,19 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 50); std::vector output_reductions(size / block_size); // Calculate expected results on host std::vector expected_reductions(output_reductions.size(), 0); + binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { T value = 0; for(size_t j = 0; j < valid_items; j++) { auto idx = i * block_size + j; - value += output[idx]; + value = apply(binary_op, value, output[idx]); } expected_reductions[i] = value; } @@ -355,7 +386,7 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) // Running kernel hipLaunchKernelGGL( - HIP_KERNEL_NAME(reduce_valid_kernel), + HIP_KERNEL_NAME(reduce_valid_kernel), dim3(grid_size), dim3(block_size), 0, 0, device_output, device_output_reductions, valid_items ); @@ -370,10 +401,7 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) ); // Verifying results - for(size_t i = 0; i < output_reductions.size(); i++) - { - ASSERT_EQ(output_reductions[i], expected_reductions[i]); - } + test_utils::assert_eq(output_reductions, expected_reductions); HIP_CHECK(hipFree(device_output)); HIP_CHECK(hipFree(device_output_reductions)); @@ -385,6 +413,7 @@ class RocprimBlockReduceInputArrayTests : public ::testing::Test { public: using type = typename Params::type; + using binary_op_type = typename Params::binary_op_type; static constexpr unsigned int block_size = Params::block_size; static constexpr rocprim::block_reduce_algorithm algorithm = Params::algorithm; static constexpr unsigned int items_per_thread = Params::items_per_thread; @@ -403,18 +432,30 @@ typedef ::testing::Types< params, params, params, + params, + params, + params, + params, + params, + params, // ----------------------------------------------------------------------- // rocprim::block_reduce_algorithm::raking_reduce // ----------------------------------------------------------------------- - params, - params, - params, - params, - params, - params, - params, - params, - params + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params > InputArrayTestParams; TYPED_TEST_CASE(RocprimBlockReduceInputArrayTests, InputArrayTestParams); @@ -423,7 +464,8 @@ template< unsigned int BlockSize, unsigned int ItemsPerThread, rocprim::block_reduce_algorithm Algorithm, - class T + class T, + class BinaryOp > __global__ void reduce_array_kernel(T* device_output, T* device_output_reductions) @@ -438,7 +480,7 @@ void reduce_array_kernel(T* device_output, T* device_output_reductions) rp::block_reduce breduce; T reduction; - breduce.reduce(in_out, reduction); + breduce.reduce(in_out, reduction, BinaryOp()); if(hipThreadIdx_x == 0) { @@ -449,6 +491,7 @@ void reduce_array_kernel(T* device_output, T* device_output_reductions) TYPED_TEST(RocprimBlockReduceInputArrayTests, Reduce) { using T = typename TestFixture::type; + using binary_op_type = typename TestFixture::binary_op_type; constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; constexpr size_t items_per_thread = TestFixture::items_per_thread; @@ -463,20 +506,21 @@ TYPED_TEST(RocprimBlockReduceInputArrayTests, Reduce) const size_t size = items_per_block * 37; const size_t grid_size = size / items_per_block; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 50); // Output reduce results std::vector output_reductions(size / block_size, 0); // Calculate expected results on host std::vector expected_reductions(output_reductions.size(), 0); + binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { T value = 0; for(size_t j = 0; j < items_per_block; j++) { auto idx = i * items_per_block + j; - value += output[idx]; + value = apply(binary_op, value, output[idx]); } expected_reductions[i] = value; } @@ -505,7 +549,7 @@ TYPED_TEST(RocprimBlockReduceInputArrayTests, Reduce) // Running kernel hipLaunchKernelGGL( - HIP_KERNEL_NAME(reduce_array_kernel), + HIP_KERNEL_NAME(reduce_array_kernel), dim3(grid_size), dim3(block_size), 0, 0, device_output, device_output_reductions ); @@ -520,13 +564,7 @@ TYPED_TEST(RocprimBlockReduceInputArrayTests, Reduce) ); // Verifying results - for(size_t i = 0; i < output_reductions.size(); i++) - { - ASSERT_NEAR( - output_reductions[i], expected_reductions[i], - static_cast(0.05) * expected_reductions[i] - ); - } + test_utils::assert_near(output_reductions, expected_reductions, 0.05); HIP_CHECK(hipFree(device_output)); HIP_CHECK(hipFree(device_output_reductions)); diff --git a/test/rocprim/test_hip_block_scan.cpp b/test/rocprim/test_hip_block_scan.cpp index 3bf1857a5..dace5313c 100644 --- a/test/rocprim/test_hip_block_scan.cpp +++ b/test/rocprim/test_hip_block_scan.cpp @@ -39,11 +39,13 @@ template< class T, unsigned int BlockSize = 256U, unsigned int ItemsPerThread = 1U, - rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan + rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan, + class BinaryOp = rocprim::plus > struct params { using type = T; + using binary_op_type = BinaryOp; static constexpr rocprim::block_scan_algorithm algorithm = Algorithm; static constexpr unsigned int block_size = BlockSize; static constexpr unsigned int items_per_thread = ItemsPerThread; @@ -58,6 +60,7 @@ class RocprimBlockScanSingleValueTests : public ::testing::Test { public: using type = typename Params::type; + using binary_op_type = typename Params::binary_op_type; static constexpr rocprim::block_scan_algorithm algorithm = Params::algorithm; static constexpr unsigned int block_size = Params::block_size; }; @@ -79,6 +82,10 @@ typedef ::testing::Types< params, params, params, + // char tests + params, + params, + params, // long tests params, params, @@ -131,7 +138,7 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScan) const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 100); // Calculate expected results on host std::vector expected(output.size(), 0); @@ -219,7 +226,7 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScanReduce) const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 100); std::vector output_reductions(size / block_size); // Calculate expected results on host @@ -341,7 +348,7 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScanPrefixCallback) const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 100); std::vector output_block_prefixes(size / block_size); T block_prefix = test_utils::get_random_value(0, 100); @@ -450,7 +457,7 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScan) const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 241); + std::vector output = test_utils::get_random_data(size, 2, 100); const T init = test_utils::get_random_value(0, 100); // Calculate expected results on host @@ -539,7 +546,7 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScanReduce) const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 100); const T init = test_utils::get_random_value(0, 100); // Output reduce results @@ -671,7 +678,7 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScanPrefixCallback) const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 100); std::vector output_block_prefixes(size / block_size); T block_prefix = test_utils::get_random_value(0, 100); @@ -775,7 +782,7 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, CustomStruct) std::vector output(size); { std::vector random_values = - test_utils::get_random_data(2 * output.size(), 2, 200); + test_utils::get_random_data(2 * output.size(), 2, 100); for(size_t i = 0; i < output.size(); i++) { output[i].x = random_values[i], @@ -843,6 +850,7 @@ class RocprimBlockScanInputArrayTests : public ::testing::Test { public: using type = typename Params::type; + using binary_op_type = typename Params::binary_op_type; static constexpr unsigned int block_size = Params::block_size; static constexpr rocprim::block_scan_algorithm algorithm = Params::algorithm; static constexpr unsigned int items_per_thread = Params::items_per_thread; @@ -861,6 +869,9 @@ typedef ::testing::Types< params, params, params, + params, + params, + params, // ----------------------------------------------------------------------- // rocprim::block_scan_algorithm::reduce_then_scan // ----------------------------------------------------------------------- @@ -872,7 +883,10 @@ typedef ::testing::Types< params, params, params, - params + params, + params, + params, + params > InputArrayTestParams; TYPED_TEST_CASE(RocprimBlockScanInputArrayTests, InputArrayTestParams); @@ -923,7 +937,7 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, InclusiveScan) const size_t size = items_per_block * 37; const size_t grid_size = size / items_per_block; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 100); // Calculate expected results on host std::vector expected(output.size(), 0); @@ -1030,7 +1044,7 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, InclusiveScanReduce) const size_t size = items_per_block * 37; const size_t grid_size = size / items_per_block; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 100); // Output reduce results std::vector output_reductions(size / block_size, 0); @@ -1181,7 +1195,7 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, InclusiveScanPrefixCallback) const size_t size = items_per_block * 37; const size_t grid_size = size / items_per_block; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 100); std::vector output_block_prefixes(size / items_per_block, 0); T block_prefix = test_utils::get_random_value(0, 100); @@ -1320,7 +1334,7 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, ExclusiveScan) const size_t size = items_per_block * 37; const size_t grid_size = size / items_per_block; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 100); const T init = test_utils::get_random_value(0, 100); // Calculate expected results on host @@ -1428,7 +1442,7 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, ExclusiveScanReduce) const size_t size = items_per_block * 37; const size_t grid_size = size / items_per_block; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 100); // Output reduce results std::vector output_reductions(size / block_size); @@ -1579,7 +1593,7 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, ExclusiveScanPrefixCallback) const size_t size = items_per_block * 37; const size_t grid_size = size / items_per_block; // Generate data - std::vector output = test_utils::get_random_data(size, 2, 200); + std::vector output = test_utils::get_random_data(size, 2, 100); std::vector output_block_prefixes(size / items_per_block); T block_prefix = test_utils::get_random_value(0, 100); @@ -1670,4 +1684,3 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, ExclusiveScanPrefixCallback) HIP_CHECK(hipFree(device_output)); HIP_CHECK(hipFree(device_output_bp)); } - diff --git a/test/rocprim/test_hip_device_merge_sort.cpp b/test/rocprim/test_hip_device_merge_sort.cpp index c8a212109..f56c9bd62 100644 --- a/test/rocprim/test_hip_device_merge_sort.cpp +++ b/test/rocprim/test_hip_device_merge_sort.cpp @@ -68,8 +68,10 @@ typedef ::testing::Types< DeviceSortParams, DeviceSortParams>, DeviceSortParams, - DeviceSortParams, + DeviceSortParams, DeviceSortParams, + DeviceSortParams>, + DeviceSortParams>, DeviceSortParams> > RocprimDeviceSortTestsParams; @@ -80,7 +82,7 @@ std::vector get_sizes() 1024, 2048, 5096, 34567, (1 << 17) - 1220 }; - const std::vector random_sizes = test_utils::get_random_data(2, 1, 16384); + const std::vector random_sizes = test_utils::get_random_data(5, 1, 100000); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); std::sort(sizes.begin(), sizes.end()); return sizes; @@ -102,7 +104,7 @@ TYPED_TEST(RocprimDeviceSortTests, SortKey) // Generate data std::vector input = test_utils::get_random_data(size, 0, size); - std::vector output(size, 0); + std::vector output(size); key_type * d_input; key_type * d_output; @@ -169,7 +171,7 @@ TYPED_TEST(RocprimDeviceSortTests, SortKey) // Check if output values are as expected for(size_t i = 0; i < output.size(); i++) { - ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output[i], expected[i], 0.01f)); + ASSERT_EQ(output[i], expected[i]) << "where index = " << i; } hipFree(d_input); @@ -200,8 +202,8 @@ TYPED_TEST(RocprimDeviceSortTests, SortKeyValue) std::mt19937{std::random_device{}()} ); std::vector values_input = test_utils::get_random_data(size, -1000, 1000); - std::vector keys_output(size, key_type(0)); - std::vector values_output(size, value_type(0)); + std::vector keys_output(size); + std::vector values_output(size); key_type * d_keys_input; key_type * d_keys_output; @@ -295,8 +297,8 @@ TYPED_TEST(RocprimDeviceSortTests, SortKeyValue) // Check if output values are as expected for(size_t i = 0; i < keys_output.size(); i++) { - ASSERT_EQ(keys_output[i], expected[i].first); - ASSERT_EQ(values_output[i], expected[i].second); + ASSERT_EQ(keys_output[i], expected[i].first) << "where index = " << i; + ASSERT_EQ(values_output[i], expected[i].second) << "where index = " << i; } hipFree(d_keys_input); diff --git a/test/rocprim/test_hip_device_reduce_by_key.cpp b/test/rocprim/test_hip_device_reduce_by_key.cpp index 02ee5aeb8..7f69e803b 100644 --- a/test/rocprim/test_hip_device_reduce_by_key.cpp +++ b/test/rocprim/test_hip_device_reduce_by_key.cpp @@ -100,10 +100,12 @@ typedef ::testing::Types< params, 1, 10000>, params, 1, 10>, params, 1, 30>, + params, params, 20, 100>, params, 100, 400, long long, custom_key_compare_op1>, - params, 200, 600>, + params, 200, 600>, params, 100, 2000, double, custom_key_compare_op1>, + params, params, 1000, 5000>, params, 2048, 2048>, params, 1000, 10000, long long>, @@ -296,14 +298,7 @@ TYPED_TEST(RocprimDeviceReduceByKey, ReduceByKey) ASSERT_EQ(unique_count_output[0], unique_count_expected); - for(size_t i = 0; i < unique_count_expected; i++) - { - ASSERT_EQ(unique_output[i], unique_expected[i]); - } - - for(size_t i = 0; i < unique_count_expected; i++) - { - ASSERT_EQ(aggregates_output[i], aggregates_expected[i]); - } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(unique_output, unique_expected)); + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(aggregates_output, aggregates_expected)); } } diff --git a/test/rocprim/test_hip_device_scan.cpp b/test/rocprim/test_hip_device_scan.cpp index 26ac3eef6..a18036637 100644 --- a/test/rocprim/test_hip_device_scan.cpp +++ b/test/rocprim/test_hip_device_scan.cpp @@ -34,12 +34,15 @@ #include "test_utils.hpp" +namespace rp = rocprim; + #define HIP_CHECK(error) ASSERT_EQ(static_cast(error),hipSuccess) // Params for tests template< class InputType, class OutputType = InputType, + class ScanOp = ::rocprim::plus, // Tests output iterator with void value_type (OutputIterator concept) // scan-by-key primitives don't support output iterator with void value_type bool UseIdentityIteratorIfSupported = false @@ -48,6 +51,7 @@ struct DeviceScanParams { using input_type = InputType; using output_type = OutputType; + using scan_op_type = ScanOp; static constexpr bool use_identity_iterator = UseIdentityIteratorIfSupported; }; @@ -61,16 +65,24 @@ class RocprimDeviceScanTests : public ::testing::Test public: using input_type = typename Params::input_type; using output_type = typename Params::output_type; + using scan_op_type = typename Params::scan_op_type; const bool debug_synchronous = false; static constexpr bool use_identity_iterator = Params::use_identity_iterator; }; typedef ::testing::Types< + DeviceScanParams, DeviceScanParams, - DeviceScanParams, + DeviceScanParams, true>, DeviceScanParams, - DeviceScanParams, - DeviceScanParams, test_utils::custom_test_type, true> + DeviceScanParams >, + DeviceScanParams >, + DeviceScanParams< + test_utils::custom_test_type, test_utils::custom_test_type, + rp::plus >, true + >, + DeviceScanParams, + DeviceScanParams > RocprimDeviceScanTestsParams; std::vector get_sizes() @@ -80,7 +92,7 @@ std::vector get_sizes() 1024, 2048, 5096, 34567, (1 << 18) }; - const std::vector random_sizes = test_utils::get_random_data(2, 1, 16384); + const std::vector random_sizes = test_utils::get_random_data(3, 1, 100000); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); std::sort(sizes.begin(), sizes.end()); return sizes; @@ -88,10 +100,11 @@ std::vector get_sizes() TYPED_TEST_CASE(RocprimDeviceScanTests, RocprimDeviceScanTestsParams); -TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSumEmptyInput) +TYPED_TEST(RocprimDeviceScanTests, InclusiveScanEmptyInput) { using T = typename TestFixture::input_type; using U = typename TestFixture::output_type; + using scan_op_type = typename TestFixture::scan_op_type; const bool debug_synchronous = TestFixture::debug_synchronous; hipStream_t stream = 0; // default @@ -115,7 +128,7 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSumEmptyInput) d_temp_storage, temp_storage_size_bytes, rocprim::make_constant_iterator(345), d_checking_output, - 0, ::rocprim::plus(), stream, debug_synchronous + 0, scan_op_type(), stream, debug_synchronous ) ); @@ -128,7 +141,7 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSumEmptyInput) d_temp_storage, temp_storage_size_bytes, rocprim::make_constant_iterator(345), d_checking_output, - 0, ::rocprim::plus(), stream, debug_synchronous + 0, scan_op_type(), stream, debug_synchronous ) ); HIP_CHECK(hipPeekAtLastError()); @@ -140,10 +153,11 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSumEmptyInput) hipFree(d_temp_storage); } -TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSum) +TYPED_TEST(RocprimDeviceScanTests, InclusiveScan) { using T = typename TestFixture::input_type; using U = typename TestFixture::output_type; + using scan_op_type = typename TestFixture::scan_op_type; const bool debug_synchronous = TestFixture::debug_synchronous; static constexpr bool use_identity_iterator = TestFixture::use_identity_iterator; @@ -172,13 +186,13 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSum) HIP_CHECK(hipDeviceSynchronize()); // scan function - ::rocprim::plus plus_op; + scan_op_type scan_op; // Calculate expected results on host std::vector expected(input.size()); test_utils::host_inclusive_scan( input.begin(), input.end(), - expected.begin(), plus_op + expected.begin(), scan_op ); // temp storage @@ -190,7 +204,7 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSum) d_temp_storage, temp_storage_size_bytes, d_input, test_utils::wrap_in_identity_iterator(d_output), - input.size(), plus_op, stream, debug_synchronous + input.size(), scan_op, stream, debug_synchronous ) ); @@ -207,7 +221,7 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSum) d_temp_storage, temp_storage_size_bytes, d_input, test_utils::wrap_in_identity_iterator(d_output), - input.size(), plus_op, stream, debug_synchronous + input.size(), scan_op, stream, debug_synchronous ) ); HIP_CHECK(hipPeekAtLastError()); @@ -232,10 +246,11 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanSum) } } -TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanSum) +TYPED_TEST(RocprimDeviceScanTests, ExclusiveScan) { using T = typename TestFixture::input_type; using U = typename TestFixture::output_type; + using scan_op_type = typename TestFixture::scan_op_type; const bool debug_synchronous = TestFixture::debug_synchronous; static constexpr bool use_identity_iterator = TestFixture::use_identity_iterator; @@ -264,7 +279,7 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanSum) HIP_CHECK(hipDeviceSynchronize()); // scan function - ::rocprim::plus plus_op; + scan_op_type scan_op; // Calculate expected results on host std::vector expected(input.size()); @@ -272,7 +287,7 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanSum) test_utils::host_exclusive_scan( input.begin(), input.end(), initial_value, expected.begin(), - plus_op + scan_op ); // temp storage @@ -284,7 +299,7 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanSum) d_temp_storage, temp_storage_size_bytes, d_input, test_utils::wrap_in_identity_iterator(d_output), - initial_value, input.size(), plus_op, stream, debug_synchronous + initial_value, input.size(), scan_op, stream, debug_synchronous ) ); @@ -301,7 +316,7 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanSum) d_temp_storage, temp_storage_size_bytes, d_input, test_utils::wrap_in_identity_iterator(d_output), - initial_value, input.size(), plus_op, stream, debug_synchronous + initial_value, input.size(), scan_op, stream, debug_synchronous ) ); HIP_CHECK(hipPeekAtLastError()); @@ -332,6 +347,7 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanByKey) using T = typename TestFixture::input_type; using K = unsigned int; // key type using U = typename TestFixture::output_type; + using scan_op_type = typename TestFixture::scan_op_type; const bool debug_synchronous = TestFixture::debug_synchronous; const std::vector sizes = get_sizes(); @@ -370,7 +386,7 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanByKey) HIP_CHECK(hipDeviceSynchronize()); // scan function - rocprim::plus scan_op; + scan_op_type scan_op; // key compare function rocprim::equal_to keys_compare_op; @@ -457,6 +473,7 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanByKey) using T = typename TestFixture::input_type; using K = unsigned int; // key type using U = typename TestFixture::output_type; + using scan_op_type = typename TestFixture::scan_op_type; const bool debug_synchronous = TestFixture::debug_synchronous; const std::vector sizes = get_sizes(); @@ -496,7 +513,7 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanByKey) HIP_CHECK(hipDeviceSynchronize()); // scan function - rocprim::plus scan_op; + scan_op_type scan_op; // key compare function rocprim::equal_to keys_compare_op; diff --git a/test/rocprim/test_hip_device_segmented_reduce.cpp b/test/rocprim/test_hip_device_segmented_reduce.cpp index 4ae9e3df2..d62b4314b 100644 --- a/test/rocprim/test_hip_device_segmented_reduce.cpp +++ b/test/rocprim/test_hip_device_segmented_reduce.cpp @@ -77,9 +77,13 @@ typedef ::testing::Types< params>, params, -100, 0, 10000>, params, 1000, 0, 10000>, + params, 0, 0, 2000>, params, 10, 1000, 10000>, params, 50, 2, 10>, - params, 123, 100, 200, true> + params, 123, 100, 200>, + params, 10, 3000, 4000>, + params, 0, 10, 300>, + params > Params; TYPED_TEST_CASE(RocprimDeviceSegmentedReduce, Params); diff --git a/test/rocprim/test_hip_device_segmented_scan.cpp b/test/rocprim/test_hip_device_segmented_scan.cpp index 744810ad1..050c2f4b2 100644 --- a/test/rocprim/test_hip_device_segmented_scan.cpp +++ b/test/rocprim/test_hip_device_segmented_scan.cpp @@ -80,7 +80,10 @@ typedef ::testing::Types< params, 1000, 0, 10000>, params, 10, 1000, 10000>, params, 50, 2, 10>, - params, 123, 100, 200, true> + params, 123, 100, 200, true>, + params, 10, 3000, 4000>, + params, 0, 10, 300, true>, + params > Params; TYPED_TEST_CASE(RocprimDeviceSegmentedScan, Params); diff --git a/test/rocprim/test_utils.hpp b/test/rocprim/test_utils.hpp index eb7621f84..65f35d6d0 100644 --- a/test/rocprim/test_utils.hpp +++ b/test/rocprim/test_utils.hpp @@ -49,6 +49,137 @@ std::ostream& operator<<(std::ostream& stream, const rocprim::half& value) namespace test_utils { +// Support half operators on host side + +#if defined(__HCC_ACCELERATOR__) || defined(__HIP_DEVICE_COMPILE__) + +ROCPRIM_DEVICE inline +rocprim::half half_to_native(const rocprim::half& x) +{ + return x; +} + +ROCPRIM_DEVICE inline +rocprim::half native_to_half(const rocprim::half& x) +{ + return x; +} + +#else + +ROCPRIM_HOST inline +_Float16 half_to_native(const rocprim::half& x) +{ + return *reinterpret_cast(&x); +} + +ROCPRIM_HOST inline +rocprim::half native_to_half(const _Float16& x) +{ + return *reinterpret_cast(&x); +} + +#endif + +struct half_less +{ + ROCPRIM_HOST_DEVICE inline + bool operator()(const rocprim::half& a, const rocprim::half& b) const + { + return half_to_native(a) < half_to_native(b); + } +}; + +struct half_less_equal +{ + ROCPRIM_HOST_DEVICE inline + bool operator()(const rocprim::half& a, const rocprim::half& b) const + { + return half_to_native(a) <= half_to_native(b); + } +}; + +struct half_greater +{ + ROCPRIM_HOST_DEVICE inline + bool operator()(const rocprim::half& a, const rocprim::half& b) const + { + return half_to_native(a) > half_to_native(b); + } +}; + +struct half_greater_equal +{ + ROCPRIM_HOST_DEVICE inline + bool operator()(const rocprim::half& a, const rocprim::half& b) const + { + return half_to_native(a) >= half_to_native(b); + } +}; + +struct half_equal_to +{ + ROCPRIM_HOST_DEVICE inline + bool operator()(const rocprim::half& a, const rocprim::half& b) const + { + return half_to_native(a) == half_to_native(b); + } +}; + +struct half_not_equal_to +{ + ROCPRIM_HOST_DEVICE inline + bool operator()(const rocprim::half& a, const rocprim::half& b) const + { + return half_to_native(a) != half_to_native(b); + } +}; + +struct half_plus +{ + ROCPRIM_HOST_DEVICE inline + rocprim::half operator()(const rocprim::half& a, const rocprim::half& b) const + { + return native_to_half(half_to_native(a) + half_to_native(b)); + } +}; + +struct half_minus +{ + ROCPRIM_HOST_DEVICE inline + rocprim::half operator()(const rocprim::half& a, const rocprim::half& b) const + { + return native_to_half(half_to_native(a) - half_to_native(b)); + } +}; + +struct half_multiplies +{ + ROCPRIM_HOST_DEVICE inline + rocprim::half operator()(const rocprim::half& a, const rocprim::half& b) const + { + return native_to_half(half_to_native(a) * half_to_native(b)); + } +}; + +struct half_maximum +{ + ROCPRIM_HOST_DEVICE inline + rocprim::half operator()(const rocprim::half& a, const rocprim::half& b) const + { + return half_to_native(a) < half_to_native(b) ? b : a; + } +}; + +struct half_minimum +{ + ROCPRIM_HOST_DEVICE inline + rocprim::half operator()(const rocprim::half& a, const rocprim::half& b) const + { + return half_to_native(a) < half_to_native(b) ? a : b; + } +}; + template inline auto get_random_data(size_t size, T min, T max) -> typename std::enable_if::value, std::vector>::type @@ -248,9 +379,6 @@ struct custom_test_type y = other.y; } - ROCPRIM_HOST_DEVICE inline - custom_test_type(const custom_test_type& other) : x(other.x), y(other.y) {} - ROCPRIM_HOST_DEVICE inline ~custom_test_type() {} @@ -277,13 +405,13 @@ struct custom_test_type ROCPRIM_HOST_DEVICE inline bool operator<(const custom_test_type& other) const { - return (x < other.x && y < other.y); + return (x < other.x || (x == other.x && y < other.y)); } ROCPRIM_HOST_DEVICE inline bool operator>(const custom_test_type& other) const { - return (x > other.x && y > other.y); + return (x > other.x || (x == other.x && y > other.y)); } ROCPRIM_HOST_DEVICE inline @@ -347,7 +475,7 @@ inline auto get_random_data(size_t size, typename T::value_type min, typename T: std::default_random_engine gen(rd()); std::uniform_int_distribution distribution(min, max); std::vector data(size); - std::generate(data.begin(), data.end(), [&]() { return distribution(gen); }); + std::generate(data.begin(), data.end(), [&]() { return T(distribution(gen), distribution(gen)); }); return data; } @@ -362,7 +490,7 @@ inline auto get_random_data(size_t size, typename T::value_type min, typename T: std::default_random_engine gen(rd()); std::uniform_real_distribution distribution(min, max); std::vector data(size); - std::generate(data.begin(), data.end(), [&]() { return T(distribution(gen)); }); + std::generate(data.begin(), data.end(), [&]() { return T(distribution(gen), distribution(gen)); }); return data; } @@ -381,9 +509,25 @@ auto assert_near(const std::vector& result, const std::vector& expected, c ASSERT_EQ(result.size(), expected.size()); for(size_t i = 0; i < result.size(); i++) { - auto diff = std::max(std::abs(percent * expected[i]), T(percent)); - if(std::is_integral::value) diff = 0; - ASSERT_NEAR(result[i], expected[i], diff) << "where index = " << i; + if(std::is_integral::value) + { + ASSERT_EQ(result[i], expected[i]) << "where index = " << i; + } + else + { + auto diff = std::max(std::abs(percent * expected[i]), T(percent)); + ASSERT_NEAR(result[i], expected[i], diff) << "where index = " << i; + } + } +} + +void assert_near(const std::vector& result, const std::vector& expected, float percent) +{ + ASSERT_EQ(result.size(), expected.size()); + for(size_t i = 0; i < result.size(); i++) + { + auto diff = std::max(std::abs(percent * static_cast(expected[i])), percent); + ASSERT_NEAR(static_cast(result[i]), static_cast(expected[i]), diff) << "where index = " << i; } } @@ -459,7 +603,7 @@ void assert_eq(const std::vector& result, const std::vector(result[i]), static_cast(expected[i])) << "where index = " << i; + ASSERT_EQ(half_to_native(result[i]), half_to_native(expected[i])) << "where index = " << i; } }