From 74fa48daaa1b554835af929abe0a09398c070ea9 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Fri, 1 Aug 2025 12:01:34 -0700 Subject: [PATCH 01/10] chery-pick syntax fix from fork --- .../include/rocprim/block/block_shuffle.hpp | 54 ++++++++++--------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/projects/rocprim/rocprim/include/rocprim/block/block_shuffle.hpp b/projects/rocprim/rocprim/include/rocprim/block/block_shuffle.hpp index 0db3ea34fd0..c1278bb8f28 100644 --- a/projects/rocprim/rocprim/include/rocprim/block/block_shuffle.hpp +++ b/projects/rocprim/rocprim/include/rocprim/block/block_shuffle.hpp @@ -168,8 +168,8 @@ class block_shuffle /// \param [out] output reference to a output value, that receives data from another thread /// \param [in] distance The input threadId + distance = output threadId. /// \param [in] storage reference to a temporary storage object of type storage_type. - ROCPRIM_DEVICE ROCPRIM_INLINE void - offset(const size_t& flat_id, T input, T& output, int distance, storage_type& storage) + ROCPRIM_DEVICE ROCPRIM_INLINE + void offset(const size_t& flat_id, T input, T& output, int distance, storage_type& storage) { storage.buffer.emplace(flat_id, input); @@ -243,8 +243,8 @@ class block_shuffle /// \param [out] output reference to a output value, that receives data from another thread /// \param [in] distance The input threadId + distance = output threadId. /// \param [in] storage reference to a temporary storage object of type storage_type. - ROCPRIM_DEVICE ROCPRIM_INLINE void - rotate(const size_t& flat_id, T input, T& output, int distance, storage_type& storage) + ROCPRIM_DEVICE ROCPRIM_INLINE + void rotate(const size_t& flat_id, T input, T& output, int distance, storage_type& storage) { storage.buffer.emplace(flat_id, input); @@ -320,10 +320,11 @@ class block_shuffle /// \param [in] storage reference to a temporary storage object of type storage_type. /// The item \p prev[0] is not updated for thread0. template - ROCPRIM_DEVICE ROCPRIM_INLINE void up(const size_t& flat_id, - T (&input)[ItemsPerThread], - T (&prev)[ItemsPerThread], - storage_type& storage) + ROCPRIM_DEVICE ROCPRIM_INLINE + void up(const size_t& flat_id, + T (&input)[ItemsPerThread], + T (&prev)[ItemsPerThread], + storage_type& storage) { storage.buffer.emplace(flat_id, input[ItemsPerThread - 1]); @@ -390,16 +391,17 @@ class block_shuffle /// threadBlockSize-1, provided to all threads /// \param [in] storage reference to a temporary storage object of type storage_type. template - ROCPRIM_DEVICE ROCPRIM_INLINE void up(const size_t& flat_id, - T (&input)[ItemsPerThread], - T (&prev)[ItemsPerThread], - T& block_suffix, - storage_type& storage) + ROCPRIM_DEVICE ROCPRIM_INLINE + void up(const size_t& flat_id, + T (&input)[ItemsPerThread], + T (&prev)[ItemsPerThread], + T& block_suffix, + storage_type& storage) { up(flat_id, input, prev, storage); // Update block prefix - block_suffix = storage->buffer.get_unsafe_array()[BlockSize - 1]; + block_suffix = storage.buffer.get_unsafe_array()[BlockSize - 1]; } /// \brief The thread block rotates a blocked arrange of input items, @@ -458,10 +460,11 @@ class block_shuffle /// The item \p prev[0] is not updated for threadBlockSize - 1. /// \param [in] storage reference to a temporary storage object of type storage_type. template - ROCPRIM_DEVICE ROCPRIM_INLINE void down(const size_t& flat_id, - T (&input)[ItemsPerThread], - T (&next)[ItemsPerThread], - storage_type& storage) + ROCPRIM_DEVICE ROCPRIM_INLINE + void down(const size_t& flat_id, + T (&input)[ItemsPerThread], + T (&next)[ItemsPerThread], + storage_type& storage) { storage.buffer.emplace(flat_id, input[0]); @@ -525,16 +528,17 @@ class block_shuffle /// \param [out] block_prefix The item \p input[0] from thread0, provided to all threads /// \param [in] storage reference to a temporary storage object of type storage_type. template - ROCPRIM_DEVICE ROCPRIM_INLINE void down(const size_t& flat_id, - T (&input)[ItemsPerThread], - T (&next)[ItemsPerThread], - T& block_prefix, - storage_type& storage) + ROCPRIM_DEVICE ROCPRIM_INLINE + void down(const size_t& flat_id, + T (&input)[ItemsPerThread], + T (&next)[ItemsPerThread], + T& block_prefix, + storage_type& storage) { this->down(flat_id, input, next, storage); // Update block prefixstorage_-> - block_prefix = storage->next[0]; + block_prefix = storage.buffer.get_unsafe_array()[0]; } }; @@ -543,4 +547,4 @@ END_ROCPRIM_NAMESPACE /// @} // end of group blockmodule -#endif // ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_ +#endif // ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_ \ No newline at end of file From ee7fe6c0b7bd1c2f9935890fbe8421dff7219a24 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Fri, 1 Aug 2025 12:50:56 -0700 Subject: [PATCH 02/10] deleted .jenkins to get correct math-ci configuration --- projects/rocprim/.jenkins/common.groovy | 105 -------------------- projects/rocprim/.jenkins/precheckin.groovy | 81 --------------- projects/rocprim/.jenkins/static.groovy | 82 --------------- 3 files changed, 268 deletions(-) delete mode 100644 projects/rocprim/.jenkins/common.groovy delete mode 100644 projects/rocprim/.jenkins/precheckin.groovy delete mode 100644 projects/rocprim/.jenkins/static.groovy diff --git a/projects/rocprim/.jenkins/common.groovy b/projects/rocprim/.jenkins/common.groovy deleted file mode 100644 index 0ffd1dee600..00000000000 --- a/projects/rocprim/.jenkins/common.groovy +++ /dev/null @@ -1,105 +0,0 @@ -// This file is for internal AMD use. -// If you are interested in running your own Jenkins, please raise a github issue for assistance. - -def runCompileCommand(platform, project, jobName, boolean debug=false, boolean staticLibrary=false) -{ - project.paths.construct_build_prefix() - - String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release' - String buildStatic = staticLibrary ? '-DBUILD_SHARED_LIBS=OFF' : '-DBUILD_SHARED_LIBS=ON' - String buildTypeDir = debug ? 'debug' : 'release' - String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' - //Set CI node's gfx arch as target if PR, otherwise use default targets of the library - String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' - - def command = """#!/usr/bin/env bash - set -x - cd ${project.paths.project_build_prefix} - mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} - ${auxiliary.gfxTargetParser()} - ${cmake} --toolchain=toolchain-linux.cmake ${buildTypeArg} ${buildStatic} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../.. - make -j\$(nproc) - """ - - platform.runCommand(this, command) -} - - -def runTestCommand (platform, project, boolean rocmExamples=false) -{ - String sudo = auxiliary.sudo(platform.jenkinsLabel) - - def testCommand = "ctest --output-on-failure " - def testCommandExcludeRegex = /(rocprim.block_histogram)/ - def testCommandExclude = "--exclude-regex \"${testCommandExcludeRegex}\"" - def hmmExcludeRegex = '' - def hmmTestCommandExclude = "--exclude-regex \"${hmmExcludeRegex}\"" - def hmmTestCommand = '' - if (platform.jenkinsLabel.contains('gfx90a')) - { - echo("HMM TESTS DISABLED") - /*hmmTestCommand = """ - export HSA_XNACK=1 - export ROCPRIM_USE_HMM=1 - ${testCommand} ${hmmTestCommandExclude} - """*/ - } - echo(env.JOB_NAME) - if (env.JOB_NAME.contains('bleeding-edge')) - { - testCommand = '' - testCommandExclude = '' - hmmTestCommand = '' - echo("TESTS DISABLED") - } - def command = """#!/usr/bin/env bash - set -x - cd ${project.paths.project_build_prefix} - cd ${project.testDirectory} - ${testCommand} ${testCommandExclude} - if (( \$? != 0 )); then - exit 1 - fi - ${hmmTestCommand} - """ - platform.runCommand(this, command) - //ROCM Examples - if (rocmExamples){ - String buildString = "" - if (platform.os.contains("ubuntu")){ - buildString += "sudo dpkg -i *.deb" - } - else { - buildString += "sudo rpm -i *.rpm" - } - testCommand = """#!/usr/bin/env bash - set -ex - cd ${project.paths.project_build_prefix}/build/release/package - ls - ${buildString} - cd ../../.. - testDirs=("Libraries/rocPRIM") - git clone https://github.com/ROCm/rocm-examples.git - rocm_examples_dir=\$(readlink -f rocm-examples) - for testDir in \${testDirs[@]}; do - cd \${rocm_examples_dir}/\${testDir} - cmake -S . -B build - cmake --build build - cd ./build - ctest --output-on-failure - done - """ - platform.runCommand(this, testCommand, "ROCM Examples") - - } -} - -def runPackageCommand(platform, project) -{ - def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release") - - platform.runCommand(this, packageHelper[0]) - platform.archiveArtifacts(this, packageHelper[1]) -} - -return this diff --git a/projects/rocprim/.jenkins/precheckin.groovy b/projects/rocprim/.jenkins/precheckin.groovy deleted file mode 100644 index bbb8274743c..00000000000 --- a/projects/rocprim/.jenkins/precheckin.groovy +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env groovy -@Library('rocJenkins@pong') _ -import com.amd.project.* -import com.amd.docker.* -import java.nio.file.Path; - -def runCI = -{ - nodeDetails, jobName-> - - def prj = new rocProject('rocPRIM', 'PreCheckin') - prj.paths.build_command = './install -c' - prj.timeout.compile = 600 - - def nodes = new dockerNodes(nodeDetails, jobName, prj) - - def commonGroovy - - boolean formatCheck = false - - def compileCommand = - { - platform, project-> - - commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" - commonGroovy.runCompileCommand(platform, project, jobName) - } - - def testCommand = - { - platform, project-> - - commonGroovy.runTestCommand(platform, project, true) - } - - def packageCommand = - { - platform, project-> - - commonGroovy.runPackageCommand(platform, project) - } - - buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) -} - -ci: { - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) - - def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], - "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], - "rocm-docker":[]] - propertyList = auxiliary.appendPropertyList(propertyList) - - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] - jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM') - - propertyList.each - { - jobName, property-> - if (urlJobName == jobName) - properties(auxiliary.addCommonProperties(property)) - } - - jobNameList.each - { - jobName, nodeDetails-> - if (urlJobName == jobName) - stage(jobName) { - runCI(nodeDetails, jobName) - } - } - - // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 - if(!jobNameList.keySet().contains(urlJobName)) - { - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) - stage(urlJobName) { - runCI([ubuntu16:['gfx906']], urlJobName) - } - } -} diff --git a/projects/rocprim/.jenkins/static.groovy b/projects/rocprim/.jenkins/static.groovy deleted file mode 100644 index 75606419fdf..00000000000 --- a/projects/rocprim/.jenkins/static.groovy +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env groovy -@Library('rocJenkins@pong') _ -import com.amd.project.* -import com.amd.docker.* -import java.nio.file.Path; - -def runCI = -{ - nodeDetails, jobName-> - - def prj = new rocProject('rocPRIM', 'static') - prj.paths.build_command = './install -c -s' - prj.timeout.compile = 600 - prj.timeout.packaging = 120 - - def nodes = new dockerNodes(nodeDetails, jobName, prj) - - def commonGroovy - - boolean formatCheck = false - - def compileCommand = - { - platform, project-> - - commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" - commonGroovy.runCompileCommand(platform, project, jobName, debug=false, staticLibrary=true) - } - - def testCommand = - { - platform, project-> - - commonGroovy.runTestCommand(platform, project) - } - - def packageCommand = - { - platform, project-> - - commonGroovy.runPackageCommand(platform, project) - } - - buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) -} - -ci: { - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) - - def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], - "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], - "rocm-docker":[]] - propertyList = auxiliary.appendPropertyList(propertyList) - - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] - jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM') - - propertyList.each - { - jobName, property-> - if (urlJobName == jobName) - properties(auxiliary.addCommonProperties(property)) - } - - jobNameList.each - { - jobName, nodeDetails-> - if (urlJobName == jobName) - stage(jobName) { - runCI(nodeDetails, jobName) - } - } - - // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 - if(!jobNameList.keySet().contains(urlJobName)) - { - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) - stage(urlJobName) { - runCI([ubuntu16:['gfx906']], urlJobName) - } - } -} From ba1ca085bbe07bb0a3ed98bd5f450f45360b5d00 Mon Sep 17 00:00:00 2001 From: Di Nguyen Date: Wed, 16 Jul 2025 09:23:54 -0600 Subject: [PATCH 03/10] [hipCUB][Code Coverage] Increase Code Coverage (#411) Re making PR #172 to merge to develop instead of release-staging --- projects/hipcub/CHANGELOG.md | 7 + .../test_hipcub_block_discontinuity.cpp | 361 ++- .../hipcub/test_hipcub_block_exchange.cpp | 1618 +++++++++++-- .../hipcub/test_hipcub_block_merge_sort.cpp | 903 ++++++- .../hipcub/test_hipcub_block_radix_rank.cpp | 524 ++++- .../hipcub/test_hipcub_block_radix_sort.cpp | 443 ++-- .../test/hipcub/test_hipcub_block_reduce.cpp | 572 +++-- .../test_hipcub_block_run_length_decode.cpp | 194 +- .../test/hipcub/test_hipcub_block_scan.cpp | 2072 ++++++++++++++--- .../test/hipcub/test_hipcub_block_shuffle.cpp | 512 ++-- 10 files changed, 5764 insertions(+), 1442 deletions(-) diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md index a388d28f69d..bc353989653 100644 --- a/projects/hipcub/CHANGELOG.md +++ b/projects/hipcub/CHANGELOG.md @@ -19,6 +19,13 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project * `UnrolledThreadLoad`, `UnrolledCopy`, and `ThreadLoadVolatilePointer` were added to align hipCUB with CUB. * `ThreadStoreVolatilePtr` and the `IterateThreadStore` struct were added to align hipCUB with CUB. * Added `hipcub::InclusiveScanInit` for CUB parity. +* Additional Unit Tests for: + * block_exchange + * block_merge_sort + * block_radix_rank + * block_radix_sort + * block_reduce + * block_shuffle ### Removed diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp index 29e07b759a3..827d34451f5 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp @@ -24,28 +24,23 @@ // hipcub API #include "hipcub/block/block_discontinuity.hpp" -#include "hipcub/thread/thread_operators.hpp" #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" +#include "hipcub/thread/thread_operators.hpp" -template< - class T, - class Flag, - unsigned int BlockSize, - unsigned int ItemsPerThread, - class FlagOp -> +template struct params { - using type = T; - using flag_type = Flag; - static constexpr unsigned int block_size = BlockSize; + using type = T; + using flag_type = Flag; + static constexpr unsigned int block_size = BlockSize; static constexpr unsigned int items_per_thread = ItemsPerThread; - using flag_op_type = FlagOp; + using flag_op_type = FlagOp; }; template -class HipcubBlockDiscontinuity : public ::testing::Test { +class HipcubBlockDiscontinuity : public ::testing::Test +{ public: using params = Params; }; @@ -111,20 +106,18 @@ using Params = ::testing::Types< TYPED_TEST_SUITE(HipcubBlockDiscontinuity, Params); -template< - class Type, - class FlagType, - class FlagOpType, - unsigned int BlockSize, - unsigned int ItemsPerThread -> +template __global__ __launch_bounds__(BlockSize) void flag_heads_kernel(Type* device_input, long long* device_heads) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int block_offset = hipBlockIdx_x * items_per_block; Type input[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, device_input + block_offset, input); @@ -154,17 +147,16 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeads) using type = typename TestFixture::params::type; // std::vector is a special case that will cause an error in hipMemcpy using stored_flag_type = typename std::conditional< - std::is_same::value, - int, - typename TestFixture::params::flag_type - >::type; - using flag_type = typename TestFixture::params::flag_type; - using flag_op_type = typename TestFixture::params::flag_op_type; - constexpr size_t block_size = TestFixture::params::block_size; + std::is_same::value, + int, + typename TestFixture::params::flag_type>::type; + using flag_type = typename TestFixture::params::flag_type; + using flag_op_type = typename TestFixture::params::flag_op_type; + constexpr size_t block_size = TestFixture::params::block_size; constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 2048; - constexpr size_t grid_size = size / items_per_block; + constexpr size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 2048; + constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -172,10 +164,10 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeads) return; } - - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data @@ -188,7 +180,7 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeads) // Calculate expected results on host std::vector expected_heads(size); - flag_op_type flag_op; + flag_op_type flag_op; for(size_t bi = 0; bi < size / items_per_block; bi++) { for(size_t ii = 0; ii < items_per_block; ii++) @@ -196,9 +188,8 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeads) const size_t i = bi * items_per_block + ii; if(ii == 0) { - expected_heads[i] = bi % 2 == 1 - ? apply(flag_op, input[i - 1], input[i], ii) - : flag_type(true); + expected_heads[i] = bi % 2 == 1 ? apply(flag_op, input[i - 1], input[i], ii) + : flag_type(true); } else { @@ -209,40 +200,38 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeads) // Preparing Device type* device_input; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_input, + input.size() * sizeof(typename decltype(input)::value_type))); long long* device_heads; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_heads, heads.size() * sizeof(typename decltype(heads)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_heads, + heads.size() * sizeof(typename decltype(heads)::value_type))); - HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_input, + input.data(), + input.size() * sizeof(type), + hipMemcpyHostToDevice)); + HIP_CHECK(hipGetLastError()); // Running kernel hipLaunchKernelGGL( HIP_KERNEL_NAME( - flag_heads_kernel< - type, flag_type, flag_op_type, - block_size, items_per_thread - > - ), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_heads - ); - HIP_CHECK(hipPeekAtLastError()); + flag_heads_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_heads); + HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - heads.data(), device_heads, - heads.size() * sizeof(typename decltype(heads)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(heads.data(), + device_heads, + heads.size() * sizeof(typename decltype(heads)::value_type), + hipMemcpyDeviceToHost)); // Validating results for(size_t i = 0; i < size; i++) @@ -255,20 +244,18 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeads) } } -template< - class Type, - class FlagType, - class FlagOpType, - unsigned int BlockSize, - unsigned int ItemsPerThread -> +template __global__ __launch_bounds__(BlockSize) void flag_tails_kernel(Type* device_input, long long* device_tails) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int block_offset = hipBlockIdx_x * items_per_block; Type input[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, device_input + block_offset, input); @@ -298,17 +285,16 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagTails) using type = typename TestFixture::params::type; // std::vector is a special case that will cause an error in hipMemcpy using stored_flag_type = typename std::conditional< - std::is_same::value, - int, - typename TestFixture::params::flag_type - >::type; - using flag_type = typename TestFixture::params::flag_type; - using flag_op_type = typename TestFixture::params::flag_op_type; - constexpr size_t block_size = TestFixture::params::block_size; + std::is_same::value, + int, + typename TestFixture::params::flag_type>::type; + using flag_type = typename TestFixture::params::flag_type; + using flag_op_type = typename TestFixture::params::flag_op_type; + constexpr size_t block_size = TestFixture::params::block_size; constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 2048; - constexpr size_t grid_size = size / items_per_block; + constexpr size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 2048; + constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -316,9 +302,10 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagTails) return; } - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data @@ -331,7 +318,7 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagTails) // Calculate expected results on host std::vector expected_tails(size); - flag_op_type flag_op; + flag_op_type flag_op; for(size_t bi = 0; bi < size / items_per_block; bi++) { for(size_t ii = 0; ii < items_per_block; ii++) @@ -339,9 +326,8 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagTails) const size_t i = bi * items_per_block + ii; if(ii == items_per_block - 1) { - expected_tails[i] = bi % 2 == 0 - ? apply(flag_op, input[i], input[i + 1], ii + 1) - : flag_type(true); + expected_tails[i] = bi % 2 == 0 ? apply(flag_op, input[i], input[i + 1], ii + 1) + : flag_type(true); } else { @@ -352,40 +338,39 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagTails) // Preparing Device type* device_input; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_input, + input.size() * sizeof(typename decltype(input)::value_type))); long long* device_tails; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_tails, tails.size() * sizeof(typename decltype(tails)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_tails, + tails.size() * sizeof(typename decltype(tails)::value_type))); - HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_input, + input.data(), + input.size() * sizeof(type), + hipMemcpyHostToDevice)); + HIP_CHECK(hipGetLastError()); // Running kernel hipLaunchKernelGGL( HIP_KERNEL_NAME( - flag_tails_kernel< - type, flag_type, flag_op_type, - block_size, items_per_thread - > - ), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_tails - ); - HIP_CHECK(hipPeekAtLastError()); + flag_tails_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_tails); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - tails.data(), device_tails, - tails.size() * sizeof(typename decltype(tails)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(tails.data(), + device_tails, + tails.size() * sizeof(typename decltype(tails)::value_type), + hipMemcpyDeviceToHost)); // Validating results for(size_t i = 0; i < size; i++) @@ -398,20 +383,20 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagTails) } } -template< - class Type, - class FlagType, - class FlagOpType, - unsigned int BlockSize, - unsigned int ItemsPerThread -> +template __global__ __launch_bounds__(BlockSize) -void flag_heads_and_tails_kernel(Type* device_input, long long* device_heads, long long* device_tails) +void flag_heads_and_tails_kernel(Type* device_input, + long long* device_heads, + long long* device_tails) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int block_offset = hipBlockIdx_x * items_per_block; Type input[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, device_input + block_offset, input); @@ -423,18 +408,31 @@ void flag_heads_and_tails_kernel(Type* device_input, long long* device_heads, lo if(hipBlockIdx_x % 4 == 0) { const Type tile_successor_item = device_input[block_offset + items_per_block]; - bdiscontinuity.FlagHeadsAndTails(head_flags, tail_flags, tile_successor_item, input, FlagOpType()); + bdiscontinuity.FlagHeadsAndTails(head_flags, + tail_flags, + tile_successor_item, + input, + FlagOpType()); } else if(hipBlockIdx_x % 4 == 1) { const Type tile_predecessor_item = device_input[block_offset - 1]; - const Type tile_successor_item = device_input[block_offset + items_per_block]; - bdiscontinuity.FlagHeadsAndTails(head_flags, tile_predecessor_item, tail_flags, tile_successor_item, input, FlagOpType()); + const Type tile_successor_item = device_input[block_offset + items_per_block]; + bdiscontinuity.FlagHeadsAndTails(head_flags, + tile_predecessor_item, + tail_flags, + tile_successor_item, + input, + FlagOpType()); } else if(hipBlockIdx_x % 4 == 2) { const Type tile_predecessor_item = device_input[block_offset - 1]; - bdiscontinuity.FlagHeadsAndTails(head_flags, tile_predecessor_item, tail_flags, input, FlagOpType()); + bdiscontinuity.FlagHeadsAndTails(head_flags, + tile_predecessor_item, + tail_flags, + input, + FlagOpType()); } else if(hipBlockIdx_x % 4 == 3) { @@ -454,17 +452,16 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeadsAndTails) using type = typename TestFixture::params::type; // std::vector is a special case that will cause an error in hipMemcpy using stored_flag_type = typename std::conditional< - std::is_same::value, - int, - typename TestFixture::params::flag_type - >::type; - using flag_type = typename TestFixture::params::flag_type; - using flag_op_type = typename TestFixture::params::flag_op_type; - constexpr size_t block_size = TestFixture::params::block_size; + std::is_same::value, + int, + typename TestFixture::params::flag_type>::type; + using flag_type = typename TestFixture::params::flag_type; + using flag_op_type = typename TestFixture::params::flag_op_type; + constexpr size_t block_size = TestFixture::params::block_size; constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 2048; - constexpr size_t grid_size = size / items_per_block; + constexpr size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 2048; + constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -472,10 +469,10 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeadsAndTails) return; } - - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data @@ -490,7 +487,7 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeadsAndTails) // Calculate expected results on host std::vector expected_heads(size); std::vector expected_tails(size); - flag_op_type flag_op; + flag_op_type flag_op; for(size_t bi = 0; bi < size / items_per_block; bi++) { for(size_t ii = 0; ii < items_per_block; ii++) @@ -499,8 +496,8 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeadsAndTails) if(ii == 0) { expected_heads[i] = (bi % 4 == 1 || bi % 4 == 2) - ? apply(flag_op, input[i - 1], input[i], ii) - : flag_type(true); + ? apply(flag_op, input[i - 1], input[i], ii) + : flag_type(true); } else { @@ -509,8 +506,8 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeadsAndTails) if(ii == items_per_block - 1) { expected_tails[i] = (bi % 4 == 0 || bi % 4 == 1) - ? apply(flag_op, input[i], input[i + 1], ii + 1) - : flag_type(true); + ? apply(flag_op, input[i], input[i + 1], ii + 1) + : flag_type(true); } else { @@ -521,50 +518,50 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeadsAndTails) // Preparing Device type* device_input; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_input, + input.size() * sizeof(typename decltype(input)::value_type))); long long* device_heads; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_heads, tails.size() * sizeof(typename decltype(heads)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_heads, + tails.size() * sizeof(typename decltype(heads)::value_type))); long long* device_tails; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_tails, tails.size() * sizeof(typename decltype(tails)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_tails, + tails.size() * sizeof(typename decltype(tails)::value_type))); - HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_input, + input.data(), + input.size() * sizeof(type), + hipMemcpyHostToDevice)); + HIP_CHECK(hipGetLastError()); // Running kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - flag_heads_and_tails_kernel< - type, flag_type, flag_op_type, - block_size, items_per_thread - > - ), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_heads, device_tails - ); - HIP_CHECK(hipPeekAtLastError()); + hipLaunchKernelGGL(HIP_KERNEL_NAME(flag_heads_and_tails_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_heads, + device_tails); + HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - heads.data(), device_heads, - heads.size() * sizeof(typename decltype(heads)::value_type), - hipMemcpyDeviceToHost - ) - ); - - HIP_CHECK( - hipMemcpy( - tails.data(), device_tails, - tails.size() * sizeof(typename decltype(tails)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(heads.data(), + device_heads, + heads.size() * sizeof(typename decltype(heads)::value_type), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipMemcpy(tails.data(), + device_tails, + tails.size() * sizeof(typename decltype(tails)::value_type), + hipMemcpyDeviceToHost)); // Validating results for(size_t i = 0; i < size; i++) diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_exchange.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_exchange.cpp index 4c14811ed2d..08833ccfb80 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_exchange.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_exchange.cpp @@ -26,22 +26,18 @@ #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" -template< - class T, - class U, - unsigned int BlockSize, - unsigned int ItemsPerThread -> +template struct params { - using type = T; - using output_type = U; - static constexpr unsigned int block_size = BlockSize; + using type = T; + using output_type = U; + static constexpr unsigned int block_size = BlockSize; static constexpr unsigned int items_per_thread = ItemsPerThread; }; template -class HipcubBlockExchangeTests : public ::testing::Test { +class HipcubBlockExchangeTests : public ::testing::Test +{ public: using params = Params; }; @@ -58,8 +54,8 @@ struct dummy dummy() = default; template - HIPCUB_HOST_DEVICE - dummy(U a) : x(a + 1), y(a * 2) { } + HIPCUB_HOST_DEVICE dummy(U a) : x(a + 1), y(a * 2) + {} HIPCUB_HOST_DEVICE bool operator==(const dummy& rhs) const @@ -96,21 +92,16 @@ using Params = ::testing::Types< TYPED_TEST_SUITE(HipcubBlockExchangeTests, Params); -template< - class Type, - class OutputType, - unsigned int ItemsPerBlock, - unsigned int ItemsPerThread -> +template __global__ __launch_bounds__(512) void blocked_to_striped_kernel(Type* device_input, OutputType* device_output) { - constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; - Type input[ItemsPerThread]; + Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, device_input + block_offset, input); @@ -141,7 +132,7 @@ TYPED_TEST(HipcubBlockExchangeTests, BlockedToStriped) const size_t size = items_per_block * 113; // Generate data - std::vector input(size); + std::vector input(size); std::vector expected(size); std::vector output(size, test_utils::convert_to_device(0)); @@ -165,36 +156,37 @@ TYPED_TEST(HipcubBlockExchangeTests, BlockedToStriped) // Preparing device type* device_input; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_input, + input.size() * sizeof(typename decltype(input)::value_type))); output_type* device_output; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); + HIP_CHECK(hipGetLastError()); // Running kernel constexpr unsigned int grid_size = (size / items_per_block); hipLaunchKernelGGL( - HIP_KERNEL_NAME(blocked_to_striped_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output - ); - HIP_CHECK(hipPeekAtLastError()); + HIP_KERNEL_NAME( + blocked_to_striped_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output); + HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < size; i++) { @@ -206,21 +198,16 @@ TYPED_TEST(HipcubBlockExchangeTests, BlockedToStriped) HIP_CHECK(hipFree(device_output)); } -template< - class Type, - class OutputType, - unsigned int ItemsPerBlock, - unsigned int ItemsPerThread -> +template __global__ __launch_bounds__(512) void striped_to_blocked_kernel(Type* device_input, OutputType* device_output) { - constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; - Type input[ItemsPerThread]; + Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, device_input + block_offset, input); @@ -251,7 +238,7 @@ TYPED_TEST(HipcubBlockExchangeTests, StripedToBlocked) const size_t size = items_per_block * 113; // Generate data - std::vector input(size); + std::vector input(size); std::vector expected(size); std::vector output(size, test_utils::convert_to_device(0)); @@ -275,36 +262,36 @@ TYPED_TEST(HipcubBlockExchangeTests, StripedToBlocked) // Preparing device type* device_input; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_input, + input.size() * sizeof(typename decltype(input)::value_type))); output_type* device_output; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); // Running kernel constexpr unsigned int grid_size = (size / items_per_block); hipLaunchKernelGGL( - HIP_KERNEL_NAME(striped_to_blocked_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output - ); - HIP_CHECK(hipPeekAtLastError()); + HIP_KERNEL_NAME( + striped_to_blocked_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output); + HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < size; i++) { @@ -316,21 +303,16 @@ TYPED_TEST(HipcubBlockExchangeTests, StripedToBlocked) HIP_CHECK(hipFree(device_output)); } -template< - class Type, - class OutputType, - unsigned int ItemsPerBlock, - unsigned int ItemsPerThread -> +template __global__ __launch_bounds__(512) void blocked_to_warp_striped_kernel(Type* device_input, OutputType* device_output) { - constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; - Type input[ItemsPerThread]; + Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, device_input + block_offset, input); @@ -372,14 +354,16 @@ TYPED_TEST(HipcubBlockExchangeTests, BlockedToWarpStriped) const size_t size = items_per_block * 113; // Generate data - std::vector input(size); + std::vector input(size); std::vector expected(size); std::vector output(size, test_utils::convert_to_device(0)); - constexpr size_t warp_size_32 = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_32)); - constexpr size_t warp_size_64 = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_64)); - constexpr size_t warps_no_32 = (block_size + warp_size_32 - 1) / warp_size_32; - constexpr size_t warps_no_64 = (block_size + warp_size_64 - 1) / warp_size_64; + constexpr size_t warp_size_32 + = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_32)); + constexpr size_t warp_size_64 + = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_64)); + constexpr size_t warps_no_32 = (block_size + warp_size_32 - 1) / warp_size_32; + constexpr size_t warps_no_64 = (block_size + warp_size_64 - 1) / warp_size_64; constexpr size_t items_per_warp_32 = warp_size_32 * items_per_thread; constexpr size_t items_per_warp_64 = warp_size_64 * items_per_thread; @@ -387,24 +371,28 @@ TYPED_TEST(HipcubBlockExchangeTests, BlockedToWarpStriped) std::vector values(size); std::iota(values.begin(), values.end(), 0); - const size_t warps_no = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warps_no_32 : warps_no_64; - const size_t warp_size = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warp_size_32 : warp_size_64; - const size_t items_per_warp = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? items_per_warp_32 : items_per_warp_64; + const size_t warps_no + = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warps_no_32 : warps_no_64; + const size_t warp_size + = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warp_size_32 : warp_size_64; + const size_t items_per_warp + = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? items_per_warp_32 : items_per_warp_64; for(size_t bi = 0; bi < size / items_per_block; bi++) { for(size_t wi = 0; wi < warps_no; wi++) { - const size_t current_warp_size = wi == warps_no - 1 - ? (block_size % warp_size != 0 ? block_size % warp_size : warp_size) - : warp_size; + const size_t current_warp_size + = wi == warps_no - 1 + ? (block_size % warp_size != 0 ? block_size % warp_size : warp_size) + : warp_size; for(size_t li = 0; li < current_warp_size; li++) { for(size_t ii = 0; ii < items_per_thread; ii++) { const size_t offset = bi * items_per_block + wi * items_per_warp; - const size_t i0 = offset + li * items_per_thread + ii; - const size_t i1 = offset + ii * current_warp_size + li; + const size_t i0 = offset + li * items_per_thread + ii; + const size_t i1 = offset + ii * current_warp_size + li; input[i1] = test_utils::convert_to_device(values[i1]); expected[i0] = test_utils::convert_to_device(values[i1]); } @@ -414,38 +402,36 @@ TYPED_TEST(HipcubBlockExchangeTests, BlockedToWarpStriped) // Preparing device type* device_input; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_input, + input.size() * sizeof(typename decltype(input)::value_type))); output_type* device_output; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); // Running kernel constexpr unsigned int grid_size = (size / items_per_block); hipLaunchKernelGGL( - HIP_KERNEL_NAME(blocked_to_warp_striped_kernel< - type, output_type, items_per_block, items_per_thread - >), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output - ); - HIP_CHECK(hipPeekAtLastError()); + HIP_KERNEL_NAME( + blocked_to_warp_striped_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output); + HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < size; i++) { @@ -457,21 +443,16 @@ TYPED_TEST(HipcubBlockExchangeTests, BlockedToWarpStriped) HIP_CHECK(hipFree(device_output)); } -template< - class Type, - class OutputType, - unsigned int ItemsPerBlock, - unsigned int ItemsPerThread -> +template __global__ __launch_bounds__(512) void warp_striped_to_blocked_kernel(Type* device_input, OutputType* device_output) { - constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; - Type input[ItemsPerThread]; + Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, device_input + block_offset, input); @@ -513,14 +494,16 @@ TYPED_TEST(HipcubBlockExchangeTests, WarpStripedToBlocked) const size_t size = items_per_block * 113; // Generate data - std::vector input(size); + std::vector input(size); std::vector expected(size); std::vector output(size, test_utils::convert_to_device(0)); - constexpr size_t warp_size_32 = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_32)); - constexpr size_t warp_size_64 = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_64)); - constexpr size_t warps_no_32 = (block_size + warp_size_32 - 1) / warp_size_32; - constexpr size_t warps_no_64 = (block_size + warp_size_64 - 1) / warp_size_64; + constexpr size_t warp_size_32 + = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_32)); + constexpr size_t warp_size_64 + = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_64)); + constexpr size_t warps_no_32 = (block_size + warp_size_32 - 1) / warp_size_32; + constexpr size_t warps_no_64 = (block_size + warp_size_64 - 1) / warp_size_64; constexpr size_t items_per_warp_32 = warp_size_32 * items_per_thread; constexpr size_t items_per_warp_64 = warp_size_64 * items_per_thread; @@ -528,17 +511,21 @@ TYPED_TEST(HipcubBlockExchangeTests, WarpStripedToBlocked) std::vector values(size); std::iota(values.begin(), values.end(), 0); - const size_t warps_no = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warps_no_32 : warps_no_64; - const size_t warp_size = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warp_size_32 : warp_size_64; - const size_t items_per_warp = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? items_per_warp_32 : items_per_warp_64; + const size_t warps_no + = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warps_no_32 : warps_no_64; + const size_t warp_size + = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warp_size_32 : warp_size_64; + const size_t items_per_warp + = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? items_per_warp_32 : items_per_warp_64; for(size_t bi = 0; bi < size / items_per_block; bi++) { for(size_t wi = 0; wi < warps_no; wi++) { - const size_t current_warp_size = wi == warps_no - 1 - ? (block_size % warp_size != 0 ? block_size % warp_size : warp_size) - : warp_size; + const size_t current_warp_size + = wi == warps_no - 1 + ? (block_size % warp_size != 0 ? block_size % warp_size : warp_size) + : warp_size; for(size_t li = 0; li < current_warp_size; li++) { for(size_t ii = 0; ii < items_per_thread; ii++) @@ -555,36 +542,36 @@ TYPED_TEST(HipcubBlockExchangeTests, WarpStripedToBlocked) // Preparing device type* device_input; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_input, + input.size() * sizeof(typename decltype(input)::value_type))); output_type* device_output; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); // Running kernel constexpr unsigned int grid_size = (size / items_per_block); hipLaunchKernelGGL( - HIP_KERNEL_NAME(warp_striped_to_blocked_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output - ); - HIP_CHECK(hipPeekAtLastError()); + HIP_KERNEL_NAME( + warp_striped_to_blocked_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output); + HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < size; i++) { @@ -596,22 +583,19 @@ TYPED_TEST(HipcubBlockExchangeTests, WarpStripedToBlocked) HIP_CHECK(hipFree(device_output)); } -template< - class Type, - class OutputType, - unsigned int ItemsPerBlock, - unsigned int ItemsPerThread -> +template __global__ __launch_bounds__(512) -void scatter_to_blocked_kernel(Type* device_input, OutputType* device_output, unsigned int* device_ranks) +void scatter_to_blocked_kernel(Type* device_input, + OutputType* device_output, + unsigned int* device_ranks) { - constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; - Type input[ItemsPerThread]; - OutputType output[ItemsPerThread]; + Type input[ItemsPerThread]; + OutputType output[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, device_input + block_offset, input); hipcub::LoadDirectBlocked(lid, device_ranks + block_offset, ranks); @@ -643,8 +627,8 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToBlocked) const size_t size = items_per_block * 113; // Generate data - std::vector input(size); - std::vector expected(size); + std::vector input(size); + std::vector expected(size); std::vector output(size, test_utils::convert_to_device(0)); std::vector ranks(size); @@ -653,7 +637,9 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToBlocked) { auto block_ranks = ranks.begin() + bi * items_per_block; std::iota(block_ranks, block_ranks + items_per_block, 0); - std::shuffle(block_ranks, block_ranks + items_per_block, std::mt19937{std::random_device{}()}); + std::shuffle(block_ranks, + block_ranks + items_per_block, + std::mt19937{std::random_device{}()}); } std::vector values(size); std::iota(values.begin(), values.end(), 0); @@ -674,46 +660,46 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToBlocked) // Preparing device type* device_input; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_input, + input.size() * sizeof(typename decltype(input)::value_type))); output_type* device_output; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); unsigned int* device_ranks; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_ranks, ranks.size() * sizeof(typename decltype(ranks)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_ranks, + ranks.size() * sizeof(typename decltype(ranks)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); - HIP_CHECK( - hipMemcpy( - device_ranks, ranks.data(), - ranks.size() * sizeof(unsigned int), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_ranks, + ranks.data(), + ranks.size() * sizeof(unsigned int), + hipMemcpyHostToDevice)); // Running kernel constexpr unsigned int grid_size = (size / items_per_block); hipLaunchKernelGGL( - HIP_KERNEL_NAME(scatter_to_blocked_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output, device_ranks - ); - HIP_CHECK(hipPeekAtLastError()); + HIP_KERNEL_NAME( + scatter_to_blocked_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output, + device_ranks); + HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < size; i++) { @@ -726,22 +712,19 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToBlocked) HIP_CHECK(hipFree(device_ranks)); } -template< - class Type, - class OutputType, - unsigned int ItemsPerBlock, - unsigned int ItemsPerThread -> +template __global__ __launch_bounds__(512) -void scatter_to_striped_kernel(Type* device_input, OutputType* device_output, unsigned int* device_ranks) +void scatter_to_striped_kernel(Type* device_input, + OutputType* device_output, + unsigned int* device_ranks) { - constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; - Type input[ItemsPerThread]; - OutputType output[ItemsPerThread]; + Type input[ItemsPerThread]; + OutputType output[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, device_input + block_offset, input); hipcub::LoadDirectBlocked(lid, device_ranks + block_offset, ranks); @@ -773,8 +756,8 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToStriped) const size_t size = items_per_block * 113; // Generate data - std::vector input(size); - std::vector expected(size); + std::vector input(size); + std::vector expected(size); std::vector output(size, test_utils::convert_to_device(0)); std::vector ranks(size); @@ -783,7 +766,9 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToStriped) { auto block_ranks = ranks.begin() + bi * items_per_block; std::iota(block_ranks, block_ranks + items_per_block, 0); - std::shuffle(block_ranks, block_ranks + items_per_block, std::mt19937{std::random_device{}()}); + std::shuffle(block_ranks, + block_ranks + items_per_block, + std::mt19937{std::random_device{}()}); } std::vector values(size); std::iota(values.begin(), values.end(), 0); @@ -794,10 +779,9 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToStriped) for(size_t ii = 0; ii < items_per_thread; ii++) { const size_t offset = bi * items_per_block; - const size_t i0 = offset + ti * items_per_thread + ii; - const size_t i1 = offset - + ranks[i0] % block_size * items_per_thread - + ranks[i0] / block_size; + const size_t i0 = offset + ti * items_per_thread + ii; + const size_t i1 + = offset + ranks[i0] % block_size * items_per_thread + ranks[i0] / block_size; input[i0] = test_utils::convert_to_device(values[i0]); expected[i1] = test_utils::convert_to_device(values[i0]); } @@ -806,46 +790,46 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToStriped) // Preparing device type* device_input; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_input, + input.size() * sizeof(typename decltype(input)::value_type))); output_type* device_output; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); unsigned int* device_ranks; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_ranks, ranks.size() * sizeof(typename decltype(ranks)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_ranks, + ranks.size() * sizeof(typename decltype(ranks)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); - HIP_CHECK( - hipMemcpy( - device_ranks, ranks.data(), - ranks.size() * sizeof(unsigned int), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_ranks, + ranks.data(), + ranks.size() * sizeof(unsigned int), + hipMemcpyHostToDevice)); // Running kernel constexpr unsigned int grid_size = (size / items_per_block); hipLaunchKernelGGL( - HIP_KERNEL_NAME(scatter_to_striped_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output, device_ranks - ); - HIP_CHECK(hipPeekAtLastError()); + HIP_KERNEL_NAME( + scatter_to_striped_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output, + device_ranks); + HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < size; i++) { @@ -856,5 +840,1139 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToStriped) HIP_CHECK(hipFree(device_input)); HIP_CHECK(hipFree(device_output)); HIP_CHECK(hipFree(device_ranks)); +} + +template +__global__ +void scatter_to_stripped_guarded_kernel(T* device_input, T* device_output, int* device_ranks) +{ + const size_t items_per_block = items_per_thread * block_size; + const size_t offset = (blockIdx.x * items_per_block) + threadIdx.x * items_per_thread; + + T input[items_per_thread]; + T output[items_per_thread]; + int ranks[items_per_thread]; + + for(size_t i = 0; i < items_per_thread; i++) + { + input[i] = device_input[offset + i]; + ranks[i] = device_ranks[offset + i]; + } + hipcub::BlockExchange exchange; + exchange.ScatterToStripedGuarded(input, output, ranks); + + for(size_t i = 0; i < items_per_thread; i++) + { + device_output[offset + i] = (i == items_per_thread - 1) && (threadIdx.x == block_size - 1) + ? static_cast(0) + : output[i]; + } +} + +TYPED_TEST(HipcubBlockExchangeTests, ScatterToStripedGuarded) +{ + using type = typename TestFixture::params::type; + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr size_t grid_size = 113; + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = grid_size * items_per_block; + + type* host_input = new type[size]; + type* host_expected = new type[size]; + int* host_ranks = new int[size]; + + std::iota(host_input, host_input + size, 0); + for(size_t i = 0; i < grid_size; i++) + { + size_t offset = i * items_per_block; + std::iota(host_ranks + offset, host_ranks + offset + items_per_block - 1, 0); + std::shuffle(host_ranks + offset, + host_ranks + offset + items_per_block - 1, + std::mt19937{std::random_device{}()}); + } + for(size_t i = items_per_block - 1; i < size; i += items_per_block) + { + host_ranks[i] = -1; + host_expected[i] = static_cast(0); + } + + for(size_t bi = 0; bi < size / items_per_block; bi++) + { + for(size_t ti = 0; ti < block_size; ti++) + { + for(size_t ii = 0; ii < items_per_thread; ii++) + { + const size_t offset = bi * items_per_block; + const size_t i0 = offset + ti * items_per_thread + ii; + const size_t i1 = offset + host_ranks[i0] % block_size * items_per_thread + + host_ranks[i0] / block_size; + if(i1 >= 0 && i1 < size) + host_expected[i1] = host_input[i0]; + } + } + } + + type* device_input; + type* device_output; + int* device_ranks; + + HIP_CHECK(hipMalloc(&device_input, sizeof(type) * size)); + HIP_CHECK(hipMalloc(&device_output, sizeof(type) * size)); + HIP_CHECK(hipMalloc(&device_ranks, sizeof(int) * size)); + + HIP_CHECK(hipMemcpy(device_input, host_input, sizeof(type) * size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(device_ranks, host_ranks, sizeof(int) * size, hipMemcpyHostToDevice)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(scatter_to_stripped_guarded_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output, + device_ranks); + + type* host_output = new type[size]; + HIP_CHECK(hipMemcpy(host_output, device_output, sizeof(type) * size, hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < size; i++) + ASSERT_EQ(host_output[i], host_expected[i]); + + delete[] host_input; + delete[] host_expected; + delete[] host_ranks; + delete[] host_output; + + HIP_CHECK(hipFree(device_input)); + HIP_CHECK(hipFree(device_output)); + HIP_CHECK(hipFree(device_ranks)); +} + +template +__global__ +void scatter_to_stripped_flagged_kernel(T* device_input, + T* device_output, + int* device_ranks, + bool* device_flags) +{ + const size_t items_per_block = items_per_thread * block_size; + const size_t offset = (blockIdx.x * items_per_block) + threadIdx.x * items_per_thread; + + T input[items_per_thread]; + T output[items_per_thread]; + int ranks[items_per_thread]; + bool flags[items_per_thread]; + + for(size_t i = 0; i < items_per_thread; i++) + { + input[i] = device_input[offset + i]; + ranks[i] = device_ranks[offset + i]; + flags[i] = device_flags[offset + i]; + } + hipcub::BlockExchange exchange; + exchange.ScatterToStripedFlagged(input, output, ranks, flags); + + for(size_t i = 0; i < items_per_thread; i++) + { + device_output[offset + i] = (i == items_per_thread - 1) && (threadIdx.x == block_size - 1) + ? static_cast(0) + : output[i]; + } +} + +TYPED_TEST(HipcubBlockExchangeTests, ScatterToStripedFlagged) +{ + using type = typename TestFixture::params::type; + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr size_t grid_size = 113; + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = grid_size * items_per_block; + + type* host_input = new type[size]; + type* host_expected = new type[size]; + int* host_ranks = new int[size]; + bool* host_flags = new bool[size]; + + std::iota(host_input, host_input + size, 0); + for(size_t i = 0; i < grid_size; i++) + { + size_t offset = i * items_per_block; + std::iota(host_ranks + offset, host_ranks + offset + items_per_block - 1, 0); + std::shuffle(host_ranks + offset, + host_ranks + offset + items_per_block - 1, + std::mt19937{std::random_device{}()}); + } + + for(size_t i = items_per_block - 1; i < size; i += items_per_block) + { + host_ranks[i] = -1; + host_expected[i] = static_cast(0); + } + + for(size_t bi = 0; bi < size / items_per_block; bi++) + { + for(size_t ti = 0; ti < block_size; ti++) + { + for(size_t ii = 0; ii < items_per_thread; ii++) + { + const size_t offset = bi * items_per_block; + const size_t i0 = offset + ti * items_per_thread + ii; + const size_t i1 = offset + host_ranks[i0] % block_size * items_per_thread + + host_ranks[i0] / block_size; + if(i1 >= 0 && i1 < size) + host_expected[i1] = host_input[i0]; + host_flags[i0] + = (ti == block_size - 1) && (ii == items_per_thread - 1) ? false : true; + } + } + } + + for(size_t i = items_per_block - 1; i < size; i += items_per_block) + host_ranks[i] = 5; + + type* device_input; + type* device_output; + int* device_ranks; + bool* device_flags; + + HIP_CHECK(hipMalloc(&device_input, sizeof(type) * size)); + HIP_CHECK(hipMalloc(&device_output, sizeof(type) * size)); + HIP_CHECK(hipMalloc(&device_ranks, sizeof(int) * size)); + HIP_CHECK(hipMalloc(&device_flags, sizeof(bool) * size)); + + HIP_CHECK(hipMemcpy(device_input, host_input, sizeof(type) * size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(device_ranks, host_ranks, sizeof(int) * size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(device_flags, host_flags, sizeof(bool) * size, hipMemcpyHostToDevice)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(scatter_to_stripped_flagged_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output, + device_ranks, + device_flags); + + type* host_output = new type[size]; + HIP_CHECK(hipMemcpy(host_output, device_output, sizeof(type) * size, hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < size; i++) + ASSERT_EQ(host_output[i], host_expected[i]); + + delete[] host_input; + delete[] host_expected; + delete[] host_ranks; + delete[] host_output; + delete[] host_flags; + + HIP_CHECK(hipFree(device_input)); + HIP_CHECK(hipFree(device_output)); + HIP_CHECK(hipFree(device_ranks)); + HIP_CHECK(hipFree(device_flags)); +} +template +__global__ +__launch_bounds__(512) +void striped_to_blocked_one_param_kernel(Type* device_input) +{ + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + + Type input[ItemsPerThread]; + hipcub::LoadDirectBlocked(lid, device_input + block_offset, input); + + hipcub::BlockExchange exchange; + exchange.StripedToBlocked(input); + + hipcub::StoreDirectBlocked(lid, device_input + block_offset, input); +} + +TYPED_TEST(HipcubBlockExchangeTests, StripedToBlockedOneParam) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using type = typename TestFixture::params::type; + using output_type = typename TestFixture::params::output_type; + + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr size_t items_per_block = block_size * items_per_thread; + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t size = items_per_block * 113; + // Generate data + type* input = new type[size]; + type* expected = new type[size]; + + // Calculate input and expected results on host + type* values = new type[size]; + std::iota(values, values + size, 0); + + for(size_t bi = 0; bi < size / items_per_block; bi++) + { + for(size_t ti = 0; ti < block_size; ti++) + { + for(size_t ii = 0; ii < items_per_thread; ii++) + { + const size_t offset = bi * items_per_block; + const size_t i0 = offset + ti * items_per_thread + ii; + const size_t i1 = offset + ii * block_size + ti; + input[i0] = values[i1]; + expected[i1] = values[i1]; + } + } + } + + // Preparing device + type* device_input; + HIP_CHECK(hipMalloc(&device_input, sizeof(type) * size)); + + HIP_CHECK(hipMemcpy(device_input, input, sizeof(type) * size, hipMemcpyHostToDevice)); + + // Running kernel + constexpr unsigned int grid_size = (size / items_per_block); + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + striped_to_blocked_one_param_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Reading results + HIP_CHECK(hipMemcpy(input, device_input, sizeof(type) * size, hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < size; i++) + { + ASSERT_EQ(input[i], expected[i]); + } + + HIP_CHECK(hipFree(device_input)); + delete[] input; + delete[] expected; + delete[] values; +} + +template +__global__ +__launch_bounds__(512) +void blocked_to_striped_one_param_kernel(Type* device_input) +{ + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + + Type input[ItemsPerThread]; + hipcub::LoadDirectBlocked(lid, device_input + block_offset, input); + + hipcub::BlockExchange exchange; + exchange.BlockedToStriped(input); + + hipcub::StoreDirectBlocked(lid, device_input + block_offset, input); +} + +TYPED_TEST(HipcubBlockExchangeTests, BlockedToStripedOneParam) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using type = typename TestFixture::params::type; + using output_type = typename TestFixture::params::output_type; + + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr size_t items_per_block = block_size * items_per_thread; + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t size = items_per_block * 113; + // Generate data + type* input = new type[size]; + type* expected = new type[size]; + + // Calculate input and expected results on host + type* values = new type[size]; + std::iota(values, values + size, 0); + + for(size_t bi = 0; bi < size / items_per_block; bi++) + { + for(size_t ti = 0; ti < block_size; ti++) + { + for(size_t ii = 0; ii < items_per_thread; ii++) + { + const size_t offset = bi * items_per_block; + const size_t i0 = offset + ti * items_per_thread + ii; + const size_t i1 = offset + ii * block_size + ti; + input[i1] = values[i1]; + expected[i0] = values[i1]; + } + } + } + + // Preparing device + type* device_input; + HIP_CHECK(hipMalloc(&device_input, sizeof(type) * size)); + + HIP_CHECK(hipMemcpy(device_input, input, sizeof(type) * size, hipMemcpyHostToDevice)); + + // Running kernel + constexpr unsigned int grid_size = (size / items_per_block); + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + blocked_to_striped_one_param_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Reading results + HIP_CHECK(hipMemcpy(input, device_input, sizeof(type) * size, hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < size; i++) + { + ASSERT_EQ(input[i], expected[i]); + } + + HIP_CHECK(hipFree(device_input)); + delete[] input; + delete[] expected; + delete[] values; } + +template +__global__ +__launch_bounds__(512) +void warp_striped_to_blocked_one_param_kernel(Type* device_input) +{ + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + + Type input[ItemsPerThread]; + hipcub::LoadDirectBlocked(lid, device_input + block_offset, input); + + hipcub::BlockExchange exchange; + exchange.WarpStripedToBlocked(input); + + hipcub::StoreDirectBlocked(lid, device_input + block_offset, input); +} + +TYPED_TEST(HipcubBlockExchangeTests, WarpStripedToBlockedOneParam) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using type = typename TestFixture::params::type; + + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr size_t items_per_block = block_size * items_per_thread; + + const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; + // Given block size not supported + bool is_block_size_unsupported = block_size > test_utils::get_max_block_size(); +#ifdef HIPCUB_CUB_API + // CUB does not support exchanges to/from warp-striped arrangements + // for incomplete blocks (not divisible by warp size) + // Workaround for nvcc warning: "dynamic initialization in unreachable code" + // (not a simple if with compile-time expression) + is_block_size_unsupported |= block_size % current_device_warp_size != 0; +#endif + if(is_block_size_unsupported) + { + printf("Unsupported test block size: %zu. Skipping test\n", block_size); + GTEST_SKIP(); + } + + const size_t size = items_per_block * 113; + // Generate data + type* input = new type[size]; + type* expected = new type[size]; + + constexpr size_t warp_size_32 + = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_32)); + constexpr size_t warp_size_64 + = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_64)); + constexpr size_t warps_no_32 = (block_size + warp_size_32 - 1) / warp_size_32; + constexpr size_t warps_no_64 = (block_size + warp_size_64 - 1) / warp_size_64; + constexpr size_t items_per_warp_32 = warp_size_32 * items_per_thread; + constexpr size_t items_per_warp_64 = warp_size_64 * items_per_thread; + + // Calculate input and expected results on host + type* values = new type[size]; + std::iota(values, values + size, 0); + + const size_t warps_no + = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warps_no_32 : warps_no_64; + const size_t warp_size + = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warp_size_32 : warp_size_64; + const size_t items_per_warp + = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? items_per_warp_32 : items_per_warp_64; + + for(size_t bi = 0; bi < size / items_per_block; bi++) + { + for(size_t wi = 0; wi < warps_no; wi++) + { + const size_t current_warp_size + = wi == warps_no - 1 + ? (block_size % warp_size != 0 ? block_size % warp_size : warp_size) + : warp_size; + for(size_t li = 0; li < current_warp_size; li++) + { + for(size_t ii = 0; ii < items_per_thread; ii++) + { + const size_t offset = bi * items_per_block + wi * items_per_warp; + const size_t i0 = offset + li * items_per_thread + ii; + const size_t i1 = offset + ii * current_warp_size + li; + input[i0] = values[i1]; + expected[i1] = values[i1]; + } + } + } + } + + // Preparing device + type* device_input; + HIP_CHECK(hipMalloc(&device_input, sizeof(type) * size)); + + HIP_CHECK(hipMemcpy(device_input, input, sizeof(type) * size, hipMemcpyHostToDevice)); + + // Running kernel + constexpr unsigned int grid_size = (size / items_per_block); + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + warp_striped_to_blocked_one_param_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Reading results + HIP_CHECK(hipMemcpy(input, device_input, sizeof(type) * size, hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < size; i++) + ASSERT_EQ(input[i], expected[i]); + + HIP_CHECK(hipFree(device_input)); + delete[] input; + delete[] expected; + delete[] values; +} + +template +__global__ +__launch_bounds__(512) +void blocked_to_warp_striped_one_param_kernel(Type* device_input) +{ + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + + Type input[ItemsPerThread]; + hipcub::LoadDirectBlocked(lid, device_input + block_offset, input); + + hipcub::BlockExchange exchange; + exchange.BlockedToWarpStriped(input); + + hipcub::StoreDirectBlocked(lid, device_input + block_offset, input); +} + +TYPED_TEST(HipcubBlockExchangeTests, BlockedToWarpStripedOneParam) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using type = typename TestFixture::params::type; + + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr size_t items_per_block = block_size * items_per_thread; + + const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; + // Given block size not supported + bool is_block_size_unsupported = block_size > test_utils::get_max_block_size(); +#ifdef HIPCUB_CUB_API + // CUB does not support exchanges to/from warp-striped arrangements + // for incomplete blocks (not divisible by warp size) + // Workaround for nvcc warning: "dynamic initialization in unreachable code" + // (not a simple if with compile-time expression) + is_block_size_unsupported |= block_size % current_device_warp_size != 0; +#endif + if(is_block_size_unsupported) + { + printf("Unsupported test block size: %zu. Skipping test\n", block_size); + GTEST_SKIP(); + } + + const size_t size = items_per_block * 113; + // Generate data + std::vector input(size); + std::vector expected(size); + + constexpr size_t warp_size_32 + = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_32)); + constexpr size_t warp_size_64 + = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_64)); + constexpr size_t warps_no_32 = (block_size + warp_size_32 - 1) / warp_size_32; + constexpr size_t warps_no_64 = (block_size + warp_size_64 - 1) / warp_size_64; + constexpr size_t items_per_warp_32 = warp_size_32 * items_per_thread; + constexpr size_t items_per_warp_64 = warp_size_64 * items_per_thread; + + // Calculate input and expected results on host + std::vector values(size); + std::iota(values.begin(), values.end(), 0); + + const size_t warps_no + = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warps_no_32 : warps_no_64; + const size_t warp_size + = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warp_size_32 : warp_size_64; + const size_t items_per_warp + = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? items_per_warp_32 : items_per_warp_64; + + for(size_t bi = 0; bi < size / items_per_block; bi++) + { + for(size_t wi = 0; wi < warps_no; wi++) + { + const size_t current_warp_size + = wi == warps_no - 1 + ? (block_size % warp_size != 0 ? block_size % warp_size : warp_size) + : warp_size; + for(size_t li = 0; li < current_warp_size; li++) + { + for(size_t ii = 0; ii < items_per_thread; ii++) + { + const size_t offset = bi * items_per_block + wi * items_per_warp; + const size_t i0 = offset + li * items_per_thread + ii; + const size_t i1 = offset + ii * current_warp_size + li; + input[i1] = test_utils::convert_to_device(values[i1]); + expected[i0] = test_utils::convert_to_device(values[i1]); + } + } + } + } + + // Preparing device + type* device_input; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_input, + input.size() * sizeof(typename decltype(input)::value_type))); + + HIP_CHECK( + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); + + // Running kernel + constexpr unsigned int grid_size = (size / items_per_block); + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + blocked_to_warp_striped_one_param_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Reading results + HIP_CHECK(hipMemcpy(input.data(), + device_input, + input.size() * sizeof(typename decltype(input)::value_type), + hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < size; i++) + { + ASSERT_EQ(test_utils::convert_to_native(input[i]), + test_utils::convert_to_native(expected[i])); + } + + HIP_CHECK(hipFree(device_input)); +} + +template +__global__ +__launch_bounds__(512) +void scatter_to_blocked_no_output_param_kernel(Type* device_input, unsigned int* device_ranks) +{ + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + + Type input[ItemsPerThread]; + unsigned int ranks[ItemsPerThread]; + hipcub::LoadDirectBlocked(lid, device_input + block_offset, input); + hipcub::LoadDirectBlocked(lid, device_ranks + block_offset, ranks); + + hipcub::BlockExchange exchange; + exchange.ScatterToBlocked(input, ranks); + + hipcub::StoreDirectBlocked(lid, device_input + block_offset, input); +} + +TYPED_TEST(HipcubBlockExchangeTests, ScatterToBlockedNoOutputParam) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using type = typename TestFixture::params::type; + + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr size_t items_per_block = block_size * items_per_thread; + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t size = items_per_block * 113; + // Generate data + std::vector input(size); + std::vector expected(size); + std::vector ranks(size); + + // Calculate input and expected results on host + for(size_t bi = 0; bi < size / items_per_block; bi++) + { + auto block_ranks = ranks.begin() + bi * items_per_block; + std::iota(block_ranks, block_ranks + items_per_block, 0); + std::shuffle(block_ranks, + block_ranks + items_per_block, + std::mt19937{std::random_device{}()}); + } + std::vector values(size); + std::iota(values.begin(), values.end(), 0); + for(size_t bi = 0; bi < size / items_per_block; bi++) + { + for(size_t ti = 0; ti < block_size; ti++) + { + for(size_t ii = 0; ii < items_per_thread; ii++) + { + const size_t offset = bi * items_per_block; + const size_t i0 = offset + ti * items_per_thread + ii; + const size_t i1 = offset + ranks[i0]; + input[i0] = test_utils::convert_to_device(values[i0]); + expected[i1] = test_utils::convert_to_device(values[i0]); + } + } + } + + // Preparing device + type* device_input; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_input, + input.size() * sizeof(typename decltype(input)::value_type))); + unsigned int* device_ranks; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_ranks, + ranks.size() * sizeof(typename decltype(ranks)::value_type))); + + HIP_CHECK( + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); + + HIP_CHECK(hipMemcpy(device_ranks, + ranks.data(), + ranks.size() * sizeof(unsigned int), + hipMemcpyHostToDevice)); + + // Running kernel + constexpr unsigned int grid_size = (size / items_per_block); + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + scatter_to_blocked_no_output_param_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_ranks); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Reading results + HIP_CHECK(hipMemcpy(input.data(), + device_input, + input.size() * sizeof(typename decltype(input)::value_type), + hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < size; i++) + { + ASSERT_EQ(test_utils::convert_to_native(input[i]), + test_utils::convert_to_native(expected[i])); + } + + HIP_CHECK(hipFree(device_input)); + HIP_CHECK(hipFree(device_ranks)); +} + +template +__global__ +__launch_bounds__(512) +void scatter_to_striped_no_output_param_kernel(Type* device_input, unsigned int* device_ranks) +{ + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + + Type input[ItemsPerThread]; + unsigned int ranks[ItemsPerThread]; + hipcub::LoadDirectBlocked(lid, device_input + block_offset, input); + hipcub::LoadDirectBlocked(lid, device_ranks + block_offset, ranks); + + hipcub::BlockExchange exchange; + exchange.ScatterToStriped(input, ranks); + + hipcub::StoreDirectBlocked(lid, device_input + block_offset, input); +} + +TYPED_TEST(HipcubBlockExchangeTests, ScatterToStripedNoOutputParam) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using type = typename TestFixture::params::type; + + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr size_t items_per_block = block_size * items_per_thread; + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t size = items_per_block * 113; + // Generate data + std::vector input(size); + std::vector expected(size); + std::vector ranks(size); + + // Calculate input and expected results on host + for(size_t bi = 0; bi < size / items_per_block; bi++) + { + auto block_ranks = ranks.begin() + bi * items_per_block; + std::iota(block_ranks, block_ranks + items_per_block, 0); + std::shuffle(block_ranks, + block_ranks + items_per_block, + std::mt19937{std::random_device{}()}); + } + std::vector values(size); + std::iota(values.begin(), values.end(), 0); + for(size_t bi = 0; bi < size / items_per_block; bi++) + { + for(size_t ti = 0; ti < block_size; ti++) + { + for(size_t ii = 0; ii < items_per_thread; ii++) + { + const size_t offset = bi * items_per_block; + const size_t i0 = offset + ti * items_per_thread + ii; + const size_t i1 + = offset + ranks[i0] % block_size * items_per_thread + ranks[i0] / block_size; + input[i0] = test_utils::convert_to_device(values[i0]); + expected[i1] = test_utils::convert_to_device(values[i0]); + } + } + } + + // Preparing device + type* device_input; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_input, + input.size() * sizeof(typename decltype(input)::value_type))); + unsigned int* device_ranks; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_ranks, + ranks.size() * sizeof(typename decltype(ranks)::value_type))); + + HIP_CHECK( + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); + + HIP_CHECK(hipMemcpy(device_ranks, + ranks.data(), + ranks.size() * sizeof(unsigned int), + hipMemcpyHostToDevice)); + + // Running kernel + constexpr unsigned int grid_size = (size / items_per_block); + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + scatter_to_striped_no_output_param_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_ranks); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Reading results + HIP_CHECK(hipMemcpy(input.data(), + device_input, + input.size() * sizeof(typename decltype(input)::value_type), + hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < size; i++) + { + ASSERT_EQ(test_utils::convert_to_native(input[i]), + test_utils::convert_to_native(expected[i])); + } + + HIP_CHECK(hipFree(device_input)); + HIP_CHECK(hipFree(device_ranks)); +} + +template +__global__ +void scatter_to_stripped_guarded_no_output_param_kernel(T* device_input, int* device_ranks) +{ + const size_t items_per_block = items_per_thread * block_size; + const size_t offset = (blockIdx.x * items_per_block) + threadIdx.x * items_per_thread; + + T input[items_per_thread]; + int ranks[items_per_thread]; + + for(size_t i = 0; i < items_per_thread; i++) + { + input[i] = device_input[offset + i]; + ranks[i] = device_ranks[offset + i]; + } + hipcub::BlockExchange exchange; + exchange.ScatterToStripedGuarded(input, ranks); + + for(size_t i = 0; i < items_per_thread; i++) + { + device_input[offset + i] = (i == items_per_thread - 1) && (threadIdx.x == block_size - 1) + ? static_cast(0) + : input[i]; + } +} + +TYPED_TEST(HipcubBlockExchangeTests, ScatterToStripedGuardedNoOutputParam) +{ + using type = typename TestFixture::params::type; + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr size_t grid_size = 113; + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = grid_size * items_per_block; + + type* host_input = new type[size]; + type* host_expected = new type[size]; + int* host_ranks = new int[size]; + + std::iota(host_input, host_input + size, 0); + for(size_t i = 0; i < grid_size; i++) + { + size_t offset = i * items_per_block; + std::iota(host_ranks + offset, host_ranks + offset + items_per_block - 1, 0); + std::shuffle(host_ranks + offset, + host_ranks + offset + items_per_block - 1, + std::mt19937{std::random_device{}()}); + } + for(size_t i = items_per_block - 1; i < size; i += items_per_block) + { + host_ranks[i] = -1; + host_expected[i] = static_cast(0); + } + + for(size_t bi = 0; bi < size / items_per_block; bi++) + { + for(size_t ti = 0; ti < block_size; ti++) + { + for(size_t ii = 0; ii < items_per_thread; ii++) + { + const size_t offset = bi * items_per_block; + const size_t i0 = offset + ti * items_per_thread + ii; + const size_t i1 = offset + host_ranks[i0] % block_size * items_per_thread + + host_ranks[i0] / block_size; + if(i1 >= 0 && i1 < size) + host_expected[i1] = host_input[i0]; + } + } + } + + type* device_input; + int* device_ranks; + + HIP_CHECK(hipMalloc(&device_input, sizeof(type) * size)); + HIP_CHECK(hipMalloc(&device_ranks, sizeof(int) * size)); + + HIP_CHECK(hipMemcpy(device_input, host_input, sizeof(type) * size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(device_ranks, host_ranks, sizeof(int) * size, hipMemcpyHostToDevice)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + scatter_to_stripped_guarded_no_output_param_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_ranks); + + type* host_output = new type[size]; + HIP_CHECK(hipMemcpy(host_input, device_input, sizeof(type) * size, hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < size; i++) + ASSERT_EQ(host_input[i], host_expected[i]); + + delete[] host_input; + delete[] host_expected; + delete[] host_ranks; + + HIP_CHECK(hipFree(device_input)); + HIP_CHECK(hipFree(device_ranks)); +} + +template +__global__ +void scatter_to_stripped_flagged_no_output_param_kernel(T* device_input, + int* device_ranks, + bool* device_flags) +{ + const size_t items_per_block = items_per_thread * block_size; + const size_t offset = (blockIdx.x * items_per_block) + threadIdx.x * items_per_thread; + + T input[items_per_thread]; + int ranks[items_per_thread]; + bool flags[items_per_thread]; + + for(size_t i = 0; i < items_per_thread; i++) + { + input[i] = device_input[offset + i]; + ranks[i] = device_ranks[offset + i]; + flags[i] = device_flags[offset + i]; + } + hipcub::BlockExchange exchange; + exchange.ScatterToStripedFlagged(input, ranks, flags); + + for(size_t i = 0; i < items_per_thread; i++) + { + device_input[offset + i] = (i == items_per_thread - 1) && (threadIdx.x == block_size - 1) + ? static_cast(0) + : input[i]; + } +} + +TYPED_TEST(HipcubBlockExchangeTests, ScatterToStripedFlaggedNoOutputParam) +{ + using type = typename TestFixture::params::type; + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr size_t grid_size = 113; + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = grid_size * items_per_block; + + type* host_input = new type[size]; + type* host_expected = new type[size]; + int* host_ranks = new int[size]; + bool* host_flags = new bool[size]; + + std::iota(host_input, host_input + size, 0); + for(size_t i = 0; i < grid_size; i++) + { + size_t offset = i * items_per_block; + std::iota(host_ranks + offset, host_ranks + offset + items_per_block - 1, 0); + std::shuffle(host_ranks + offset, + host_ranks + offset + items_per_block - 1, + std::mt19937{std::random_device{}()}); + } + + for(size_t i = items_per_block - 1; i < size; i += items_per_block) + { + host_ranks[i] = -1; + host_expected[i] = static_cast(0); + } + + for(size_t bi = 0; bi < size / items_per_block; bi++) + { + for(size_t ti = 0; ti < block_size; ti++) + { + for(size_t ii = 0; ii < items_per_thread; ii++) + { + const size_t offset = bi * items_per_block; + const size_t i0 = offset + ti * items_per_thread + ii; + const size_t i1 = offset + host_ranks[i0] % block_size * items_per_thread + + host_ranks[i0] / block_size; + if(i1 >= 0 && i1 < size) + host_expected[i1] = host_input[i0]; + host_flags[i0] + = (ti == block_size - 1) && (ii == items_per_thread - 1) ? false : true; + } + } + } + + for(size_t i = items_per_block - 1; i < size; i += items_per_block) + host_ranks[i] = 5; + + type* device_input; + int* device_ranks; + bool* device_flags; + + HIP_CHECK(hipMalloc(&device_input, sizeof(type) * size)); + HIP_CHECK(hipMalloc(&device_ranks, sizeof(int) * size)); + HIP_CHECK(hipMalloc(&device_flags, sizeof(bool) * size)); + + HIP_CHECK(hipMemcpy(device_input, host_input, sizeof(type) * size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(device_ranks, host_ranks, sizeof(int) * size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(device_flags, host_flags, sizeof(bool) * size, hipMemcpyHostToDevice)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + scatter_to_stripped_flagged_no_output_param_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_ranks, + device_flags); + + HIP_CHECK(hipMemcpy(host_input, device_input, sizeof(type) * size, hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < size; i++) + ASSERT_EQ(host_input[i], host_expected[i]); + + delete[] host_input; + delete[] host_expected; + delete[] host_ranks; + delete[] host_flags; + + HIP_CHECK(hipFree(device_input)); + HIP_CHECK(hipFree(device_ranks)); + HIP_CHECK(hipFree(device_flags)); +} \ No newline at end of file diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp index e3f91c10e10..598aea81d13 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp @@ -23,32 +23,34 @@ #include "common_test_header.hpp" // hipcub API -#include "hipcub/block/block_merge_sort.hpp" #include "hipcub/block/block_load.hpp" +#include "hipcub/block/block_merge_sort.hpp" #include "hipcub/block/block_store.hpp" +#include +#include +#define ull unsigned long long -template< - class Key, - class Value, - unsigned int BlockSize, - unsigned int ItemsPerThread, - class CompareFunction = test_utils::less, - bool ToStriped = false -> +template struct params { - using key_type = Key; - using value_type = Value; - static constexpr unsigned int block_size = BlockSize; + using key_type = Key; + using value_type = Value; + static constexpr unsigned int block_size = BlockSize; static constexpr unsigned int items_per_thread = ItemsPerThread; - using compare_function = CompareFunction; - static constexpr bool to_striped = ToStriped; + using compare_function = CompareFunction; + static constexpr bool to_striped = ToStriped; }; template -class HipcubBlockMergeSort : public ::testing::Test { +class HipcubBlockMergeSort : public ::testing::Test +{ public: using params = Params; }; @@ -75,21 +77,14 @@ using Params = ::testing::Types< TYPED_TEST_SUITE(HipcubBlockMergeSort, Params); -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class key_type, - typename CompareOp -> +template __global__ __launch_bounds__(BlockSize) -void sort_key_kernel( - key_type* device_keys_output, - CompareOp compare_op) +void sort_key_kernel(key_type* device_keys_output, CompareOp compare_op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * items_per_block; key_type keys[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, device_keys_output + block_offset, keys); @@ -106,23 +101,24 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeys) SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); HIP_CHECK(hipSetDevice(device_id)); - using key_type = typename TestFixture::params::key_type; - constexpr size_t block_size = TestFixture::params::block_size; + using key_type = typename TestFixture::params::key_type; + constexpr size_t block_size = TestFixture::params::block_size; constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - using compare_function = typename TestFixture::params::compare_function; - constexpr size_t items_per_block = block_size * items_per_thread; + using compare_function = typename TestFixture::params::compare_function; + constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { GTEST_SKIP(); } - const size_t size = items_per_block * 1134; + const size_t size = items_per_block * 1134; const size_t grid_size = size / items_per_block; - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data @@ -136,40 +132,35 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeys) std::vector expected(keys_output); for(size_t i = 0; i < size / items_per_block; i++) { - std::stable_sort( - expected.begin() + (i * items_per_block), - expected.begin() + ((i + 1) * items_per_block), - compare_function() - ); + std::stable_sort(expected.begin() + (i * items_per_block), + expected.begin() + ((i + 1) * items_per_block), + compare_function()); } // Preparing device key_type* device_keys_output; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output, keys_output.size() * sizeof(key_type))); + HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output, + keys_output.size() * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - device_keys_output, keys_output.data(), - keys_output.size() * sizeof(typename decltype(keys_output)::value_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_keys_output, + keys_output.data(), + keys_output.size() * sizeof(typename decltype(keys_output)::value_type), + hipMemcpyHostToDevice)); // Running kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_key_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_keys_output, compare_function() - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_key_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_keys_output, + compare_function()); // Getting results to host - HIP_CHECK( - hipMemcpy( - keys_output.data(), device_keys_output, - keys_output.size() * sizeof(typename decltype(keys_output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(keys_output.data(), + device_keys_output, + keys_output.size() * sizeof(typename decltype(keys_output)::value_type), + hipMemcpyDeviceToHost)); // Verifying results for(size_t i = 0; i < size; i++) @@ -181,25 +172,146 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeys) HIP_CHECK(hipFree(device_keys_output)); } } -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class key_type, - class value_type, - class CompareOp - > + +template +__global__ +void sort_key_with_valid_items_kernel(T* device_input, + CompareOp compare_op, + int valid_items, + T default_val) +{ + constexpr size_t items_per_block = items_per_thread * block_size; + const size_t offset = (blockIdx.x * items_per_block) + (threadIdx.x * items_per_thread); + + T input[items_per_thread]; + + for(size_t i = 0; i < items_per_thread; i++) + input[i] = device_input[offset + i]; + + hipcub::BlockMergeSort bsort; + + bsort.Sort(input, compare_op, valid_items, default_val); + + for(size_t i = 0; i < items_per_thread; i++) + device_input[offset + i] = input[i]; +} + +TYPED_TEST(HipcubBlockMergeSort, SortKeysWithValidItems) +{ + + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + using compare_function = typename TestFixture::params::compare_function; + using T = typename TestFixture::params::key_type; + constexpr int items_per_block = items_per_thread * block_size; + constexpr int grid_size = 113; + + auto compare_op = compare_function(); + + if(block_size > test_utils::get_max_block_size()) + { + GTEST_SKIP(); + } + + constexpr size_t size = grid_size * items_per_block; + + // minus|plus two to prevent overflow weirdness + const T mini = std::numeric_limits::min() + static_cast(2); + const T maxi = std::numeric_limits::max() - static_cast(2); + + const T default_val = static_cast(compare_op(mini, maxi) ? maxi : mini); + const int valid_items_arr[8] = {items_per_block / 2, + items_per_block / 3, + items_per_block / 4, + items_per_block / 5, + items_per_block - 10, + items_per_block - 5, + items_per_block - 2, + items_per_block - 1}; + + T* host_keys_input = new T[size]; + T* host_keys_output = new T[size]; + T* host_keys_expected = new T[size]; + + T* device_keys_input; + HIP_CHECK(hipMalloc(&device_keys_input, sizeof(T) * size)); + + for(size_t it = 0; it < 8; it++) + { + int valid_items = valid_items_arr[it]; + + // need to cast the 0 because of __half and bfloat16 types + T elem = static_cast(0); + for(size_t i = 0; i < size; i++) + { + if(elem > maxi) + elem = static_cast(0); + host_keys_input[i] = host_keys_expected[i] = elem++; + } + + // filling in the default_val + for(size_t bI = 0; bI < grid_size; bI++) + { + size_t offset = (bI * items_per_block); + for(size_t i = valid_items; i < items_per_block; i++) + host_keys_expected[offset + i] = default_val; + } + + // sorting the values + for(size_t bI = 0; bI < grid_size; bI++) + { + size_t offset = (bI * items_per_block); + std::sort(host_keys_expected + offset, + host_keys_expected + offset + items_per_block, + compare_op); + } + + HIP_CHECK( + hipMemcpy(device_keys_input, host_keys_input, sizeof(T) * size, hipMemcpyHostToDevice)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(sort_key_with_valid_items_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_keys_input, + compare_op, + valid_items, + default_val); + + HIP_CHECK(hipMemcpy(host_keys_output, + device_keys_input, + sizeof(T) * size, + hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < size; i++) + ASSERT_EQ(host_keys_expected[i], host_keys_output[i]); + } + + delete[] host_keys_input; + delete[] host_keys_output; + delete[] host_keys_expected; + + HIP_CHECK(hipFree(device_keys_input)); +} + +template __global__ __launch_bounds__(BlockSize) - void sort_key_value_kernel( - key_type* device_keys_output, - value_type* device_values_output, - CompareOp compare_op) +void sort_key_value_kernel(key_type* device_keys_output, + value_type* device_values_output, + CompareOp compare_op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * items_per_block; - key_type keys[ItemsPerThread]; + key_type keys[ItemsPerThread]; value_type values[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, device_keys_output + block_offset, keys); hipcub::LoadDirectBlocked(lid, device_values_output + block_offset, values); @@ -217,24 +329,25 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeysValues) SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); HIP_CHECK(hipSetDevice(device_id)); - using key_type = typename TestFixture::params::key_type; - using value_type = typename TestFixture::params::value_type; - constexpr size_t block_size = TestFixture::params::block_size; + using key_type = typename TestFixture::params::key_type; + using value_type = typename TestFixture::params::value_type; + constexpr size_t block_size = TestFixture::params::block_size; constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - using compare_function = typename TestFixture::params::compare_function; - constexpr size_t items_per_block = block_size * items_per_thread; + using compare_function = typename TestFixture::params::compare_function; + constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { return; } - const size_t size = items_per_block * 1134; + const size_t size = items_per_block * 1134; const size_t grid_size = size / items_per_block; - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data @@ -245,11 +358,11 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeysValues) seed_value); std::vector values_output; - values_output = - test_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value + seed_value_addition); + values_output + = test_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max(), + seed_value + seed_value_addition); using key_value = std::pair; @@ -265,56 +378,292 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeysValues) { std::stable_sort(expected.begin() + (i * items_per_block), expected.begin() + ((i + 1) * items_per_block), - [compare_op](const key_value & a, const key_value & b) - { - return compare_op(a.first, b.first); - }); + [compare_op](const key_value& a, const key_value& b) + { return compare_op(a.first, b.first); }); } key_type* device_keys_output; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output, keys_output.size() * sizeof(key_type))); + HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output, + keys_output.size() * sizeof(key_type))); value_type* device_values_output; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_values_output, values_output.size() * sizeof(value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper(&device_values_output, + values_output.size() * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - device_keys_output, keys_output.data(), - keys_output.size() * sizeof(typename decltype(keys_output)::value_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_keys_output, + keys_output.data(), + keys_output.size() * sizeof(typename decltype(keys_output)::value_type), + hipMemcpyHostToDevice)); HIP_CHECK( - hipMemcpy( - device_values_output, values_output.data(), - values_output.size() * sizeof(typename decltype(values_output)::value_type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_values_output, + values_output.data(), + values_output.size() * sizeof(typename decltype(values_output)::value_type), + hipMemcpyHostToDevice)); // Running kernel hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_key_value_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_keys_output, device_values_output, compare_op - ); + HIP_KERNEL_NAME( + sort_key_value_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_keys_output, + device_values_output, + compare_op); // Getting results to host + HIP_CHECK(hipMemcpy(keys_output.data(), + device_keys_output, + keys_output.size() * sizeof(typename decltype(keys_output)::value_type), + hipMemcpyDeviceToHost)); + HIP_CHECK( - hipMemcpy( - keys_output.data(), device_keys_output, - keys_output.size() * sizeof(typename decltype(keys_output)::value_type), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(values_output.data(), + device_values_output, + values_output.size() * sizeof(typename decltype(values_output)::value_type), + hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < size; i++) + { + ASSERT_EQ(test_utils::convert_to_native(keys_output[i]), + test_utils::convert_to_native(expected[i].first)); + ASSERT_EQ(test_utils::convert_to_native(values_output[i]), + test_utils::convert_to_native(expected[i].second)); + } + + HIP_CHECK(hipFree(device_keys_output)); + HIP_CHECK(hipFree(device_values_output)); + } +} + +template +__global__ +void stable_sort_kernel(T* device_input, CompareOp compare_op) +{ + constexpr size_t items_per_block = items_per_thread * block_size; + const size_t offset = (blockIdx.x * items_per_block) + (threadIdx.x * items_per_thread); + + T input[items_per_thread]; + + for(size_t i = 0; i < items_per_thread; i++) + input[i] = device_input[offset + i]; + + hipcub::BlockMergeSort bsort; + + bsort.StableSort(input, + [&](const T& lhs, const T& rhs) { return compare_op(lhs.elem, rhs.elem); }); + + for(size_t i = 0; i < items_per_thread; i++) + device_input[offset + i] = input[i]; +} + +TYPED_TEST(HipcubBlockMergeSort, StableSort) +{ + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + using T = typename TestFixture::params::key_type; + using compare_function = typename TestFixture::params::compare_function; + constexpr size_t items_per_block = items_per_thread * block_size; + constexpr size_t grid_size = 113; + const size_t size = grid_size * items_per_block; + + auto compare_op = compare_function(); + if(block_size > test_utils::get_max_block_size()) + { + GTEST_SKIP(); + } + struct custom_type + { + T elem; + size_t id; + }; + + custom_type* host_input = new custom_type[size]; + custom_type* host_expected = new custom_type[size]; + + //populate the inputs + for(size_t i = 0; i < size; i++) + { + if(i % 2) + host_expected[i] = host_input[i] = {static_cast(i - 1), i}; + else + host_expected[i] = host_input[i] = {static_cast(i), i}; + } + + // get the expected result + for(size_t bI = 0; bI < grid_size; bI++) + { + size_t offset = (bI * items_per_block); + std::stable_sort(host_expected + offset, + host_expected + offset + items_per_block, + [&](const custom_type& lhs, const custom_type& rhs) + { return compare_op(lhs.elem, rhs.elem); }); + } + custom_type* device_input; + + HIP_CHECK(hipMalloc(&device_input, sizeof(custom_type) * size)); + HIP_CHECK( + hipMemcpy(device_input, host_input, sizeof(custom_type) * size, hipMemcpyHostToDevice)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(stable_sort_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + compare_op); + + HIP_CHECK( + hipMemcpy(host_input, device_input, sizeof(custom_type) * size, hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < size; i++) + { + ASSERT_EQ(host_input[i].elem, host_expected[i].elem); + ASSERT_EQ(host_input[i].id, host_expected[i].id); + } + + delete[] host_input; + delete[] host_expected; + + HIP_CHECK(hipFree(device_input)); +} + +template +__global__ + __launch_bounds__(BlockSize) +void stable_sort_key_value_kernel(key_type* device_keys_output, + value_type* device_values_output, + CompareOp compare_op) +{ + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * items_per_block; + + key_type keys[ItemsPerThread]; + value_type values[ItemsPerThread]; + hipcub::LoadDirectBlocked(lid, device_keys_output + block_offset, keys); + hipcub::LoadDirectBlocked(lid, device_values_output + block_offset, values); + + hipcub::BlockMergeSort bsort; + bsort.StableSort(keys, values, compare_op); + + hipcub::StoreDirectBlocked(lid, device_keys_output + block_offset, keys); + hipcub::StoreDirectBlocked(lid, device_values_output + block_offset, values); +} + +TYPED_TEST(HipcubBlockMergeSort, StableSortKeysValues) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using key_type = typename TestFixture::params::key_type; + using value_type = typename TestFixture::params::value_type; + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + using compare_function = typename TestFixture::params::compare_function; + constexpr size_t items_per_block = block_size * items_per_thread; + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t size = items_per_block * 1134; + const size_t grid_size = size / items_per_block; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + std::vector keys_output; + keys_output = test_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max(), + seed_value); + + std::vector values_output; + values_output + = test_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max(), + seed_value + seed_value_addition); + + // Set some keys to be the same, but have different values to test stability + for(size_t i = 0; i < 10; i++) + { + keys_output[i] = static_cast(0); + values_output[i] = static_cast(i); + } + + using key_value = std::pair; + + // Calculate expected results on host + std::vector expected(size); + for(size_t i = 0; i < size; i++) + { + expected[i] = key_value(keys_output[i], values_output[i]); + } + + compare_function compare_op; + for(size_t i = 0; i < size / items_per_block; i++) + { + std::stable_sort(expected.begin() + (i * items_per_block), + expected.begin() + ((i + 1) * items_per_block), + [compare_op](const key_value& a, const key_value& b) + { return compare_op(a.first, b.first); }); + } + + key_type* device_keys_output; + HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output, + keys_output.size() * sizeof(key_type))); + value_type* device_values_output; + HIP_CHECK(test_common_utils::hipMallocHelper(&device_values_output, + values_output.size() * sizeof(value_type))); + + HIP_CHECK(hipMemcpy(device_keys_output, + keys_output.data(), + keys_output.size() * sizeof(typename decltype(keys_output)::value_type), + hipMemcpyHostToDevice)); HIP_CHECK( - hipMemcpy( - values_output.data(), device_values_output, - values_output.size() * sizeof(typename decltype(values_output)::value_type), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(device_values_output, + values_output.data(), + values_output.size() * sizeof(typename decltype(values_output)::value_type), + hipMemcpyHostToDevice)); + + // Running kernel + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + stable_sort_key_value_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_keys_output, + device_values_output, + compare_op); + + // Getting results to host + HIP_CHECK(hipMemcpy(keys_output.data(), + device_keys_output, + keys_output.size() * sizeof(typename decltype(keys_output)::value_type), + hipMemcpyDeviceToHost)); + + HIP_CHECK( + hipMemcpy(values_output.data(), + device_values_output, + values_output.size() * sizeof(typename decltype(values_output)::value_type), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < size; i++) { @@ -328,3 +677,317 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeysValues) HIP_CHECK(hipFree(device_values_output)); } } + +template +__global__ +void stable_sort_key_with_valid_items_kernel(T* device_input, + CompareOp compare_op, + int valid_items, + T default_val) +{ + constexpr size_t items_per_block = items_per_thread * block_size; + const size_t offset = (blockIdx.x * items_per_block) + (threadIdx.x * items_per_thread); + + T input[items_per_thread]; + + for(size_t i = 0; i < items_per_thread; i++) + input[i] = device_input[offset + i]; + + hipcub::BlockMergeSort bsort; + + bsort.StableSort( + input, + [&](const T& lhs, const T& rhs) { return compare_op(lhs.elem, rhs.elem); }, + valid_items, + default_val); + + for(size_t i = 0; i < items_per_thread; i++) + device_input[offset + i] = input[i]; +} + +TYPED_TEST(HipcubBlockMergeSort, StableSortKeysWithValidItems) +{ + + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + using compare_function = typename TestFixture::params::compare_function; + using T = typename TestFixture::params::key_type; + constexpr int items_per_block = items_per_thread * block_size; + constexpr int grid_size = 113; + + auto compare_op = compare_function(); + + if(block_size > test_utils::get_max_block_size()) + { + GTEST_SKIP(); + } + + struct custom_type + { + T elem; + size_t id; + }; + + constexpr size_t size = grid_size * items_per_block; + + // minus|plus two to prevent overflow weirdness + const T mini = std::numeric_limits::min() + static_cast(2); + const T maxi = std::numeric_limits::max() - static_cast(2); + + const custom_type default_val = {static_cast(compare_op(mini, maxi) ? maxi : mini), 0}; + const int valid_items_arr[8] = {items_per_block / 2, + items_per_block / 3, + items_per_block / 4, + items_per_block / 5, + items_per_block - 10, + items_per_block - 5, + items_per_block - 2, + items_per_block - 1}; + + custom_type* host_keys_input = new custom_type[size]; + custom_type* host_keys_output = new custom_type[size]; + custom_type* host_keys_expected = new custom_type[size]; + + custom_type* device_keys_input; + HIP_CHECK(hipMalloc(&device_keys_input, sizeof(custom_type) * size)); + + for(size_t it = 0; it < 8; it++) + { + int valid_items = valid_items_arr[it]; + + // need to cast 0 because of __half and bfloat16 types + T elem = static_cast(0); + for(size_t i = 0; i < size; i++) + { + if(elem > maxi) + elem = static_cast(0); + + host_keys_input[i] = host_keys_expected[i] = {elem++, i}; + } + + // filling in the default_val + for(size_t bI = 0; bI < grid_size; bI++) + { + size_t offset = (bI * items_per_block); + for(size_t i = valid_items; i < items_per_block; i++) + { + host_keys_expected[offset + i] = default_val; + } + } + + // sorting the values + for(size_t bI = 0; bI < grid_size; bI++) + { + size_t offset = (bI * items_per_block); + std::stable_sort(host_keys_expected + offset, + host_keys_expected + offset + items_per_block, + [&](const custom_type& lhs, const custom_type& rhs) + { return compare_op(lhs.elem, rhs.elem); }); + } + + HIP_CHECK(hipMemcpy(device_keys_input, + host_keys_input, + sizeof(custom_type) * size, + hipMemcpyHostToDevice)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + stable_sort_key_with_valid_items_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_keys_input, + compare_op, + valid_items, + default_val); + + HIP_CHECK(hipMemcpy(host_keys_output, + device_keys_input, + sizeof(custom_type) * size, + hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < size; i++) + { + ASSERT_EQ(host_keys_expected[i].elem, host_keys_output[i].elem); + ASSERT_EQ(host_keys_expected[i].id, host_keys_output[i].id); + } + } + + delete[] host_keys_input; + delete[] host_keys_output; + delete[] host_keys_expected; + + HIP_CHECK(hipFree(device_keys_input)); +} + +template +__global__ +void stable_sort_key_value_with_valid_items_kernel(T* device_key_input, + T* device_value_input, + CompareOp compare_op, + int valid_items, + T default_val) +{ + constexpr size_t items_per_block = items_per_thread * block_size; + const size_t offset = (blockIdx.x * items_per_block) + (threadIdx.x * items_per_thread); + + T key_input[items_per_thread]; + T value_input[items_per_thread]; + + for(size_t i = 0; i < items_per_thread; i++) + { + key_input[i] = device_key_input[offset + i]; + value_input[i] = device_value_input[offset + i]; + } + + hipcub::BlockMergeSort bsort; + + bsort.StableSort(key_input, value_input, compare_op, valid_items, default_val); + + for(size_t i = 0; i < items_per_thread; i++) + { + device_key_input[offset + i] = key_input[i]; + device_value_input[offset + i] = value_input[i]; + } +} + +TYPED_TEST(HipcubBlockMergeSort, StableSortKeysValuesWithValidItems) +{ + + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + using compare_function = typename TestFixture::params::compare_function; + using T = typename TestFixture::params::key_type; + constexpr int items_per_block = items_per_thread * block_size; + constexpr int grid_size = 113; + + auto compare_op = compare_function(); + + if(block_size > test_utils::get_max_block_size()) + { + GTEST_SKIP(); + } + + struct custom_type + { + T key; + T value; + }; + + constexpr size_t size = grid_size * items_per_block; + + // minus|plus two to prevent overflow weirdness + const T mini = std::numeric_limits::min() + static_cast(2); + const T maxi = std::numeric_limits::max() - static_cast(2); + + T default_val = static_cast(compare_op(mini, maxi) ? maxi : mini); + const int valid_items_arr[8] = {items_per_block / 2, + items_per_block / 3, + items_per_block / 4, + items_per_block / 5, + items_per_block - 10, + items_per_block - 5, + items_per_block - 2, + items_per_block - 1}; + + custom_type* host_side_sort = new custom_type[size]; + T* host_keys_input = new T[size]; + T* host_values_input = new T[size]; + + T* host_keys_expected = new T[size]; + T* host_values_expected = new T[size]; + + T* device_keys_input; + T* device_values_input; + HIP_CHECK(hipMalloc(&device_keys_input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&device_values_input, sizeof(T) * size)); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(static_cast(mini) + 2, + static_cast(maxi) - 2); + + for(size_t it = 0; it < 8; it++) + { + int valid_items = valid_items_arr[it]; + + // need to cast the 0 because of __half and bfloat16 types + T rIndex = static_cast(0); + for(size_t i = 0; i < size; i++) + { + if(rIndex > maxi) + rIndex = static_cast(0); + + if(i % 2) + { + T oIndex = rIndex - static_cast(1); + host_side_sort[i] = {oIndex, static_cast(dis(gen))}; + } + else + host_side_sort[i] = {rIndex, static_cast(dis(gen))}; + host_keys_input[i] = host_side_sort[i].key; + host_values_input[i] = host_side_sort[i].value; + rIndex++; + } + + // filling in the default_val + for(size_t bI = 0; bI < grid_size; bI++) + { + size_t offset = (bI * items_per_block); + for(size_t i = valid_items; i < items_per_block; i++) + { + host_side_sort[offset + i].key = default_val; + } + } + + // sorting the values + for(size_t bI = 0; bI < grid_size; bI++) + { + size_t offset = (bI * items_per_block); + std::stable_sort(host_side_sort + offset, + host_side_sort + offset + items_per_block, + [&](const custom_type& lhs, const custom_type& rhs) + { return compare_op(lhs.key, rhs.key); }); + } + + HIP_CHECK( + hipMemcpy(device_keys_input, host_keys_input, sizeof(T) * size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(device_values_input, + host_values_input, + sizeof(T) * size, + hipMemcpyHostToDevice)); + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + stable_sort_key_value_with_valid_items_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_keys_input, + device_values_input, + compare_op, + valid_items, + default_val); + + HIP_CHECK( + hipMemcpy(host_keys_input, device_keys_input, sizeof(T) * size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_values_input, + device_values_input, + sizeof(T) * size, + hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < size; i++) + { + ASSERT_EQ(host_side_sort[i].key, host_keys_input[i]); + ASSERT_EQ(host_side_sort[i].value, host_values_input[i]); + } + } + + delete[] host_keys_input; + delete[] host_values_input; + delete[] host_side_sort; + + HIP_CHECK(hipFree(device_keys_input)); + HIP_CHECK(hipFree(device_values_input)); +} \ No newline at end of file diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp index 472135e5620..ad47a54ce3e 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp @@ -37,6 +37,8 @@ #include "hipcub/block/block_store.hpp" #include "hipcub/util_type.hpp" +#include + template, - params, - params, - params, - params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, // Non-power of 2 BlockSize - params, - params, - params, - params, - params, - - // Power of 2 BlockSize and ItemsPerThread > 1 - params, - params, - params, - params, - - // Non-power of 2 BlockSize and ItemsPerThread > 1 - params, - params, - params, - params, - params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + + // Power of 2 BlockSize and ItemsPerThread > 1 and ItemsPerThread Power of 2 + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + + // Power of 2 BlockSize and ItemsPerThread > 1 and ItemsPerThread Non-power of 2 + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + + // Non-power of 2 BlockSize and ItemsPerThread > 1 and ItemsPerThread Power of 2 + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + + // Non-power of 2 BlockSize and ItemsPerThread > 1 and ItemsPerThread Non-power of 2 + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, // StartBit and MaxRadixBits params, @@ -115,10 +294,11 @@ template -__global__ __launch_bounds__(BlockSize) void rank_kernel(const KeyType* keys_input, - int* ranks_output, - unsigned int start_bit, - unsigned int radix_bits) +__global__ __launch_bounds__(BlockSize) +void rank_kernel(const KeyType* keys_input, + int* ranks_output, + unsigned int start_bit, + unsigned int radix_bits) { constexpr bool warp_striped = Algorithm == RadixRankAlgorithm::RADIX_RANK_MATCH; @@ -189,15 +369,15 @@ void test_radix_rank() SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); HIP_CHECK(hipSetDevice(device_id)); - using key_type = typename TestFixture::params::key_type; - constexpr size_t block_size = TestFixture::params::block_size; - constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr bool descending = TestFixture::params::descending; + using key_type = typename TestFixture::params::key_type; + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr bool descending = TestFixture::params::descending; constexpr unsigned int max_radix_bits = TestFixture::params::max_radix_bits; - constexpr unsigned int start_bit = TestFixture::params::start_bit; + constexpr unsigned int start_bit = TestFixture::params::start_bit; constexpr unsigned int radix_bits = TestFixture::params::radix_bits; constexpr unsigned end_bit = start_bit + radix_bits; - constexpr size_t items_per_block = block_size * items_per_thread; + constexpr size_t items_per_block = block_size * items_per_thread; static_assert(radix_bits <= max_radix_bits, "radix_bits must be less than or equal to max_radix_bits"); @@ -209,13 +389,15 @@ void test_radix_rank() } const size_t grid_size = 42; - const size_t size = items_per_block * grid_size; + const size_t size = items_per_block * grid_size; - SCOPED_TRACE(testing::Message() << "with items_per_block= " << items_per_block << " size=" << size); + SCOPED_TRACE(testing::Message() + << "with items_per_block= " << items_per_block << " size=" << size); - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data @@ -334,3 +516,267 @@ TYPED_TEST(HipcubBlockRadixRank, BlockRadixRankMatch) test_radix_rank(); } + +template +__global__ __launch_bounds__(BlockSize) +void rank_with_prefix_sum_kernel(const KeyType* keys_input, + int* ranks_output, + int* prefix_sum_output, + unsigned int start_bit) +{ + constexpr bool warp_striped = Algorithm == RadixRankAlgorithm::RADIX_RANK_MATCH; + + using KeyTraits = hipcub::Traits; + using UnsignedBits = typename KeyTraits::UnsignedBits; + using DigitExtractor = hipcub::BFEDigitExtractor; + using RankType = std::conditional_t< + Algorithm == RadixRankAlgorithm::RADIX_RANK_MATCH, + hipcub::BlockRadixRankMatch, + hipcub::BlockRadixRank>; + + using KeyExchangeType = hipcub::BlockExchange; + using RankExchangeType = hipcub::BlockExchange; + + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * items_per_block; + + __shared__ union + { + typename KeyExchangeType::TempStorage key_exchange; + typename RankType::TempStorage rank; + typename RankExchangeType::TempStorage rank_exchange; + } storage; + + KeyType keys[ItemsPerThread]; + hipcub::LoadDirectBlocked(lid, keys_input + block_offset, keys); + + if(warp_striped) + { + KeyExchangeType exchange(storage.key_exchange); + exchange.BlockedToWarpStriped(keys, keys); + __syncthreads(); + } + + UnsignedBits(&unsigned_keys)[ItemsPerThread] + = reinterpret_cast(keys); + +#pragma unroll + for(unsigned int key = 0; key < ItemsPerThread; key++) + { + unsigned_keys[key] = KeyTraits::TwiddleIn(unsigned_keys[key]); + } + + RankType rank(storage.rank); + const auto bins_tracked_per_thread = rank.BINS_TRACKED_PER_THREAD; + const DigitExtractor digit_extractor(start_bit, RadixBits); + int ranks[ItemsPerThread]; + + int prefix_sum_storage[bins_tracked_per_thread]; + + rank.RankKeys(unsigned_keys, ranks, digit_extractor, prefix_sum_storage); + + if(warp_striped) + { + __syncthreads(); + RankExchangeType exchange(storage.rank_exchange); + exchange.WarpStripedToBlocked(ranks, ranks); + } + + hipcub::StoreDirectBlocked(lid, ranks_output + block_offset, ranks); + + const size_t pfs_size = (1 << RadixBits); + const size_t pfs_offset = (blockIdx.x * pfs_size) + (threadIdx.x * bins_tracked_per_thread); + const size_t pfs_total_size = pfs_size * blockDim.x; + + for(size_t i = 0; i < bins_tracked_per_thread; i++) + { + if((threadIdx.x * bins_tracked_per_thread) + i < pfs_size) + prefix_sum_output[pfs_offset + i] = prefix_sum_storage[i]; + } +} + +template +void test_radix_rank_with_prefix_sum_output() +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using key_type = typename TestFixture::params::key_type; + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr bool descending = TestFixture::params::descending; + constexpr unsigned int start_bit = TestFixture::params::start_bit; + constexpr unsigned int radix_bits = TestFixture::params::max_radix_bits; + constexpr unsigned end_bit = start_bit + radix_bits; + constexpr size_t items_per_block = block_size * items_per_thread; + + if constexpr(std::is_same_v) + { + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t grid_size = 42; + const size_t pfs_items_per_block = (1 << radix_bits); + const size_t pfs_size = pfs_items_per_block * grid_size; + const size_t size = items_per_block * grid_size; + + SCOPED_TRACE(testing::Message() + << "with items_per_block= " << items_per_block << " size=" << size); + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + std::vector keys_input; + + keys_input = test_utils::get_random_data( + size, + test_utils::numeric_limits::min(), + test_utils::numeric_limits::max(), + seed_value); + + test_utils::add_special_values(keys_input, seed_value); + + // Calculate expected results on host + union converter + { + key_type in; + uint64_t out; + } c; + std::vector expected(keys_input.size()); + std::vector pfs_expected(pfs_size, 0); + for(size_t i = 0; i < grid_size; i++) + { + size_t block_offset = i * items_per_block; + const auto key_cmp + = test_utils::key_comparator(); + + // Perform an 'argsort', which gives a sorted sequence of indices into `keys_input`. + std::vector indices(items_per_block); + std::iota(indices.begin(), indices.end(), 0); + std::stable_sort(indices.begin(), + indices.end(), + [&](const int& i, const int& j) { + return key_cmp(keys_input[block_offset + i], + keys_input[block_offset + j]); + }); + + // Invert the sorted indices sequence to obtain the ranks. + for(size_t j = 0; j < indices.size(); ++j) + { + expected[block_offset + indices[j]] = static_cast(j); + } + + /* Calculating the prefix sun on host */ + size_t pfs_offset = i * pfs_items_per_block; + + std::vector histogram(pfs_items_per_block, 0); + + for(size_t ii = 0; ii < items_per_block; ii++) + { + c.in = keys_input[block_offset + ii]; + uint64_t bit_rep = c.out; + + bit_rep >>= start_bit; + bit_rep &= ((1 << radix_bits) - 1); + + if(descending) + bit_rep = (1 << radix_bits) - (1 + bit_rep); //flip it + + ++histogram[bit_rep]; + } + std::exclusive_scan(histogram.begin(), + histogram.end(), + pfs_expected.begin() + pfs_offset, + 0); + } + + // Preparing device + key_type* d_keys_input; + int* d_ranks_output; + int* d_prefix_sum_output; + HIP_CHECK(hipMalloc(&d_keys_input, keys_input.size() * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_ranks_output, expected.size() * sizeof(int))); + HIP_CHECK(hipMalloc(&d_prefix_sum_output, pfs_size * sizeof(int))); + + HIP_CHECK(hipMemcpy(d_keys_input, + keys_input.data(), + keys_input.size() * sizeof(key_type), + hipMemcpyHostToDevice)); + + // Running kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(rank_with_prefix_sum_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + d_keys_input, + d_ranks_output, + d_prefix_sum_output, + start_bit); + + // Getting results to host + std::vector ranks_output(expected.size()); + std::vector prefix_sum_output(pfs_size); + HIP_CHECK(hipMemcpy(ranks_output.data(), + d_ranks_output, + ranks_output.size() * sizeof(int), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipMemcpy(prefix_sum_output.data(), + d_prefix_sum_output, + prefix_sum_output.size() * sizeof(int), + hipMemcpyDeviceToHost)); + + // Verifying results + for(size_t i = 0; i < size; i++) + { + SCOPED_TRACE(testing::Message() << "with index= " << i); + ASSERT_EQ(ranks_output[i], expected[i]); + + if(i < pfs_size) + ASSERT_EQ(prefix_sum_output[i], pfs_expected[i]); + } + + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_ranks_output)); + } + } +} + +TYPED_TEST(HipcubBlockRadixRank, BlockRadixRankBasicWithPrefixSumOutput) +{ + test_radix_rank_with_prefix_sum_output(); +} + +TYPED_TEST(HipcubBlockRadixRank, BlockRadixRankMemoizeWithPrefixSumOutput) +{ + test_radix_rank_with_prefix_sum_output(); +} + +TYPED_TEST(HipcubBlockRadixRank, BlockRadixRankMatchWithPrefixSumOutput) +{ + test_radix_rank_with_prefix_sum_output(); +} \ No newline at end of file diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp index 974686945d7..15ee9d9ab8b 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp @@ -31,30 +31,29 @@ #include -template< - class Key, - class Value, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool Descending = false, - bool ToStriped = false, - unsigned int StartBit = 0, - unsigned int EndBit = sizeof(Key) * 8 -> +template struct params { - using key_type = Key; - using value_type = Value; - static constexpr unsigned int block_size = BlockSize; + using key_type = Key; + using value_type = Value; + static constexpr unsigned int block_size = BlockSize; static constexpr unsigned int items_per_thread = ItemsPerThread; - static constexpr bool descending = Descending; - static constexpr bool to_striped = ToStriped; - static constexpr unsigned int start_bit = StartBit; - static constexpr unsigned int end_bit = EndBit; + static constexpr bool descending = Descending; + static constexpr bool to_striped = ToStriped; + static constexpr unsigned int start_bit = StartBit; + static constexpr unsigned int end_bit = EndBit; }; template -class HipcubBlockRadixSort : public ::testing::Test { +class HipcubBlockRadixSort : public ::testing::Test +{ public: using params = Params; }; @@ -65,45 +64,133 @@ using Params = ::testing::Types< params<__int128_t, __int128_t, 64U, 1>, params<__uint128_t, __uint128_t, 64U, 1>, #endif + params, + params, + params, params, - params, - params, - params, + params, + params, + params, + params, + params, float, 64U, 1>, + params, int, 64U, 1>, + params, int, 64U, 1>, // Non-power of 2 BlockSize - params, - params, - params, - params, - params, - params, - params, - - // Power of 2 BlockSize and ItemsPerThread > 1 - params, - params, - params, - - // Non-power of 2 BlockSize and ItemsPerThread > 1 - params, - params, - params, - params, - - // StartBit and EndBit - params, - params, - params, - - // Stability (a number of key values is lower than BlockSize * ItemsPerThread: some keys appear - // multiple times with different values or key parts outside [StartBit, EndBit)) - params, - params, - - // Sorting keys of a custom type with a custom decomposer - params, int, 128, 4>, - params, int, 129, 2, true, false>, - params, float, 255, 1, false, true, 1, 12>>; + params, + params, + params, + params, + params, + params, + params, + params, + params, float, 63U, 1>, + params, int, 63U, 1>, + params, int, 63U, 1>, + + // Power of 2 BlockSize and ItemsPerThread > 1 and ItemsPerThread Power of 2 + params, + params, + params, + params, + params, + params, + params, + params, + params, float, 64U, 4>, + params, int, 64U, 4>, + params, int, 64U, 4>, + + // Power of 2 BlockSize and ItemsPerThread > 1 and ItemsPerThread Non-power of 2 + params, + params, + params, + params, + params, + params, + params, + params, + params, float, 64U, 3>, + params, int, 64U, 3>, + params, int, 64U, 3>, + + // Non-power of 2 BlockSize and ItemsPerThread > 1 and ItemsPerThread Power of 2 + params, + params, + params, + params, + params, + params, + params, + params, + params, float, 63U, 4>, + params, int, 63U, 4>, + params, int, 63U, 4>, + + // Non-power of 2 BlockSize and ItemsPerThread > 1 and ItemsPerThread Non-power of 2 + params, + params, + params, + params, + params, + params, + params, + params, + params, float, 63U, 3>, + params, int, 63U, 3>, + params, int, 63U, 3>, + + // Sort with Striped arangement + params, + params, + params, + params, + params, + params, + params, + params, + params, float, 64U, 4, false, true>, + params, int, 64U, 4, false, true>, + params, int, 64U, 4, false, true>, + + // Sort in Descending order + params, + params, + params, + params, + params, + params, + params, + params, + params, float, 64U, 4, true>, + params, int, 64U, 4, true>, + params, int, 64U, 4, true>, + params, + params, + params, + params, + params, + params, + params, + params, + params, float, 64U, 4, true, true>, + params, int, 64U, 4, true, true>, + params, int, 64U, 4, true, true>, + + // Sort with + params, float, 64U, 3, false, false, 1, 7>, + params, int, 64U, 3, false, false, 1, 7>, + params, int, 64U, 3, false, false, 1, 7>, + params, float, 64U, 4, false, true, 1, 7>, + params, int, 64U, 4, false, true, 1, 7>, + params, int, 64U, 4, false, true, 1, 7>, + params, float, 64U, 4, true, false, 1, 7>, + params, int, 64U, 4, true, false, 1, 7>, + params, int, 64U, 4, true, false, 1, 7>, + params, float, 64U, 4, true, true, 1, 7>, + params, int, 64U, 4, true, true, 1, 7>, + params, int, 64U, 4, true, true, 1, 7>>; TYPED_TEST_SUITE(HipcubBlockRadixSort, Params); @@ -114,7 +201,8 @@ template<> struct SortDispatch { template - __device__ static void sort(BlockSort&& block_sort, Args&&... args) + __device__ + static void sort(BlockSort&& block_sort, Args&&... args) { block_sort.Sort(std::forward(args)...); } @@ -124,7 +212,8 @@ template<> struct SortDispatch { template - __device__ static void sort(BlockSort&& block_sort, Args&&... args) + __device__ + static void sort(BlockSort&& block_sort, Args&&... args) { block_sort.SortDescending(std::forward(args)...); } @@ -134,7 +223,8 @@ template<> struct SortDispatch { template - __device__ static void sort(BlockSort&& block_sort, Args&&... args) + __device__ + static void sort(BlockSort&& block_sort, Args&&... args) { block_sort.SortBlockedToStriped(std::forward(args)...); } @@ -144,7 +234,8 @@ template<> struct SortDispatch { template - __device__ static void sort(BlockSort&& block_sort, Args&&... args) + __device__ + static void sort(BlockSort&& block_sort, Args&&... args) { block_sort.SortDescendingBlockedToStriped(std::forward(args)...); } @@ -156,22 +247,25 @@ struct SortOp using dispatch_t = SortDispatch; template - __device__ void operator()(Key (&keys)[ItemsPerThread], int start_bit, int end_bit) const + __device__ + void operator()(Key (&keys)[ItemsPerThread], int start_bit, int end_bit) const { hipcub::BlockRadixSort block_sort; if(start_bit == 0 && end_bit == sizeof(Key) * 8) { dispatch_t::sort(block_sort, keys); - } else + } + else { dispatch_t::sort(block_sort, keys, start_bit, end_bit); } } template - __device__ void operator()(test_utils::custom_test_type (&keys)[ItemsPerThread], - int start_bit, - int end_bit) const + __device__ + void operator()(test_utils::custom_test_type (&keys)[ItemsPerThread], + int start_bit, + int end_bit) const { using custom_test_t = test_utils::custom_test_type; hipcub::BlockRadixSort block_sort; @@ -179,33 +273,37 @@ struct SortOp if(start_bit == 0 && end_bit == sizeof(custom_test_t) * 8) { dispatch_t::sort(block_sort, keys, decomposer); - } else + } + else { dispatch_t::sort(block_sort, keys, decomposer, start_bit, end_bit); } } template - __device__ void operator()(Key (&keys)[ItemsPerThread], - Value (&values)[ItemsPerThread], - int start_bit, - int end_bit) const + __device__ + void operator()(Key (&keys)[ItemsPerThread], + Value (&values)[ItemsPerThread], + int start_bit, + int end_bit) const { hipcub::BlockRadixSort block_sort; if(start_bit == 0 && end_bit == sizeof(Key) * 8) { dispatch_t::sort(block_sort, keys, values); - } else + } + else { dispatch_t::sort(block_sort, keys, values, start_bit, end_bit); } } template - __device__ void operator()(test_utils::custom_test_type (&keys)[ItemsPerThread], - Value (&values)[ItemsPerThread], - int start_bit, - int end_bit) const + __device__ + void operator()(test_utils::custom_test_type (&keys)[ItemsPerThread], + Value (&values)[ItemsPerThread], + int start_bit, + int end_bit) const { using custom_test_t = test_utils::custom_test_type; hipcub::BlockRadixSort block_sort; @@ -213,7 +311,8 @@ struct SortOp if(start_bit == 0 && end_bit == sizeof(custom_test_t) * 8) { dispatch_t::sort(block_sort, keys, values, decomposer); - } else + } + else { dispatch_t::sort(block_sort, keys, values, decomposer, start_bit, end_bit); } @@ -229,17 +328,19 @@ struct StoreOp static constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; template - __device__ void operator()(Key (&keys)[ItemsPerThread], Key* keys_output) const + __device__ + void operator()(Key (&keys)[ItemsPerThread], Key* keys_output) const { const unsigned int block_offset = blockIdx.x * items_per_block; hipcub::StoreDirectBlocked(threadIdx.x, keys_output + block_offset, keys); } template - __device__ void operator()(Key (&keys)[ItemsPerThread], - Value (&values)[ItemsPerThread], - Key* keys_output, - Value* values_output) const + __device__ + void operator()(Key (&keys)[ItemsPerThread], + Value (&values)[ItemsPerThread], + Key* keys_output, + Value* values_output) const { const unsigned int block_offset = blockIdx.x * items_per_block; hipcub::StoreDirectBlocked(threadIdx.x, keys_output + block_offset, keys); @@ -253,17 +354,19 @@ struct StoreOp static constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; template - __device__ void operator()(Key (&keys)[ItemsPerThread], Key* keys_output) const + __device__ + void operator()(Key (&keys)[ItemsPerThread], Key* keys_output) const { const unsigned int block_offset = blockIdx.x * items_per_block; hipcub::StoreDirectStriped(threadIdx.x, keys_output + block_offset, keys); } template - __device__ void operator()(Key (&keys)[ItemsPerThread], - Value (&values)[ItemsPerThread], - Key* keys_output, - Value* values_output) const + __device__ + void operator()(Key (&keys)[ItemsPerThread], + Value (&values)[ItemsPerThread], + Key* keys_output, + Value* values_output) const { const unsigned int block_offset = blockIdx.x * items_per_block; hipcub::StoreDirectStriped(threadIdx.x, keys_output + block_offset, keys); @@ -276,9 +379,8 @@ template -__global__ __launch_bounds__(BlockSize) void sort_key_kernel(key_type* device_keys_output, - unsigned int start_bit, - unsigned int end_bit) +__global__ __launch_bounds__(BlockSize) +void sort_key_kernel(key_type* device_keys_output, unsigned int start_bit, unsigned int end_bit) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int block_offset = blockIdx.x * items_per_block; @@ -296,8 +398,8 @@ void assert_eq(T a, U b, size_t index) // GTest's ASSERT_EQ prints the values if the test fails. On Windows, GTest doesn't currently provide overloads for // printing 128 bit types, resulting in linker errors. // Check if we're testing with 128 bit types. If so, test using bools so GTest doesn't try to print them on failure. - if (test_utils::is_int128::value || test_utils::is_uint128::value || - test_utils::is_int128::value || test_utils::is_uint128::value) + if(test_utils::is_int128::value || test_utils::is_uint128::value + || test_utils::is_int128::value || test_utils::is_uint128::value) { const bool values_equal = (a == b); ASSERT_EQ(values_equal, true) << "at index: " << index; @@ -314,26 +416,27 @@ TYPED_TEST(HipcubBlockRadixSort, SortKeys) SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); HIP_CHECK(hipSetDevice(device_id)); - using key_type = typename TestFixture::params::key_type; - constexpr size_t block_size = TestFixture::params::block_size; - constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr bool descending = TestFixture::params::descending; - constexpr bool to_striped = TestFixture::params::to_striped; - constexpr unsigned int start_bit = TestFixture::params::start_bit; - constexpr unsigned int end_bit = TestFixture::params::end_bit; - constexpr size_t items_per_block = block_size * items_per_thread; + using key_type = typename TestFixture::params::key_type; + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr bool descending = TestFixture::params::descending; + constexpr bool to_striped = TestFixture::params::to_striped; + constexpr unsigned int start_bit = TestFixture::params::start_bit; + constexpr unsigned int end_bit = TestFixture::params::end_bit; + constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { return; } - const size_t size = items_per_block * 1134; + const size_t size = items_per_block * 1134; const size_t grid_size = size / items_per_block; - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data @@ -363,40 +466,35 @@ TYPED_TEST(HipcubBlockRadixSort, SortKeys) std::stable_sort( expected.begin() + (i * items_per_block), expected.begin() + ((i + 1) * items_per_block), - test_utils::key_comparator() - ); + test_utils::key_comparator()); } // Preparing device key_type* device_keys_output; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output, keys_output.size() * sizeof(key_type))); + HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output, + keys_output.size() * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - device_keys_output, keys_output.data(), - keys_output.size() * sizeof(typename decltype(keys_output)::value_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_keys_output, + keys_output.data(), + keys_output.size() * sizeof(typename decltype(keys_output)::value_type), + hipMemcpyHostToDevice)); // Running kernel sort_key_kernel <<>>(device_keys_output, start_bit, end_bit); // Getting results to host - HIP_CHECK( - hipMemcpy( - keys_output.data(), device_keys_output, - keys_output.size() * sizeof(typename decltype(keys_output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(keys_output.data(), + device_keys_output, + keys_output.size() * sizeof(typename decltype(keys_output)::value_type), + hipMemcpyDeviceToHost)); // Verifying results for(size_t i = 0; i < size; i++) { assert_eq(test_utils::convert_to_native(keys_output[i]), - test_utils::convert_to_native(expected[i]), i); + test_utils::convert_to_native(expected[i]), + i); } HIP_CHECK(hipFree(device_keys_output)); @@ -409,16 +507,17 @@ template -__global__ __launch_bounds__(BlockSize) void sort_key_value_kernel(key_type* device_keys_output, - value_type* device_values_output, - unsigned int start_bit, - unsigned int end_bit) +__global__ __launch_bounds__(BlockSize) +void sort_key_value_kernel(key_type* device_keys_output, + value_type* device_values_output, + unsigned int start_bit, + unsigned int end_bit) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * items_per_block; - key_type keys[ItemsPerThread]; + key_type keys[ItemsPerThread]; value_type values[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, device_keys_output + block_offset, keys); hipcub::LoadDirectBlocked(lid, device_values_output + block_offset, values); @@ -436,27 +535,28 @@ TYPED_TEST(HipcubBlockRadixSort, SortKeysValues) SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); HIP_CHECK(hipSetDevice(device_id)); - using key_type = typename TestFixture::params::key_type; - using value_type = typename TestFixture::params::value_type; - constexpr size_t block_size = TestFixture::params::block_size; - constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr bool descending = TestFixture::params::descending; - constexpr bool to_striped = TestFixture::params::to_striped; - constexpr unsigned int start_bit = TestFixture::params::start_bit; - constexpr unsigned int end_bit = TestFixture::params::end_bit; - constexpr size_t items_per_block = block_size * items_per_thread; + using key_type = typename TestFixture::params::key_type; + using value_type = typename TestFixture::params::value_type; + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr bool descending = TestFixture::params::descending; + constexpr bool to_striped = TestFixture::params::to_striped; + constexpr unsigned int start_bit = TestFixture::params::start_bit; + constexpr unsigned int end_bit = TestFixture::params::end_bit; + constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { return; } - const size_t size = items_per_block * 1134; + const size_t size = items_per_block * 1134; const size_t grid_size = size / items_per_block; - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data @@ -490,12 +590,11 @@ TYPED_TEST(HipcubBlockRadixSort, SortKeysValues) } else { - values_output = test_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value + seed_value_addition - ); + values_output + = test_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max(), + seed_value + seed_value_addition); } using key_value = std::pair; @@ -512,30 +611,27 @@ TYPED_TEST(HipcubBlockRadixSort, SortKeysValues) std::stable_sort( expected.begin() + (i * items_per_block), expected.begin() + ((i + 1) * items_per_block), - test_utils::key_value_comparator() - ); + test_utils:: + key_value_comparator()); } key_type* device_keys_output; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output, keys_output.size() * sizeof(key_type))); + HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output, + keys_output.size() * sizeof(key_type))); value_type* device_values_output; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_values_output, values_output.size() * sizeof(value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper(&device_values_output, + values_output.size() * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - device_keys_output, keys_output.data(), - keys_output.size() * sizeof(typename decltype(keys_output)::value_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_keys_output, + keys_output.data(), + keys_output.size() * sizeof(typename decltype(keys_output)::value_type), + hipMemcpyHostToDevice)); HIP_CHECK( - hipMemcpy( - device_values_output, values_output.data(), - values_output.size() * sizeof(typename decltype(values_output)::value_type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_values_output, + values_output.data(), + values_output.size() * sizeof(typename decltype(values_output)::value_type), + hipMemcpyHostToDevice)); // Running kernel sort_key_value_kernel @@ -545,31 +641,28 @@ TYPED_TEST(HipcubBlockRadixSort, SortKeysValues) end_bit); // Getting results to host - HIP_CHECK( - hipMemcpy( - keys_output.data(), device_keys_output, - keys_output.size() * sizeof(typename decltype(keys_output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(keys_output.data(), + device_keys_output, + keys_output.size() * sizeof(typename decltype(keys_output)::value_type), + hipMemcpyDeviceToHost)); HIP_CHECK( - hipMemcpy( - values_output.data(), device_values_output, - values_output.size() * sizeof(typename decltype(values_output)::value_type), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(values_output.data(), + device_values_output, + values_output.size() * sizeof(typename decltype(values_output)::value_type), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < size; i++) { - assert_eq(test_utils::convert_to_native(keys_output[i]), - test_utils::convert_to_native(expected[i].first), i); + assert_eq(test_utils::convert_to_native(keys_output[i]), + test_utils::convert_to_native(expected[i].first), + i); assert_eq(test_utils::convert_to_native(values_output[i]), - test_utils::convert_to_native(expected[i].second), i); + test_utils::convert_to_native(expected[i].second), + i); } HIP_CHECK(hipFree(device_keys_output)); HIP_CHECK(hipFree(device_values_output)); } -} +} \ No newline at end of file diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp index 44ddf181538..71378b559e3 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp @@ -27,18 +27,17 @@ #include "hipcub/thread/thread_operators.hpp" // Params for tests -template< - class T, - unsigned int BlockSize = 256U, - unsigned int ItemsPerThread = 1U, - hipcub::BlockReduceAlgorithm Algorithm = hipcub::BlockReduceAlgorithm::BLOCK_REDUCE_WARP_REDUCTIONS -> +template struct params { - using type = T; - static constexpr hipcub::BlockReduceAlgorithm algorithm = Algorithm; - static constexpr unsigned int block_size = BlockSize; - static constexpr unsigned int items_per_thread = ItemsPerThread; + using type = T; + static constexpr hipcub::BlockReduceAlgorithm algorithm = Algorithm; + static constexpr unsigned int block_size = BlockSize; + static constexpr unsigned int items_per_thread = ItemsPerThread; }; // --------------------------------------------------------- @@ -49,9 +48,9 @@ template class HipcubBlockReduceSingleValueTests : public ::testing::Test { public: - using type = typename Params::type; - static constexpr hipcub::BlockReduceAlgorithm algorithm = Params::algorithm; - static constexpr unsigned int block_size = Params::block_size; + using type = typename Params::type; + static constexpr hipcub::BlockReduceAlgorithm algorithm = Params::algorithm; + static constexpr unsigned int block_size = Params::block_size; }; using SingleValueTestParams = ::testing::Types< @@ -122,17 +121,13 @@ using SingleValueTestParams = ::testing::Types< TYPED_TEST_SUITE(HipcubBlockReduceSingleValueTests, SingleValueTestParams); -template< - unsigned int BlockSize, - hipcub::BlockReduceAlgorithm Algorithm, - class T -> +template __global__ __launch_bounds__(BlockSize) void reduce_kernel(T* device_output, T* device_output_reductions) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - T value = device_output[index]; + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; using breduce_t = hipcub::BlockReduce; __shared__ typename breduce_t::TempStorage temp_storage; value = breduce_t(temp_storage).Reduce(value, hipcub::Sum()); @@ -154,7 +149,7 @@ TYPED_TEST(HipcubBlockReduceSingleValueTests, Reduce) binary_op_type_host binary_op_host; using acc_type = typename test_utils::select_plus_operator_host::acc_type; - constexpr auto algorithm = TestFixture::algorithm; + constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; // Given block size not supported @@ -163,12 +158,13 @@ TYPED_TEST(HipcubBlockReduceSingleValueTests, Reduce) return; } - const size_t size = block_size * 113; + const size_t size = block_size * 113; const size_t grid_size = size / block_size; - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data @@ -193,31 +189,129 @@ TYPED_TEST(HipcubBlockReduceSingleValueTests, Reduce) T* device_output; HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(T))); T* device_output_reductions; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions, output_reductions.size() * sizeof(T))); + HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions, + output_reductions.size() * sizeof(T))); - HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); // Running kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(reduce_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, device_output_reductions - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions); // Reading results back - HIP_CHECK( - hipMemcpy( - output_reductions.data(), device_output_reductions, - output_reductions.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + // Verifying results + test_utils::assert_near(output_reductions, + expected_reductions, + test_utils::precision::value * block_size); + + HIP_CHECK(hipFree(device_output)); + HIP_CHECK(hipFree(device_output_reductions)); + } +} + +template +__global__ +__launch_bounds__(BlockSize) +void sum_kernel(T* device_output, T* device_output_reductions) +{ + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; + using breduce_t = hipcub::BlockReduce; + __shared__ typename breduce_t::TempStorage temp_storage; + value = breduce_t(temp_storage).Sum(value); + if(hipThreadIdx_x == 0) + { + device_output_reductions[hipBlockIdx_x] = value; + } +} + +TYPED_TEST(HipcubBlockReduceSingleValueTests, Sum) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::type; + // for bfloat16 and half we use double for host-side accumulation + using binary_op_type_host = typename test_utils::select_plus_operator_host::type; + binary_op_type_host binary_op_host; + using acc_type = typename test_utils::select_plus_operator_host::acc_type; + + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t size = block_size * 113; + const size_t grid_size = size / block_size; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data(size, 2, 200, seed_value); + std::vector output_reductions(size / block_size); + + // Calculate expected results on host + std::vector expected_reductions(output_reductions.size(), + test_utils::convert_to_device(0)); + for(size_t i = 0; i < output.size() / block_size; i++) + { + acc_type value(0); + for(size_t j = 0; j < block_size; j++) + { + auto idx = i * block_size + j; + value = binary_op_host(value, output[idx]); + } + expected_reductions[i] = static_cast(value); + } + + // Preparing device + T* device_output; + HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(T))); + T* device_output_reductions; + HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions, + output_reductions.size() * sizeof(T))); + + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); + + // Running kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(sum_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions); + + // Reading results back + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Verifying results test_utils::assert_near(output_reductions, @@ -231,17 +325,15 @@ TYPED_TEST(HipcubBlockReduceSingleValueTests, Reduce) TYPED_TEST_SUITE(HipcubBlockReduceSingleValueTests, SingleValueTestParams); -template< - unsigned int BlockSize, - hipcub::BlockReduceAlgorithm Algorithm, - class T -> +template __global__ __launch_bounds__(BlockSize) -void reduce_valid_kernel(T* device_output, T* device_output_reductions, const unsigned int valid_items) +void reduce_valid_kernel(T* device_output, + T* device_output_reductions, + const unsigned int valid_items) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - T value = device_output[index]; + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; using breduce_t = hipcub::BlockReduce; __shared__ typename breduce_t::TempStorage temp_storage; value = breduce_t(temp_storage).Reduce(value, hipcub::Sum(), valid_items); @@ -266,8 +358,8 @@ TYPED_TEST(HipcubBlockReduceSingleValueTests, ReduceValid) constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; - const size_t size = block_size * 113; - const size_t grid_size = size / block_size; + const size_t size = block_size * 113; + const size_t grid_size = size / block_size; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -275,24 +367,18 @@ TYPED_TEST(HipcubBlockReduceSingleValueTests, ReduceValid) return; } - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); - const unsigned int valid_items = test_utils::get_random_value( - block_size - 10, - block_size, - seed_value - ); + const unsigned int valid_items + = test_utils::get_random_value(block_size - 10, block_size, seed_value); // Generate data - std::vector output = test_utils::get_random_data( - size, - 2, - 200, - seed_value + seed_value_addition - ); + std::vector output + = test_utils::get_random_data(size, 2, 200, seed_value + seed_value_addition); std::vector output_reductions(size / block_size); // Calculate expected results on host @@ -313,31 +399,29 @@ TYPED_TEST(HipcubBlockReduceSingleValueTests, ReduceValid) T* device_output; HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(T))); T* device_output_reductions; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions, output_reductions.size() * sizeof(T))); + HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions, + output_reductions.size() * sizeof(T))); - HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); // Running kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(reduce_valid_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, device_output_reductions, valid_items - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_valid_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions, + valid_items); // Reading results back - HIP_CHECK( - hipMemcpy( - output_reductions.data(), device_output_reductions, - output_reductions.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Verifying results test_utils::assert_near(output_reductions, @@ -349,15 +433,120 @@ TYPED_TEST(HipcubBlockReduceSingleValueTests, ReduceValid) } } +template +__global__ +__launch_bounds__(BlockSize) +void sum_valid_kernel(T* device_output, T* device_output_reductions, const unsigned int valid_items) +{ + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; + using breduce_t = hipcub::BlockReduce; + __shared__ typename breduce_t::TempStorage temp_storage; + value = breduce_t(temp_storage).Sum(value, valid_items); + if(hipThreadIdx_x == 0) + { + device_output_reductions[hipBlockIdx_x] = value; + } +} + +TYPED_TEST(HipcubBlockReduceSingleValueTests, SumValid) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::type; + // for bfloat16 and half we use double for host-side accumulation + using binary_op_type_host = typename test_utils::select_plus_operator_host::type; + binary_op_type_host binary_op_host; + using acc_type = typename test_utils::select_plus_operator_host::acc_type; + + constexpr auto algorithm = TestFixture::algorithm; + + constexpr size_t block_size = TestFixture::block_size; + const size_t size = block_size * 113; + const size_t grid_size = size / block_size; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + const unsigned int valid_items + = test_utils::get_random_value(block_size - 10, block_size, seed_value); + + // Generate data + std::vector output + = test_utils::get_random_data(size, 2, 200, seed_value + seed_value_addition); + std::vector output_reductions(size / block_size); + + // Calculate expected results on host + std::vector expected_reductions(output_reductions.size(), + test_utils::convert_to_device(0)); + for(size_t i = 0; i < output.size() / block_size; i++) + { + acc_type value(0); + for(size_t j = 0; j < valid_items; j++) + { + auto idx = i * block_size + j; + value = binary_op_host(output[idx], value); + } + expected_reductions[i] = static_cast(value); + } + + // Preparing device + T* device_output; + HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(T))); + T* device_output_reductions; + HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions, + output_reductions.size() * sizeof(T))); + + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); + + // Running kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(sum_valid_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions, + valid_items); + + // Reading results back + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + // Verifying results + test_utils::assert_near(output_reductions, + expected_reductions, + test_utils::precision::value * block_size); + + HIP_CHECK(hipFree(device_output)); + HIP_CHECK(hipFree(device_output_reductions)); + } +} template class HipcubBlockReduceInputArrayTests : public ::testing::Test { public: - using type = typename Params::type; - static constexpr unsigned int block_size = Params::block_size; - static constexpr hipcub::BlockReduceAlgorithm algorithm = Params::algorithm; - static constexpr unsigned int items_per_thread = Params::items_per_thread; + using type = typename Params::type; + static constexpr unsigned int block_size = Params::block_size; + static constexpr hipcub::BlockReduceAlgorithm algorithm = Params::algorithm; + static constexpr unsigned int items_per_thread = Params::items_per_thread; }; using InputArrayTestParams = ::testing::Types< @@ -394,12 +583,10 @@ using InputArrayTestParams = ::testing::Types< TYPED_TEST_SUITE(HipcubBlockReduceInputArrayTests, InputArrayTestParams); -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - hipcub::BlockReduceAlgorithm Algorithm, - class T -> +template __global__ __launch_bounds__(BlockSize) void reduce_array_kernel(T* device_output, T* device_output_reductions) @@ -412,7 +599,7 @@ void reduce_array_kernel(T* device_output, T* device_output_reductions) in_out[j] = device_output[index + j]; } - T reduction; + T reduction; using breduce_t = hipcub::BlockReduce; __shared__ typename breduce_t::TempStorage temp_storage; reduction = breduce_t(temp_storage).Reduce(in_out, hipcub::Sum()); @@ -423,7 +610,6 @@ void reduce_array_kernel(T* device_output, T* device_output_reductions) } } - TYPED_TEST(HipcubBlockReduceInputArrayTests, Reduce) { int device_id = test_common_utils::obtain_device_from_ctest(); @@ -436,8 +622,8 @@ TYPED_TEST(HipcubBlockReduceInputArrayTests, Reduce) binary_op_type_host binary_op_host; using acc_type = typename test_utils::select_plus_operator_host::acc_type; - constexpr auto algorithm = TestFixture::algorithm; - constexpr size_t block_size = TestFixture::block_size; + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; constexpr size_t items_per_thread = TestFixture::items_per_thread; // Given block size not supported @@ -447,12 +633,13 @@ TYPED_TEST(HipcubBlockReduceInputArrayTests, Reduce) } const size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 37; - const size_t grid_size = size / items_per_block; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data @@ -483,39 +670,34 @@ TYPED_TEST(HipcubBlockReduceInputArrayTests, Reduce) T* device_output; HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(T))); T* device_output_reductions; - HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions, output_reductions.size() * sizeof(T))); - - HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); - - HIP_CHECK( - hipMemcpy( - device_output_reductions, output_reductions.data(), - output_reductions.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions, + output_reductions.size() * sizeof(T))); + + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); + + HIP_CHECK(hipMemcpy(device_output_reductions, + output_reductions.data(), + output_reductions.size() * sizeof(T), + hipMemcpyHostToDevice)); // Running kernel hipLaunchKernelGGL( HIP_KERNEL_NAME(reduce_array_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, device_output_reductions - ); + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions); // Reading results back - HIP_CHECK( - hipMemcpy( - output_reductions.data(), device_output_reductions, - output_reductions.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Verifying results test_utils::assert_near(output_reductions, @@ -526,3 +708,129 @@ TYPED_TEST(HipcubBlockReduceInputArrayTests, Reduce) HIP_CHECK(hipFree(device_output_reductions)); } } + +template +__global__ +__launch_bounds__(BlockSize) +void sum_array_kernel(T* device_output, T* device_output_reductions) +{ + const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + // load + T in_out[ItemsPerThread]; + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + in_out[j] = device_output[index + j]; + } + + T reduction; + using breduce_t = hipcub::BlockReduce; + __shared__ typename breduce_t::TempStorage temp_storage; + reduction = breduce_t(temp_storage).Sum(in_out); + + if(hipThreadIdx_x == 0) + { + device_output_reductions[hipBlockIdx_x] = reduction; + } +} + +TYPED_TEST(HipcubBlockReduceInputArrayTests, Sum) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::type; + // for bfloat16 and half we use double for host-side accumulation + using binary_op_type_host = typename test_utils::select_plus_operator_host::type; + binary_op_type_host binary_op_host; + using acc_type = typename test_utils::select_plus_operator_host::acc_type; + + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; + constexpr size_t items_per_thread = TestFixture::items_per_thread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + std::vector output + = test_utils::get_random_data(size, + test_utils::convert_to_device(2), + test_utils::convert_to_device(200), + seed_value); + + // Output reduce results + std::vector output_reductions(size / block_size, test_utils::convert_to_device(0)); + + // Calculate expected results on host + std::vector expected_reductions(output_reductions.size(), + test_utils::convert_to_device(0)); + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + acc_type value(0); + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + value = binary_op_host(static_cast(output[idx]), value); + } + expected_reductions[i] = static_cast(value); + } + + // Preparing device + T* device_output; + HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(T))); + T* device_output_reductions; + HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions, + output_reductions.size() * sizeof(T))); + + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); + + HIP_CHECK(hipMemcpy(device_output_reductions, + output_reductions.data(), + output_reductions.size() * sizeof(T), + hipMemcpyHostToDevice)); + + // Running kernel + hipLaunchKernelGGL( + HIP_KERNEL_NAME(sum_array_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions); + + // Reading results back + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + // Verifying results + test_utils::assert_near(output_reductions, + expected_reductions, + test_utils::precision::value * items_per_block); + + HIP_CHECK(hipFree(device_output)); + HIP_CHECK(hipFree(device_output_reductions)); + } +} \ No newline at end of file diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp index 58e264a8a00..e93ae5f7a20 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp @@ -25,19 +25,17 @@ #include "hipcub/block/block_run_length_decode.hpp" #include "hipcub/block/block_store.hpp" -template< - class ItemT, - class LengthT, - unsigned BlockSize, - unsigned RunsPerThread, - unsigned DecodedItemsPerThread -> +template struct Params { - using item_type = ItemT; - using length_type = LengthT; - static constexpr unsigned block_size = BlockSize; - static constexpr unsigned runs_per_thread = RunsPerThread; + using item_type = ItemT; + using length_type = LengthT; + static constexpr unsigned block_size = BlockSize; + static constexpr unsigned runs_per_thread = RunsPerThread; static constexpr unsigned decoded_items_per_thread = DecodedItemsPerThread; }; @@ -79,46 +77,37 @@ using HipcubBlockRunLengthDecodeTestParams TYPED_TEST_SUITE(HipcubBlockRunLengthDecodeTest, HipcubBlockRunLengthDecodeTestParams); -template< - class ItemT, - class LengthT, - unsigned BlockSize, - unsigned RunsPerThread, - unsigned DecodedItemsPerThread -> +template __global__ __launch_bounds__(BlockSize) -void block_run_length_decode_kernel( - const ItemT * d_run_items, - const LengthT * d_run_lengths, - ItemT * d_decoded_items) +void block_run_length_decode_kernel(const ItemT* d_run_items, + const LengthT* d_run_lengths, + ItemT* d_decoded_items) { - using BlockRunLengthDecodeT = hipcub::BlockRunLengthDecode< - ItemT, - BlockSize, - RunsPerThread, - DecodedItemsPerThread - >; + using BlockRunLengthDecodeT + = hipcub::BlockRunLengthDecode; static constexpr unsigned int decoded_items_per_block = BlockSize * DecodedItemsPerThread; __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage; - ItemT run_items[RunsPerThread]; + ItemT run_items[RunsPerThread]; LengthT run_lengths[RunsPerThread]; const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x; hipcub::LoadDirectBlocked(global_thread_idx, d_run_items, run_items); hipcub::LoadDirectBlocked(global_thread_idx, d_run_lengths, run_lengths); - unsigned total_decoded_size{}; - BlockRunLengthDecodeT block_run_length_decode( - temp_storage, - run_items, - run_lengths, - total_decoded_size - ); + unsigned total_decoded_size{}; + BlockRunLengthDecodeT block_run_length_decode(temp_storage, + run_items, + run_lengths, + total_decoded_size); unsigned decoded_window_offset = 0; - while (decoded_window_offset < total_decoded_size) + while(decoded_window_offset < total_decoded_size) { ItemT decoded_items[DecodedItemsPerThread]; @@ -139,13 +128,13 @@ TYPED_TEST(HipcubBlockRunLengthDecodeTest, TestDecode) SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); HIP_CHECK(hipSetDevice(device_id)); - using ItemT = typename TestFixture::params::item_type; - using LengthT = typename TestFixture::params::length_type; - constexpr unsigned block_size = TestFixture::params::block_size; - constexpr unsigned runs_per_thread = TestFixture::params::runs_per_thread; + using ItemT = typename TestFixture::params::item_type; + using LengthT = typename TestFixture::params::length_type; + constexpr unsigned block_size = TestFixture::params::block_size; + constexpr unsigned runs_per_thread = TestFixture::params::runs_per_thread; constexpr unsigned decoded_items_per_thread = TestFixture::params::decoded_items_per_thread; - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { const unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; @@ -154,31 +143,26 @@ TYPED_TEST(HipcubBlockRunLengthDecodeTest, TestDecode) const LengthT max_run_length = static_cast( std::min(1000ll, static_cast(std::numeric_limits::max()))); - size_t num_runs = runs_per_thread * block_size; - auto run_items = test_utils::get_random_data( - num_runs, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value - ); - auto run_lengths = test_utils::get_random_data( - num_runs, - static_cast(1), - max_run_length, - seed_value - ); - - std::default_random_engine prng(seed_value); + size_t num_runs = runs_per_thread * block_size; + auto run_items = test_utils::get_random_data(num_runs, + std::numeric_limits::min(), + std::numeric_limits::max(), + seed_value); + auto run_lengths = test_utils::get_random_data(num_runs, + static_cast(1), + max_run_length, + seed_value); + + std::default_random_engine prng(seed_value); std::uniform_int_distribution num_empty_runs_dist(1, 4); - const size_t num_trailing_empty_runs = num_empty_runs_dist(prng); + const size_t num_trailing_empty_runs = num_empty_runs_dist(prng); num_runs += num_trailing_empty_runs; - const auto empty_run_items = test_utils::get_random_data( - num_trailing_empty_runs, - std::numeric_limits::min(), - std::numeric_limits::max(), - seed_value - ); + const auto empty_run_items + = test_utils::get_random_data(num_trailing_empty_runs, + std::numeric_limits::min(), + std::numeric_limits::max(), + seed_value); // Not strictly required, but fixes a spurious GCC warning and good practice anyways run_items.reserve(run_items.size() + empty_run_items.size()); run_items.insert(run_items.end(), empty_run_items.begin(), empty_run_items.end()); @@ -186,64 +170,56 @@ TYPED_TEST(HipcubBlockRunLengthDecodeTest, TestDecode) run_lengths.insert(run_lengths.end(), num_trailing_empty_runs, static_cast(0)); std::vector expected; - for (size_t i = 0; i < run_items.size(); ++i) + for(size_t i = 0; i < run_items.size(); ++i) { - for (size_t j = 0; j < static_cast(run_lengths[i]); ++j) + for(size_t j = 0; j < static_cast(run_lengths[i]); ++j) { expected.push_back(run_items[i]); } } - ItemT * d_run_items{}; - HIP_CHECK(test_common_utils::hipMallocHelper(&d_run_items, run_items.size() * sizeof(ItemT))); + ItemT* d_run_items{}; HIP_CHECK( - hipMemcpy( - d_run_items, - run_items.data(), - run_items.size() * sizeof(ItemT), - hipMemcpyHostToDevice - ) - ); - - LengthT * d_run_lengths{}; - HIP_CHECK(test_common_utils::hipMallocHelper(&d_run_lengths, run_lengths.size() * sizeof(LengthT))); + test_common_utils::hipMallocHelper(&d_run_items, run_items.size() * sizeof(ItemT))); + HIP_CHECK(hipMemcpy(d_run_items, + run_items.data(), + run_items.size() * sizeof(ItemT), + hipMemcpyHostToDevice)); + + LengthT* d_run_lengths{}; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_run_lengths, + run_lengths.size() * sizeof(LengthT))); + HIP_CHECK(hipMemcpy(d_run_lengths, + run_lengths.data(), + run_lengths.size() * sizeof(LengthT), + hipMemcpyHostToDevice)); + + ItemT* d_decoded_runs{}; HIP_CHECK( - hipMemcpy( - d_run_lengths, - run_lengths.data(), - run_lengths.size() * sizeof(LengthT), - hipMemcpyHostToDevice - ) - ); - - ItemT * d_decoded_runs{}; - HIP_CHECK(test_common_utils::hipMallocHelper(&d_decoded_runs, expected.size() * sizeof(ItemT))); + test_common_utils::hipMallocHelper(&d_decoded_runs, expected.size() * sizeof(ItemT))); + HIP_CHECK(hipGetLastError()); hipLaunchKernelGGL( - HIP_KERNEL_NAME( - block_run_length_decode_kernel< - ItemT, - LengthT, - block_size, - runs_per_thread, - decoded_items_per_thread - > - ), - dim3(1), dim3(block_size), 0, 0, - d_run_items, d_run_lengths, d_decoded_runs - ); - HIP_CHECK(hipPeekAtLastError()); + HIP_KERNEL_NAME(block_run_length_decode_kernel), + dim3(1), + dim3(block_size), + 0, + 0, + d_run_items, + d_run_lengths, + d_decoded_runs); + HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); std::vector output(expected.size()); - HIP_CHECK( - hipMemcpy( - output.data(), - d_decoded_runs, - output.size() * sizeof(ItemT), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + d_decoded_runs, + output.size() * sizeof(ItemT), + hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_run_items)); HIP_CHECK(hipFree(d_run_lengths)); diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp index a4b9f0a4d29..6892222fcca 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp @@ -89,7 +89,7 @@ TYPED_TEST_SUITE(HipcubBlockScanSingleValueTests, SingleValueTestParams); template __global__ __launch_bounds__(BlockSize) -void block_inclusive_scan_kernel(T* device_output) +void inclusive_scan_kernel(T* device_output) { const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; T value = device_output[index]; @@ -159,7 +159,7 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScan) hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL(HIP_KERNEL_NAME(block_inclusive_scan_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_kernel), dim3(grid_size), dim3(block_size), 0, @@ -187,7 +187,7 @@ template __global__ __launch_bounds__(BlockSize) -void block_inclusive_scan_initial_value_kernel(T* device_output, T initial_value) +void inclusive_scan_initial_value_kernel(T* device_output, T initial_value) { const unsigned int index = (hipBlockIdx_x * BlockSize * ItemsPerThread) + hipThreadIdx_x * ItemsPerThread; @@ -270,7 +270,7 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanInitialValue) // Launching kernel hipLaunchKernelGGL( - HIP_KERNEL_NAME(block_inclusive_scan_initial_value_kernel), + HIP_KERNEL_NAME(inclusive_scan_initial_value_kernel), dim3(grid_size), dim3(block_size), 0, @@ -295,12 +295,13 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanInitialValue) } template -__global__ __launch_bounds__(BlockSize) -void block_inclusive_scan_reduce_kernel(T* device_output, T* device_output_reductions) +__global__ + __launch_bounds__(BlockSize) +void inclusive_scan_reduce_kernel(T* device_output, T* device_output_reductions) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - T value = device_output[index]; - T reduction; + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; + T reduction; using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; bscan_t(temp_storage).InclusiveScan(value, value, hipcub::Sum(), reduction); @@ -378,14 +379,13 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanReduce) HIP_CHECK(hipMemset(device_output_reductions, T(0), output_reductions.size() * sizeof(T))); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(block_inclusive_scan_reduce_kernel), - dim3(grid_size), - dim3(block_size), - 0, - 0, - device_output, - device_output_reductions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_reduce_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); @@ -412,14 +412,18 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanReduce) } } +// CUB fails to compute the block aggregate correctly when using the API for initial value support. +// TODO fix this unit test +#if 0 template -__global__ __launch_bounds__(BlockSize) -void block_inclusive_scan_reduce_initial_value_kernel(T* device_output, - T* device_output_reductions, - T initial_value) +__global__ + __launch_bounds__(BlockSize) +void inclusive_scan_reduce_initial_value_kernel(T* device_output, + T* device_output_reductions, + T initial_value) { const unsigned int index = (hipBlockIdx_x * BlockSize * ItemsPerThread) + hipThreadIdx_x * ItemsPerThread; @@ -446,6 +450,7 @@ void block_inclusive_scan_reduce_initial_value_kernel(T* device_output, } } +// #ifndef __HIP_PLATFORM_NVIDIA__ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanReduceInitialValue) { int device_id = test_common_utils::obtain_device_from_ctest(); @@ -483,23 +488,20 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanReduceInitialValue) SCOPED_TRACE(testing::Message() << "with initial_value = " << initial_value); // Calculate expected results on host - std::vector expected(output.size(), T(0)); - std::vector expected_reductions(output_reductions.size(), T(0)); + std::vector expected(output.size(), 0); + std::vector expected_reductions(output_reductions.size(), 0); for(size_t i = 0; i < output.size() / block_size; i++) { - acc_type accumulator(initial_value); - acc_type reduction = output[i * block_size]; + + acc_type accumulator = static_cast(initial_value); for(size_t j = 0; j < block_size; j++) + { - size_t idx = i * block_size + j; - accumulator = binary_op_host(output[idx], accumulator); + auto idx = i * block_size + j; + accumulator = binary_op_host(accumulator, static_cast(output[idx])); expected[idx] = static_cast(accumulator); - if(j > 0) - { - reduction = binary_op_host(output[idx], reduction); - } } - expected_reductions[i] = reduction; + expected_reductions[i] = expected[(i + 1) * block_size - 1]; } // Writing to device memory @@ -522,7 +524,7 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanReduceInitialValue) // Launching kernel hipLaunchKernelGGL( HIP_KERNEL_NAME( - block_inclusive_scan_reduce_initial_value_kernel), + inclusive_scan_reduce_initial_value_kernel), dim3(grid_size), dim3(block_size), 0, @@ -556,11 +558,13 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanReduceInitialValue) } } +// #endif //__HIP_PLATFORM_NVIDIA__ + +#endif + template __global__ __launch_bounds__(BlockSize) -void block_inclusive_scan_prefix_callback_kernel(T* device_output, - T* device_output_bp, - T block_prefix) +void inclusive_scan_prefix_callback_kernel(T* device_output, T* device_output_bp, T block_prefix) { const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; T prefix_value = block_prefix; @@ -652,7 +656,7 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanPrefixCallback) // Launching kernel hipLaunchKernelGGL( - HIP_KERNEL_NAME(block_inclusive_scan_prefix_callback_kernel), + HIP_KERNEL_NAME(inclusive_scan_prefix_callback_kernel), dim3(grid_size), dim3(block_size), 0, @@ -687,11 +691,12 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanPrefixCallback) } template -__global__ __launch_bounds__(BlockSize) void exclusive_scan_kernel(T* device_output, T init) +__global__ __launch_bounds__(BlockSize) +void exclusive_scan_kernel(T* device_output, T init) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - T value = device_output[index]; - using bscan_t = hipcub::BlockScan; + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; + using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; bscan_t(temp_storage).ExclusiveScan(value, value, init, hipcub::Sum()); device_output[index] = value; @@ -782,12 +787,12 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, ExclusiveScan) } template -__global__ __launch_bounds__(BlockSize) void exclusive_scan_reduce_kernel( - T* device_output, T* device_output_reductions, T init) +__global__ __launch_bounds__(BlockSize) +void exclusive_scan_reduce_kernel(T* device_output, T* device_output_reductions, T init) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - T value = device_output[index]; - T reduction; + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; + T reduction; using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; bscan_t(temp_storage).ExclusiveScan(value, value, init, hipcub::Sum(), reduction); @@ -911,8 +916,8 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, ExclusiveScanReduce) } template -__global__ __launch_bounds__(BlockSize) void exclusive_scan_prefix_callback_kernel( - T* device_output, T* device_output_bp, T block_prefix) +__global__ __launch_bounds__(BlockSize) +void exclusive_scan_prefix_callback_kernel(T* device_output, T* device_output_bp, T block_prefix) { const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; T prefix_value = block_prefix; @@ -988,7 +993,7 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, ExclusiveScanPrefixCallback) acc_type accumulator_block_prefixes(block_prefix); for(size_t j = 0; j < block_size; j++) { - auto idx = i * block_size + j; + auto idx = i * block_size + j; accumulator_block_prefixes = binary_op_host(static_cast(output[idx]), accumulator_block_prefixes); } @@ -1114,7 +1119,7 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, CustomStruct) hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL(HIP_KERNEL_NAME(block_inclusive_scan_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_kernel), dim3(grid_size), dim3(block_size), 0, @@ -1137,80 +1142,21 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, CustomStruct) } } -// // --------------------------------------------------------- -// // Test for scan ops taking array of values as input -// // --------------------------------------------------------- - -template -class HipcubBlockScanInputArrayTests : public ::testing::Test -{ -public: - using type = typename Params::type; - static constexpr unsigned int block_size = Params::block_size; - static constexpr hipcub::BlockScanAlgorithm algorithm = Params::algorithm; - static constexpr unsigned int items_per_thread = Params::items_per_thread; -}; - -using InputArrayTestParams = ::testing::Types< - // ----------------------------------------------------------------------- - // hipcub::BlockScanAlgorithm::using_warp_scan - // ----------------------------------------------------------------------- - params, - params, - params, - params, - params, - params, - params, - params, - // half and bfloat require small block sizes due to the very limited accuracy - params, - params, - // ----------------------------------------------------------------------- - // hipcub::BLOCK_SCAN_RAKING - // ----------------------------------------------------------------------- - params, - params, - params, - params, - params, - params, - params, - params, - // half and bfloat require small block sizes due to the very limited accuracy - params, - params>; - -TYPED_TEST_SUITE(HipcubBlockScanInputArrayTests, InputArrayTestParams); - -template +template __global__ __launch_bounds__(BlockSize) -void block_inclusive_scan_array_kernel(T* device_output) +void inclusive_sum_kernel(T* device_output) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; - - // load - T in_out[ItemsPerThread]; - for(unsigned int j = 0; j < ItemsPerThread; j++) - { - in_out[j] = device_output[index + j]; - } + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - bscan_t(temp_storage).InclusiveScan(in_out, in_out, hipcub::Sum()); + bscan_t(temp_storage).InclusiveSum(value, value); - // store - for(unsigned int j = 0; j < ItemsPerThread; j++) - { - device_output[index + j] = in_out[j]; - } + device_output[index] = value; } -TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScan) +TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveSum) { int device_id = test_common_utils::obtain_device_from_ctest(); SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); @@ -1222,9 +1168,8 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScan) binary_op_type_host binary_op_host; using acc_type = typename test_utils::select_plus_operator_host::acc_type; - constexpr auto algorithm = TestFixture::algorithm; - constexpr size_t block_size = TestFixture::block_size; - constexpr size_t items_per_thread = TestFixture::items_per_thread; + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -1232,9 +1177,8 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScan) return; } - const size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 37; - const size_t grid_size = size / items_per_block; + const size_t size = block_size * 113; + const size_t grid_size = size / block_size; for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { @@ -1246,13 +1190,13 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScan) std::vector output = test_utils::get_random_data(size, 2, 200, seed_value); // Calculate expected results on host - std::vector expected(output.size(), test_utils::convert_to_device(0)); - for(size_t i = 0; i < output.size() / items_per_block; i++) + std::vector expected(output.size(), 0); + for(size_t i = 0; i < output.size() / block_size; i++) { acc_type accumulator(0); - for(size_t j = 0; j < items_per_block; j++) + for(size_t j = 0; j < block_size; j++) { - auto idx = i * items_per_block + j; + auto idx = i * block_size + j; accumulator = binary_op_host(static_cast(output[idx]), accumulator); expected[idx] = static_cast(accumulator); } @@ -1270,14 +1214,12 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScan) hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - block_inclusive_scan_array_kernel), - dim3(grid_size), - dim3(block_size), - 0, - 0, - device_output); + hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_sum_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); @@ -1295,40 +1237,25 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScan) } } -template -__global__ __launch_bounds__(BlockSize) -void block_inclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reductions) +template +__global__ + __launch_bounds__(BlockSize) +void inclusive_sum_reduce_kernel(T* device_output, T* device_output_reductions) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; - - // load - T in_out[ItemsPerThread]; - for(unsigned int j = 0; j < ItemsPerThread; j++) - { - in_out[j] = device_output[index + j]; - } - + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; + T reduction; using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - T reduction; - bscan_t(temp_storage).InclusiveScan(in_out, in_out, hipcub::Sum(), reduction); - - // store - for(unsigned int j = 0; j < ItemsPerThread; j++) - { - device_output[index + j] = in_out[j]; - } - + bscan_t(temp_storage).InclusiveSum(value, value, reduction); + device_output[index] = value; if(hipThreadIdx_x == 0) { device_output_reductions[hipBlockIdx_x] = reduction; } } -TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanReduce) +TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveSumReduce) { int device_id = test_common_utils::obtain_device_from_ctest(); SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); @@ -1340,9 +1267,8 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanReduce) binary_op_type_host binary_op_host; using acc_type = typename test_utils::select_plus_operator_host::acc_type; - constexpr auto algorithm = TestFixture::algorithm; - constexpr size_t block_size = TestFixture::block_size; - constexpr size_t items_per_thread = TestFixture::items_per_thread; + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -1350,9 +1276,8 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanReduce) return; } - const size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 37; - const size_t grid_size = size / items_per_block; + const size_t size = block_size * 113; + const size_t grid_size = size / block_size; for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { @@ -1362,24 +1287,21 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanReduce) // Generate data std::vector output = test_utils::get_random_data(size, 2, 200, seed_value); - - // Output reduce results - std::vector output_reductions(size / block_size, test_utils::convert_to_device(0)); + std::vector output_reductions(size / block_size, 0); // Calculate expected results on host - std::vector expected(output.size(), test_utils::convert_to_device(0)); - std::vector expected_reductions(output_reductions.size(), - test_utils::convert_to_device(0)); - for(size_t i = 0; i < output.size() / items_per_block; i++) + std::vector expected(output.size(), 0); + std::vector expected_reductions(output_reductions.size(), 0); + for(size_t i = 0; i < output.size() / block_size; i++) { acc_type accumulator(0); - for(size_t j = 0; j < items_per_block; j++) + for(size_t j = 0; j < block_size; j++) { - auto idx = i * items_per_block + j; + auto idx = i * block_size + j; accumulator = binary_op_host(static_cast(output[idx]), accumulator); expected[idx] = static_cast(accumulator); } - expected_reductions[i] = expected[(i + 1) * items_per_block - 1]; + expected_reductions[i] = expected[(i + 1) * block_size - 1]; } // Writing to device memory @@ -1397,22 +1319,16 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanReduce) output.size() * sizeof(T), hipMemcpyHostToDevice)); - HIP_CHECK(hipMemset(device_output_reductions, - test_utils::convert_to_device(0), - output_reductions.size() * sizeof(T))); + HIP_CHECK(hipMemset(device_output_reductions, T(0), output_reductions.size() * sizeof(T))); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(block_inclusive_scan_reduce_array_kernel), - dim3(grid_size), - dim3(block_size), - 0, - 0, - device_output, - device_output_reductions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_sum_reduce_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); @@ -1430,7 +1346,6 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanReduce) // Validating results test_utils::assert_near(output, expected, test_utils::precision::value * block_size); - test_utils::assert_near(output_reductions, expected_reductions, test_utils::precision::value * block_size); @@ -1440,48 +1355,33 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanReduce) } } -template +template __global__ __launch_bounds__(BlockSize) -void block_inclusive_scan_array_prefix_callback_kernel(T* device_output, - T* device_output_bp, - T block_prefix) +void inclusive_sum_prefix_callback_kernel(T* device_output, T* device_output_bp, T block_prefix) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; T prefix_value = block_prefix; auto prefix_callback = [&prefix_value](T reduction) { - T prefix = prefix_value; - prefix_value = prefix_value + reduction; + T prefix = prefix_value; + prefix_value += reduction; return prefix; }; - // load - T in_out[ItemsPerThread]; - for(unsigned int j = 0; j < ItemsPerThread; j++) - { - in_out[j] = device_output[index + j]; - } + T value = device_output[index]; using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - bscan_t(temp_storage).InclusiveScan(in_out, in_out, hipcub::Sum(), prefix_callback); - - // store - for(unsigned int j = 0; j < ItemsPerThread; j++) - { - device_output[index + j] = in_out[j]; - } + bscan_t(temp_storage).InclusiveSum(value, value, prefix_callback); + device_output[index] = value; if(hipThreadIdx_x == 0) { device_output_bp[hipBlockIdx_x] = prefix_value; } } -TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanPrefixCallback) +TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveSumPrefixCallback) { int device_id = test_common_utils::obtain_device_from_ctest(); SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); @@ -1493,9 +1393,8 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanPrefixCallback) binary_op_type_host binary_op_host; using acc_type = typename test_utils::select_plus_operator_host::acc_type; - constexpr auto algorithm = TestFixture::algorithm; - constexpr size_t block_size = TestFixture::block_size; - constexpr size_t items_per_thread = TestFixture::items_per_thread; + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -1503,9 +1402,8 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanPrefixCallback) return; } - const size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 37; - const size_t grid_size = size / items_per_block; + const size_t size = block_size * 113; + const size_t grid_size = size / block_size; for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { @@ -1515,16 +1413,1670 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanPrefixCallback) // Generate data std::vector output = test_utils::get_random_data(size, 2, 200, seed_value); - std::vector output_block_prefixes(size / items_per_block, - test_utils::convert_to_device(0)); - T block_prefix = test_utils::get_random_value(test_utils::convert_to_device(0), - test_utils::convert_to_device(100), - seed_value + seed_value_addition); + std::vector output_block_prefixes(size / block_size); + T block_prefix = test_utils::get_random_value(0, 100, seed_value + seed_value_addition); // Calculate expected results on host - std::vector expected(output.size(), test_utils::convert_to_device(0)); - std::vector expected_block_prefixes(output_block_prefixes.size(), - test_utils::convert_to_device(0)); + std::vector expected(output.size(), 0); + std::vector expected_block_prefixes(output_block_prefixes.size(), 0); + for(size_t i = 0; i < output.size() / block_size; i++) + { + acc_type accumulator(block_prefix); + for(size_t j = 0; j < block_size; j++) + { + auto idx = i * block_size + j; + accumulator = binary_op_host(static_cast(output[idx]), accumulator); + expected[idx] = static_cast(accumulator); + } + expected_block_prefixes[i] = expected[(i + 1) * block_size - 1]; + } + + // Writing to device memory + T* device_output; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_bp; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output_bp, + output_block_prefixes.size() + * sizeof(typename decltype(output_block_prefixes)::value_type))); + + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); + + // Launching kernel + hipLaunchKernelGGL( + HIP_KERNEL_NAME(inclusive_sum_prefix_callback_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_bp, + block_prefix); + + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipMemcpy(output_block_prefixes.data(), + device_output_bp, + output_block_prefixes.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + // Validating results + test_utils::assert_near(output, expected, test_utils::precision::value * block_size); + test_utils::assert_near(output_block_prefixes, + expected_block_prefixes, + test_utils::precision::value * block_size); + + HIP_CHECK(hipFree(device_output)); + HIP_CHECK(hipFree(device_output_bp)); + } +} + +template +__global__ __launch_bounds__(BlockSize) +void exclusive_sum_kernel(T* device_output) +{ + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; + using bscan_t = hipcub::BlockScan; + __shared__ typename bscan_t::TempStorage temp_storage; + bscan_t(temp_storage).ExclusiveSum(value, value); + device_output[index] = value; +} + +TYPED_TEST(HipcubBlockScanSingleValueTests, ExclusiveSum) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::type; + // for bfloat16 and half we use double for host-side accumulation + using binary_op_type_host = typename test_utils::select_plus_operator_host::type; + binary_op_type_host binary_op_host; + using acc_type = typename test_utils::select_plus_operator_host::acc_type; + + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t size = block_size * 113; + const size_t grid_size = size / block_size; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data(size, 2, 241, seed_value); + const T init = 0; + + // Calculate expected results on host + std::vector expected(output.size(), 0); + for(size_t i = 0; i < output.size() / block_size; i++) + { + acc_type accumulator(init); + expected[i * block_size] = init; + for(size_t j = 1; j < block_size; j++) + { + auto idx = i * block_size + j; + accumulator = binary_op_host(static_cast(output[idx - 1]), accumulator); + expected[idx] = static_cast(accumulator); + } + } + + // Writing to device memory + T* device_output; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); + + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); + + // Launching kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_sum_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output); + + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + // Validating results + test_utils::assert_near(output, expected, test_utils::precision::value * block_size); + + HIP_CHECK(hipFree(device_output)); + } +} + +template +__global__ __launch_bounds__(BlockSize) +void exclusive_sum_reduce_kernel(T* device_output, T* device_output_reductions) +{ + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; + T reduction; + using bscan_t = hipcub::BlockScan; + __shared__ typename bscan_t::TempStorage temp_storage; + bscan_t(temp_storage).ExclusiveSum(value, value, reduction); + device_output[index] = value; + if(hipThreadIdx_x == 0) + { + device_output_reductions[hipBlockIdx_x] = reduction; + } +} + +TYPED_TEST(HipcubBlockScanSingleValueTests, ExclusiveSumReduce) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::type; + // for bfloat16 and half we use double for host-side accumulation + using binary_op_type_host = typename test_utils::select_plus_operator_host::type; + binary_op_type_host binary_op_host; + using acc_type = typename test_utils::select_plus_operator_host::acc_type; + + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; + + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t size = block_size * 113; + const size_t grid_size = size / block_size; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data(size, 2, 200, seed_value); + const T init = 0; + + // Output reduce results + std::vector output_reductions(size / block_size, 0); + + // Calculate expected results on host + std::vector expected(output.size(), 0); + std::vector expected_reductions(output_reductions.size(), 0); + for(size_t i = 0; i < output.size() / block_size; i++) + { + acc_type accumulator(init); + expected[i * block_size] = init; + for(size_t j = 1; j < block_size; j++) + { + auto idx = i * block_size + j; + accumulator = binary_op_host(static_cast(output[idx - 1]), accumulator); + expected[idx] = static_cast(accumulator); + } + + acc_type accumulator_reductions(0); + for(size_t j = 0; j < block_size; j++) + { + auto idx = i * block_size + j; + accumulator_reductions + = binary_op_host(static_cast(output[idx]), accumulator_reductions); + expected_reductions[i] = static_cast(accumulator_reductions); + } + } + + // Writing to device memory + T* device_output; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_reductions; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output_reductions, + output_reductions.size() * sizeof(typename decltype(output_reductions)::value_type))); + + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); + + HIP_CHECK(hipMemset(device_output_reductions, T(0), output_reductions.size() * sizeof(T))); + + // Launching kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_sum_reduce_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions); + + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + // Validating results + test_utils::assert_near(output, expected, test_utils::precision::value * block_size); + test_utils::assert_near(output_reductions, + expected_reductions, + test_utils::precision::value * block_size); + + HIP_CHECK(hipFree(device_output)); + HIP_CHECK(hipFree(device_output_reductions)); + } +} + +template +__global__ __launch_bounds__(BlockSize) +void exclusive_sum_prefix_callback_kernel(T* device_output, T* device_output_bp, T block_prefix) +{ + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T prefix_value = block_prefix; + auto prefix_callback = [&prefix_value](T reduction) + { + T prefix = prefix_value; + prefix_value += reduction; + return prefix; + }; + + T value = device_output[index]; + + using bscan_t = hipcub::BlockScan; + __shared__ typename bscan_t::TempStorage temp_storage; + bscan_t(temp_storage).ExclusiveSum(value, value, prefix_callback); + + device_output[index] = value; + if(hipThreadIdx_x == 0) + { + device_output_bp[hipBlockIdx_x] = prefix_value; + } +} + +TYPED_TEST(HipcubBlockScanSingleValueTests, ExclusiveSumPrefixCallback) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::type; + // for bfloat16 and half we use double for host-side accumulation + using binary_op_type_host = typename test_utils::select_plus_operator_host::type; + binary_op_type_host binary_op_host; + using acc_type = typename test_utils::select_plus_operator_host::acc_type; + + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t size = block_size * 113; + const size_t grid_size = size / block_size; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data(size, 2, 200, seed_value); + std::vector output_block_prefixes(size / block_size); + T block_prefix = test_utils::get_random_value(0, 100, seed_value + seed_value_addition); + + // Calculate expected results on host + std::vector expected(output.size(), 0); + std::vector expected_block_prefixes(output_block_prefixes.size(), 0); + for(size_t i = 0; i < output.size() / block_size; i++) + { + acc_type accumulator(block_prefix); + expected[i * block_size] = block_prefix; + for(size_t j = 1; j < block_size; j++) + { + auto idx = i * block_size + j; + accumulator = binary_op_host(static_cast(output[idx - 1]), accumulator); + expected[idx] = static_cast(accumulator); + } + + acc_type accumulator_block_prefixes(block_prefix); + for(size_t j = 0; j < block_size; j++) + { + auto idx = i * block_size + j; + accumulator_block_prefixes = binary_op_host(static_cast(output[idx]), + accumulator_block_prefixes); + } + expected_block_prefixes[i] = static_cast(accumulator_block_prefixes); + } + + // Writing to device memory + T* device_output; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_bp; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output_bp, + output_block_prefixes.size() + * sizeof(typename decltype(output_block_prefixes)::value_type))); + + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); + + // Launching kernel + hipLaunchKernelGGL( + HIP_KERNEL_NAME(exclusive_sum_prefix_callback_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_bp, + block_prefix); + + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipMemcpy(output_block_prefixes.data(), + device_output_bp, + output_block_prefixes.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + // Validating results + test_utils::assert_near(output, expected, test_utils::precision::value * block_size); + test_utils::assert_near(output_block_prefixes, + expected_block_prefixes, + test_utils::precision::value * block_size); + + HIP_CHECK(hipFree(device_output)); + HIP_CHECK(hipFree(device_output_bp)); + } +} + +// // --------------------------------------------------------- +// // Test for scan ops taking array of values as input +// // --------------------------------------------------------- + +template +class HipcubBlockScanInputArrayTests : public ::testing::Test +{ +public: + using type = typename Params::type; + static constexpr unsigned int block_size = Params::block_size; + static constexpr hipcub::BlockScanAlgorithm algorithm = Params::algorithm; + static constexpr unsigned int items_per_thread = Params::items_per_thread; +}; + +using InputArrayTestParams = ::testing::Types< + // ----------------------------------------------------------------------- + // hipcub::BlockScanAlgorithm::using_warp_scan + // ----------------------------------------------------------------------- + params, + params, + params, + params, + params, + params, + params, + params, + // half and bfloat require small block sizes due to the very limited accuracy + params, + params, + // ----------------------------------------------------------------------- + // hipcub::BLOCK_SCAN_RAKING + // ----------------------------------------------------------------------- + params, + params, + params, + params, + params, + params, + params, + params, + // half and bfloat require small block sizes due to the very limited accuracy + params, + params>; + +TYPED_TEST_SUITE(HipcubBlockScanInputArrayTests, InputArrayTestParams); + +template +__global__ __launch_bounds__(BlockSize) +void inclusive_scan_array_kernel(T* device_output) +{ + const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + + // load + T in_out[ItemsPerThread]; + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + in_out[j] = device_output[index + j]; + } + + using bscan_t = hipcub::BlockScan; + __shared__ typename bscan_t::TempStorage temp_storage; + bscan_t(temp_storage).InclusiveScan(in_out, in_out, hipcub::Sum()); + + // store + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + device_output[index + j] = in_out[j]; + } +} + +TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScan) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::type; + // for bfloat16 and half we use double for host-side accumulation + using binary_op_type_host = typename test_utils::select_plus_operator_host::type; + binary_op_type_host binary_op_host; + using acc_type = typename test_utils::select_plus_operator_host::acc_type; + + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; + constexpr size_t items_per_thread = TestFixture::items_per_thread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data(size, 2, 200, seed_value); + + // Calculate expected results on host + std::vector expected(output.size(), test_utils::convert_to_device(0)); + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + acc_type accumulator(0); + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + accumulator = binary_op_host(static_cast(output[idx]), accumulator); + expected[idx] = static_cast(accumulator); + } + } + + // Writing to device memory + T* device_output; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); + + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); + + // Launching kernel + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + inclusive_scan_array_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output); + + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + // Validating results + test_utils::assert_near(output, expected, test_utils::precision::value * block_size); + + HIP_CHECK(hipFree(device_output)); + } +} + +template +__global__ __launch_bounds__(BlockSize) +void inclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reductions) +{ + const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + + // load + T in_out[ItemsPerThread]; + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + in_out[j] = device_output[index + j]; + } + + using bscan_t = hipcub::BlockScan; + __shared__ typename bscan_t::TempStorage temp_storage; + T reduction; + bscan_t(temp_storage).InclusiveScan(in_out, in_out, hipcub::Sum(), reduction); + + // store + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + device_output[index + j] = in_out[j]; + } + + if(hipThreadIdx_x == 0) + { + device_output_reductions[hipBlockIdx_x] = reduction; + } +} + +TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanReduce) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::type; + // for bfloat16 and half we use double for host-side accumulation + using binary_op_type_host = typename test_utils::select_plus_operator_host::type; + binary_op_type_host binary_op_host; + using acc_type = typename test_utils::select_plus_operator_host::acc_type; + + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; + constexpr size_t items_per_thread = TestFixture::items_per_thread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data(size, 2, 200, seed_value); + + // Output reduce results + std::vector output_reductions(size / block_size, test_utils::convert_to_device(0)); + + // Calculate expected results on host + std::vector expected(output.size(), test_utils::convert_to_device(0)); + std::vector expected_reductions(output_reductions.size(), + test_utils::convert_to_device(0)); + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + acc_type accumulator(0); + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + accumulator = binary_op_host(static_cast(output[idx]), accumulator); + expected[idx] = static_cast(accumulator); + } + expected_reductions[i] = expected[(i + 1) * items_per_block - 1]; + } + + // Writing to device memory + T* device_output; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_reductions; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output_reductions, + output_reductions.size() * sizeof(typename decltype(output_reductions)::value_type))); + + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); + + HIP_CHECK(hipMemset(device_output_reductions, + test_utils::convert_to_device(0), + output_reductions.size() * sizeof(T))); + + // Launching kernel + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + inclusive_scan_reduce_array_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions); + + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + // Validating results + test_utils::assert_near(output, expected, test_utils::precision::value * block_size); + + test_utils::assert_near(output_reductions, + expected_reductions, + test_utils::precision::value * block_size); + + HIP_CHECK(hipFree(device_output)); + HIP_CHECK(hipFree(device_output_reductions)); + } +} + +template +__global__ __launch_bounds__(BlockSize) +void inclusive_scan_array_prefix_callback_kernel(T* device_output, + T* device_output_bp, + T block_prefix) +{ + const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + T prefix_value = block_prefix; + auto prefix_callback = [&prefix_value](T reduction) + { + T prefix = prefix_value; + prefix_value = prefix_value + reduction; + return prefix; + }; + + // load + T in_out[ItemsPerThread]; + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + in_out[j] = device_output[index + j]; + } + + using bscan_t = hipcub::BlockScan; + __shared__ typename bscan_t::TempStorage temp_storage; + bscan_t(temp_storage).InclusiveScan(in_out, in_out, hipcub::Sum(), prefix_callback); + + // store + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + device_output[index + j] = in_out[j]; + } + + if(hipThreadIdx_x == 0) + { + device_output_bp[hipBlockIdx_x] = prefix_value; + } +} + +TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanPrefixCallback) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::type; + // for bfloat16 and half we use double for host-side accumulation + using binary_op_type_host = typename test_utils::select_plus_operator_host::type; + binary_op_type_host binary_op_host; + using acc_type = typename test_utils::select_plus_operator_host::acc_type; + + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; + constexpr size_t items_per_thread = TestFixture::items_per_thread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data(size, 2, 200, seed_value); + std::vector output_block_prefixes(size / items_per_block, + test_utils::convert_to_device(0)); + T block_prefix = test_utils::get_random_value(test_utils::convert_to_device(0), + test_utils::convert_to_device(100), + seed_value + seed_value_addition); + + // Calculate expected results on host + std::vector expected(output.size(), test_utils::convert_to_device(0)); + std::vector expected_block_prefixes(output_block_prefixes.size(), + test_utils::convert_to_device(0)); + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + acc_type accumulator(block_prefix); + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + accumulator = binary_op_host(static_cast(output[idx]), accumulator); + expected[idx] = static_cast(accumulator); + } + expected_block_prefixes[i] = expected[(i + 1) * items_per_block - 1]; + } + + // Writing to device memory + T* device_output; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_bp; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output_bp, + output_block_prefixes.size() + * sizeof(typename decltype(output_block_prefixes)::value_type))); + + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); + + HIP_CHECK(hipMemcpy(device_output_bp, + output_block_prefixes.data(), + output_block_prefixes.size() * sizeof(T), + hipMemcpyHostToDevice)); + + // Launching kernel + hipLaunchKernelGGL( + HIP_KERNEL_NAME(inclusive_scan_array_prefix_callback_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_bp, + block_prefix); + + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipMemcpy(output_block_prefixes.data(), + device_output_bp, + output_block_prefixes.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + // Validating results + test_utils::assert_near(output, expected, test_utils::precision::value * block_size); + + test_utils::assert_near(output_block_prefixes, + expected_block_prefixes, + test_utils::precision::value * block_size); + + HIP_CHECK(hipFree(device_output)); + HIP_CHECK(hipFree(device_output_bp)); + } +} + +template +__global__ __launch_bounds__(BlockSize) +void exclusive_scan_array_kernel(T* device_output, T init) +{ + const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + // load + T in_out[ItemsPerThread]; + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + in_out[j] = device_output[index + j]; + } + + using bscan_t = hipcub::BlockScan; + __shared__ typename bscan_t::TempStorage temp_storage; + bscan_t(temp_storage).ExclusiveScan(in_out, in_out, init, hipcub::Sum()); + + // store + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + device_output[index + j] = in_out[j]; + } +} + +TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScan) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::type; + // for bfloat16 and half we use double for host-side accumulation + using binary_op_type_host = typename test_utils::select_plus_operator_host::type; + binary_op_type_host binary_op_host; + using acc_type = typename test_utils::select_plus_operator_host::acc_type; + + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; + constexpr size_t items_per_thread = TestFixture::items_per_thread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + std::vector output + = test_utils::get_random_data(size, + test_utils::convert_to_device(2), + test_utils::convert_to_device(200), + seed_value); + const T init = test_utils::get_random_value(test_utils::convert_to_device(0), + test_utils::convert_to_device(100), + seed_value + seed_value_addition); + + // Calculate expected results on host + std::vector expected(output.size(), test_utils::convert_to_device(0)); + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + acc_type accumulator(init); + expected[i * items_per_block] = init; + for(size_t j = 1; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + accumulator = binary_op_host(static_cast(output[idx - 1]), accumulator); + expected[idx] = static_cast(accumulator); + } + } + + // Writing to device memory + T* device_output; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); + + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); + + // Launching kernel + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + exclusive_scan_array_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + init); + + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + // Validating results + test_utils::assert_near(output, expected, test_utils::precision::value * block_size); + + HIP_CHECK(hipFree(device_output)); + } +} + +template +__global__ __launch_bounds__(BlockSize) +void exclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reductions, T init) +{ + const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + // load + T in_out[ItemsPerThread]; + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + in_out[j] = device_output[index + j]; + } + + using bscan_t = hipcub::BlockScan; + __shared__ typename bscan_t::TempStorage temp_storage; + T reduction; + bscan_t(temp_storage).ExclusiveScan(in_out, in_out, init, hipcub::Sum(), reduction); + + // store + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + device_output[index + j] = in_out[j]; + } + + if(hipThreadIdx_x == 0) + { + device_output_reductions[hipBlockIdx_x] = reduction; + } +} + +TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanReduce) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::type; + // for bfloat16 and half we use double for host-side accumulation + using binary_op_type_host = typename test_utils::select_plus_operator_host::type; + binary_op_type_host binary_op_host; + using acc_type = typename test_utils::select_plus_operator_host::acc_type; + + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; + constexpr size_t items_per_thread = TestFixture::items_per_thread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + std::vector output + = test_utils::get_random_data(size, + test_utils::convert_to_device(2), + test_utils::convert_to_device(200), + seed_value); + + // Output reduce results + std::vector output_reductions(size / block_size, test_utils::convert_to_device(0)); + const T init = test_utils::get_random_value(test_utils::convert_to_device(0), + test_utils::convert_to_device(100), + seed_value + seed_value_addition); + + // Calculate expected results on host + std::vector expected(output.size(), test_utils::convert_to_device(0)); + std::vector expected_reductions(output_reductions.size(), + test_utils::convert_to_device(0)); + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + acc_type accumulator(init); + expected[i * items_per_block] = init; + for(size_t j = 1; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + accumulator = binary_op_host(static_cast(output[idx - 1]), accumulator); + expected[idx] = static_cast(accumulator); + } + + acc_type accumulator_reductions(0); + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + accumulator_reductions + = binary_op_host(static_cast(output[idx]), accumulator_reductions); + expected_reductions[i] = static_cast(accumulator_reductions); + } + } + + // Writing to device memory + T* device_output; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_reductions; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output_reductions, + output_reductions.size() * sizeof(typename decltype(output_reductions)::value_type))); + + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); + + HIP_CHECK(hipMemset(device_output_reductions, + test_utils::convert_to_device(0), + output_reductions.size() * sizeof(T))); + + // Launching kernel + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + exclusive_scan_reduce_array_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions, + init); + + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + // Validating results + test_utils::assert_near(output, expected, test_utils::precision::value * block_size); + + test_utils::assert_near(output_reductions, + expected_reductions, + test_utils::precision::value * block_size); + } +} + +template +__global__ __launch_bounds__(BlockSize) +void exclusive_scan_prefix_callback_array_kernel(T* device_output, + T* device_output_bp, + T block_prefix) +{ + const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + T prefix_value = block_prefix; + auto prefix_callback = [&prefix_value](T reduction) + { + T prefix = prefix_value; + prefix_value = prefix_value + reduction; + return prefix; + }; + + // load + T in_out[ItemsPerThread]; + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + in_out[j] = device_output[index + j]; + } + + using bscan_t = hipcub::BlockScan; + __shared__ typename bscan_t::TempStorage temp_storage; + bscan_t(temp_storage).ExclusiveScan(in_out, in_out, hipcub::Sum(), prefix_callback); + + // store + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + device_output[index + j] = in_out[j]; + } + + if(hipThreadIdx_x == 0) + { + device_output_bp[hipBlockIdx_x] = prefix_value; + } +} + +TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanPrefixCallback) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::type; + // for bfloat16 and half we use double for host-side accumulation + using binary_op_type_host = typename test_utils::select_plus_operator_host::type; + binary_op_type_host binary_op_host; + using acc_type = typename test_utils::select_plus_operator_host::acc_type; + + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; + constexpr size_t items_per_thread = TestFixture::items_per_thread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data(size, 2, 200, seed_value); + std::vector output_block_prefixes(size / items_per_block); + T block_prefix = test_utils::get_random_value(test_utils::convert_to_device(0), + test_utils::convert_to_device(100), + seed_value + seed_value_addition); + + // Calculate expected results on host + std::vector expected(output.size(), test_utils::convert_to_device(0)); + std::vector expected_block_prefixes(output_block_prefixes.size(), + test_utils::convert_to_device(0)); + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + acc_type accumulator(block_prefix); + expected[i * items_per_block] = block_prefix; + for(size_t j = 1; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + accumulator = binary_op_host(static_cast(output[idx - 1]), accumulator); + expected[idx] = static_cast(accumulator); + } + acc_type accumulator_block_prefixes(block_prefix); + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + accumulator_block_prefixes = binary_op_host(static_cast(output[idx]), + accumulator_block_prefixes); + expected_block_prefixes[i] = static_cast(accumulator_block_prefixes); + } + } + + // Writing to device memory + T* device_output; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_bp; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output_bp, + output_block_prefixes.size() + * sizeof(typename decltype(output_block_prefixes)::value_type))); + + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); + + // Launching kernel + hipLaunchKernelGGL( + HIP_KERNEL_NAME(exclusive_scan_prefix_callback_array_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_bp, + block_prefix); + + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipMemcpy(output_block_prefixes.data(), + device_output_bp, + output_block_prefixes.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + // Validating results + test_utils::assert_near(output, expected, test_utils::precision::value * block_size); + + test_utils::assert_near(output_block_prefixes, + expected_block_prefixes, + test_utils::precision::value * block_size); + + HIP_CHECK(hipFree(device_output)); + HIP_CHECK(hipFree(device_output_bp)); + } +} + +template +__global__ __launch_bounds__(BlockSize) +void inclusive_sum_array_kernel(T* device_output) +{ + const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + + // load + T in_out[ItemsPerThread]; + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + in_out[j] = device_output[index + j]; + } + + using bscan_t = hipcub::BlockScan; + __shared__ typename bscan_t::TempStorage temp_storage; + bscan_t(temp_storage).InclusiveSum(in_out, in_out); + + // store + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + device_output[index + j] = in_out[j]; + } +} + +TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveSum) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::type; + // for bfloat16 and half we use double for host-side accumulation + using binary_op_type_host = typename test_utils::select_plus_operator_host::type; + binary_op_type_host binary_op_host; + using acc_type = typename test_utils::select_plus_operator_host::acc_type; + + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; + constexpr size_t items_per_thread = TestFixture::items_per_thread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data(size, 2, 200, seed_value); + + // Calculate expected results on host + std::vector expected(output.size(), test_utils::convert_to_device(0)); + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + acc_type accumulator(0); + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + accumulator = binary_op_host(static_cast(output[idx]), accumulator); + expected[idx] = static_cast(accumulator); + } + } + + // Writing to device memory + T* device_output; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); + + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); + + // Launching kernel + hipLaunchKernelGGL( + HIP_KERNEL_NAME(inclusive_sum_array_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output); + + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + // Validating results + test_utils::assert_near(output, expected, test_utils::precision::value * block_size); + + HIP_CHECK(hipFree(device_output)); + } +} + +template +__global__ __launch_bounds__(BlockSize) +void inclusive_sum_reduce_array_kernel(T* device_output, T* device_output_reductions) +{ + const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + + // load + T in_out[ItemsPerThread]; + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + in_out[j] = device_output[index + j]; + } + + using bscan_t = hipcub::BlockScan; + __shared__ typename bscan_t::TempStorage temp_storage; + T reduction; + bscan_t(temp_storage).InclusiveSum(in_out, in_out, reduction); + + // store + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + device_output[index + j] = in_out[j]; + } + + if(hipThreadIdx_x == 0) + { + device_output_reductions[hipBlockIdx_x] = reduction; + } +} + +TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveSumReduce) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::type; + // for bfloat16 and half we use double for host-side accumulation + using binary_op_type_host = typename test_utils::select_plus_operator_host::type; + binary_op_type_host binary_op_host; + using acc_type = typename test_utils::select_plus_operator_host::acc_type; + + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; + constexpr size_t items_per_thread = TestFixture::items_per_thread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data(size, 2, 200, seed_value); + + // Output reduce results + std::vector output_reductions(size / block_size, test_utils::convert_to_device(0)); + + // Calculate expected results on host + std::vector expected(output.size(), test_utils::convert_to_device(0)); + std::vector expected_reductions(output_reductions.size(), + test_utils::convert_to_device(0)); + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + acc_type accumulator(0); + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + accumulator = binary_op_host(static_cast(output[idx]), accumulator); + expected[idx] = static_cast(accumulator); + } + expected_reductions[i] = expected[(i + 1) * items_per_block - 1]; + } + + // Writing to device memory + T* device_output; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output, + output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_reductions; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_output_reductions, + output_reductions.size() * sizeof(typename decltype(output_reductions)::value_type))); + + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(T), + hipMemcpyHostToDevice)); + + HIP_CHECK(hipMemset(device_output_reductions, + test_utils::convert_to_device(0), + output_reductions.size() * sizeof(T))); + + // Launching kernel + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + inclusive_sum_reduce_array_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions); + + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); + + // Validating results + test_utils::assert_near(output, expected, test_utils::precision::value * block_size); + + test_utils::assert_near(output_reductions, + expected_reductions, + test_utils::precision::value * block_size); + + HIP_CHECK(hipFree(device_output)); + HIP_CHECK(hipFree(device_output_reductions)); + } +} + +template +__global__ __launch_bounds__(BlockSize) +void inclusive_sum_array_prefix_callback_kernel(T* device_output, + T* device_output_bp, + T block_prefix) +{ + const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + T prefix_value = block_prefix; + auto prefix_callback = [&prefix_value](T reduction) + { + T prefix = prefix_value; + prefix_value = prefix_value + reduction; + return prefix; + }; + + // load + T in_out[ItemsPerThread]; + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + in_out[j] = device_output[index + j]; + } + + using bscan_t = hipcub::BlockScan; + __shared__ typename bscan_t::TempStorage temp_storage; + bscan_t(temp_storage).InclusiveSum(in_out, in_out, prefix_callback); + + // store + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + device_output[index + j] = in_out[j]; + } + + if(hipThreadIdx_x == 0) + { + device_output_bp[hipBlockIdx_x] = prefix_value; + } +} + +TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveSumPrefixCallback) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::type; + // for bfloat16 and half we use double for host-side accumulation + using binary_op_type_host = typename test_utils::select_plus_operator_host::type; + binary_op_type_host binary_op_host; + using acc_type = typename test_utils::select_plus_operator_host::acc_type; + + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; + constexpr size_t items_per_thread = TestFixture::items_per_thread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data(size, 2, 200, seed_value); + std::vector output_block_prefixes(size / items_per_block, + test_utils::convert_to_device(0)); + T block_prefix = test_utils::get_random_value(test_utils::convert_to_device(0), + test_utils::convert_to_device(100), + seed_value + seed_value_addition); + + // Calculate expected results on host + std::vector expected(output.size(), test_utils::convert_to_device(0)); + std::vector expected_block_prefixes(output_block_prefixes.size(), + test_utils::convert_to_device(0)); for(size_t i = 0; i < output.size() / items_per_block; i++) { acc_type accumulator(block_prefix); @@ -1560,10 +3112,10 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanPrefixCallback) // Launching kernel hipLaunchKernelGGL( - HIP_KERNEL_NAME(block_inclusive_scan_array_prefix_callback_kernel), + HIP_KERNEL_NAME(inclusive_sum_array_prefix_callback_kernel), dim3(grid_size), dim3(block_size), 0, @@ -1602,7 +3154,8 @@ template -__global__ __launch_bounds__(BlockSize) void exclusive_scan_array_kernel(T* device_output, T init) +__global__ __launch_bounds__(BlockSize) +void exclusive_sum_array_kernel(T* device_output) { const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; // load @@ -1614,7 +3167,7 @@ __global__ __launch_bounds__(BlockSize) void exclusive_scan_array_kernel(T* devi using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - bscan_t(temp_storage).ExclusiveScan(in_out, in_out, init, hipcub::Sum()); + bscan_t(temp_storage).ExclusiveSum(in_out, in_out); // store for(unsigned int j = 0; j < ItemsPerThread; j++) @@ -1623,7 +3176,7 @@ __global__ __launch_bounds__(BlockSize) void exclusive_scan_array_kernel(T* devi } } -TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScan) +TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveSum) { int device_id = test_common_utils::obtain_device_from_ctest(); SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); @@ -1661,9 +3214,7 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScan) test_utils::convert_to_device(2), test_utils::convert_to_device(200), seed_value); - const T init = test_utils::get_random_value(test_utils::convert_to_device(0), - test_utils::convert_to_device(100), - seed_value + seed_value_addition); + const T init = static_cast(0); // Calculate expected results on host std::vector expected(output.size(), test_utils::convert_to_device(0)); @@ -1692,14 +3243,12 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScan) // Launching kernel hipLaunchKernelGGL( - HIP_KERNEL_NAME( - exclusive_scan_array_kernel), + HIP_KERNEL_NAME(exclusive_sum_array_kernel), dim3(grid_size), dim3(block_size), 0, 0, - device_output, - init); + device_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); @@ -1721,8 +3270,8 @@ template -__global__ __launch_bounds__(BlockSize) void exclusive_scan_reduce_array_kernel( - T* device_output, T* device_output_reductions, T init) +__global__ __launch_bounds__(BlockSize) +void exclusive_sum_reduce_array_kernel(T* device_output, T* device_output_reductions) { const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; // load @@ -1735,7 +3284,7 @@ __global__ __launch_bounds__(BlockSize) void exclusive_scan_reduce_array_kernel( using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; T reduction; - bscan_t(temp_storage).ExclusiveScan(in_out, in_out, init, hipcub::Sum(), reduction); + bscan_t(temp_storage).ExclusiveSum(in_out, in_out, reduction); // store for(unsigned int j = 0; j < ItemsPerThread; j++) @@ -1749,7 +3298,7 @@ __global__ __launch_bounds__(BlockSize) void exclusive_scan_reduce_array_kernel( } } -TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanReduce) +TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveSumReduce) { int device_id = test_common_utils::obtain_device_from_ctest(); SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); @@ -1790,9 +3339,7 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanReduce) // Output reduce results std::vector output_reductions(size / block_size, test_utils::convert_to_device(0)); - const T init = test_utils::get_random_value(test_utils::convert_to_device(0), - test_utils::convert_to_device(100), - seed_value + seed_value_addition); + const T init = static_cast(0); // Calculate expected results on host std::vector expected(output.size(), test_utils::convert_to_device(0)); @@ -1841,14 +3388,13 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanReduce) // Launching kernel hipLaunchKernelGGL( HIP_KERNEL_NAME( - exclusive_scan_reduce_array_kernel), + exclusive_sum_reduce_array_kernel), dim3(grid_size), dim3(block_size), 0, 0, device_output, - device_output_reductions, - init); + device_output_reductions); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); @@ -1877,8 +3423,10 @@ template -__global__ __launch_bounds__(BlockSize) void exclusive_scan_prefix_callback_array_kernel( - T* device_output, T* device_output_bp, T block_prefix) +__global__ __launch_bounds__(BlockSize) +void exclusive_sum_prefix_callback_array_kernel(T* device_output, + T* device_output_bp, + T block_prefix) { const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; T prefix_value = block_prefix; @@ -1898,7 +3446,7 @@ __global__ __launch_bounds__(BlockSize) void exclusive_scan_prefix_callback_arra using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage temp_storage; - bscan_t(temp_storage).ExclusiveScan(in_out, in_out, hipcub::Sum(), prefix_callback); + bscan_t(temp_storage).ExclusiveSum(in_out, in_out, prefix_callback); // store for(unsigned int j = 0; j < ItemsPerThread; j++) @@ -1912,7 +3460,7 @@ __global__ __launch_bounds__(BlockSize) void exclusive_scan_prefix_callback_arra } } -TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanPrefixCallback) +TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveSumPrefixCallback) { int device_id = test_common_utils::obtain_device_from_ctest(); SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); @@ -1968,7 +3516,7 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanPrefixCallback) acc_type accumulator_block_prefixes(block_prefix); for(size_t j = 0; j < items_per_block; j++) { - auto idx = i * items_per_block + j; + auto idx = i * items_per_block + j; accumulator_block_prefixes = binary_op_host(static_cast(output[idx]), accumulator_block_prefixes); expected_block_prefixes[i] = static_cast(accumulator_block_prefixes); @@ -1993,10 +3541,10 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanPrefixCallback) // Launching kernel hipLaunchKernelGGL( - HIP_KERNEL_NAME(exclusive_scan_prefix_callback_array_kernel), + HIP_KERNEL_NAME(exclusive_sum_prefix_callback_array_kernel), dim3(grid_size), dim3(block_size), 0, @@ -2029,4 +3577,4 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanPrefixCallback) HIP_CHECK(hipFree(device_output)); HIP_CHECK(hipFree(device_output_bp)); } -} +} \ No newline at end of file diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_shuffle.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_shuffle.cpp index 500ffd69b2a..f0eb9d54953 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_shuffle.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_shuffle.cpp @@ -24,8 +24,8 @@ // required hipcub headers #include -#include #include +#include // #include // required test headers @@ -34,21 +34,18 @@ #include // Params for tests -template< - class T, - unsigned int BlockSize = 256U -> +template struct params { - using type = T; + using type = T; static constexpr unsigned int block_size = BlockSize; }; - template -class HipcubBlockShuffleTests : public ::testing::Test { +class HipcubBlockShuffleTests : public ::testing::Test +{ public: - using type = typename Params::type; + using type = typename Params::type; static constexpr unsigned int block_size = Params::block_size; }; @@ -78,21 +75,14 @@ using SingleValueTestParams = ::testing::Types< TYPED_TEST_SUITE(HipcubBlockShuffleTests, SingleValueTestParams); -template< - unsigned int BlockSize, - class T -> +template __global__ __launch_bounds__(BlockSize) void shuffle_offset_kernel(T* device_input, T* device_output, int distance) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - hipcub::BlockShuffle b_shuffle; - b_shuffle.Offset( - device_input[index], - device_output[index], - distance - ); + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + hipcub::BlockShuffle b_shuffle; + b_shuffle.Offset(device_input[index], device_output[index], distance); } TYPED_TEST(HipcubBlockShuffleTests, BlockOffset) @@ -101,15 +91,18 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockOffset) SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); HIP_CHECK(hipSetDevice(device_id)); - using type = typename TestFixture::type; + using type = typename TestFixture::type; const size_t block_size = TestFixture::block_size; - const size_t size = block_size * 1134; - const size_t grid_size = size / block_size; - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + const size_t size = block_size * 1134; + const size_t grid_size = size / block_size; + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; - int distance = rand() % std::min(size_t(10), block_size/2) - std::min(size_t(10), block_size/2); - SCOPED_TRACE(testing::Message() << "with seed= " << seed_value <<" & distance = "< +#include template(); -} \ No newline at end of file +} From 19f99d45afebc47c4db8d41819e38641fd170708 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 5 Aug 2025 08:34:05 -0700 Subject: [PATCH 05/10] nuiked .jenkins to fix mci --- projects/hipcub/.jenkins/common.groovy | 79 ----------------- projects/hipcub/.jenkins/precheckin.groovy | 84 ------------------- .../hipcub/.jenkins/staticanalysis.groovy | 64 -------------- projects/hipcub/.jenkins/staticlibrary.groovy | 82 ------------------ 4 files changed, 309 deletions(-) delete mode 100644 projects/hipcub/.jenkins/common.groovy delete mode 100644 projects/hipcub/.jenkins/precheckin.groovy delete mode 100644 projects/hipcub/.jenkins/staticanalysis.groovy delete mode 100644 projects/hipcub/.jenkins/staticlibrary.groovy diff --git a/projects/hipcub/.jenkins/common.groovy b/projects/hipcub/.jenkins/common.groovy deleted file mode 100644 index ae8f7978256..00000000000 --- a/projects/hipcub/.jenkins/common.groovy +++ /dev/null @@ -1,79 +0,0 @@ -// This file is for internal AMD use. -// If you are interested in running your own Jenkins, please raise a github issue for assistance. - -def runCompileCommand(platform, project, jobName, boolean debug=false, boolean sameOrg=true) -{ - project.paths.construct_build_prefix() - - String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release' - String buildTypeDir = debug ? 'debug' : 'release' - String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' - //Set CI node's gfx arch as target if PR, otherwise use default targets of the library - String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' - - def getRocPRIM = auxiliary.getLibrary('rocPRIM', platform.jenkinsLabel, null, sameOrg) - - def command = """#!/usr/bin/env bash - set -x - ${getRocPRIM} - cd ${project.paths.project_build_prefix} - mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} - ${auxiliary.gfxTargetParser()} - ${cmake} --toolchain=toolchain-linux.cmake ${buildTypeArg} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../.. - make -j\$(nproc) - """ - - platform.runCommand(this, command) -} - - -def runTestCommand (platform, project, boolean rocmExamples=false) -{ - String sudo = auxiliary.sudo(platform.jenkinsLabel) - - def testCommand = "ctest --output-on-failure --verbose --timeout 900" - def command = """#!/usr/bin/env bash - set -x - cd ${project.paths.project_build_prefix} - cd ${project.testDirectory} - ${sudo} LD_LIBRARY_PATH=/opt/rocm/lib ${testCommand} - """ - - platform.runCommand(this, command) - if (rocmExamples){ - String buildString = "" - if (platform.os.contains("ubuntu")){ - buildString += "sudo dpkg -i *.deb" - } - else { - buildString += "sudo rpm -i *.rpm" - } - testCommand = """#!/usr/bin/env bash - set -ex - cd ${project.paths.project_build_prefix}/build/release/package - ${buildString} - cd ../../.. - testDirs=("Libraries/hipCUB") - git clone https://github.com/ROCm/rocm-examples.git - rocm_examples_dir=\$(readlink -f rocm-examples) - for testDir in \${testDirs[@]}; do - cd \${rocm_examples_dir}/\${testDir} - cmake -S . -B build - cmake --build build - cd ./build - ctest --output-on-failure - done - """ - platform.runCommand(this, testCommand, "ROCM Examples") - } -} - -def runPackageCommand(platform, project) -{ - def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release") - - platform.runCommand(this, packageHelper[0]) - platform.archiveArtifacts(this, packageHelper[1]) -} - -return this diff --git a/projects/hipcub/.jenkins/precheckin.groovy b/projects/hipcub/.jenkins/precheckin.groovy deleted file mode 100644 index 70f2bb54a76..00000000000 --- a/projects/hipcub/.jenkins/precheckin.groovy +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env groovy -// This shared library is available at https://github.com/ROCm/rocJENKINS/ -@Library('rocJenkins@pong') _ - -// This file is for internal AMD use. -// If you are interested in running your own Jenkins, please raise a github issue for assistance. - -import com.amd.project.* -import com.amd.docker.* -import java.nio.file.Path; - -def runCI = -{ - nodeDetails, jobName-> - - def prj = new rocProject('hipCUB', 'PreCheckin') - prj.timeout.compile = 400 - // Define test architectures, optional rocm version argument is available - def nodes = new dockerNodes(nodeDetails, jobName, prj) - - boolean formatCheck = false - - def commonGroovy - - def compileCommand = - { - platform, project-> - - commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" - commonGroovy.runCompileCommand(platform, project, jobName) - } - - def testCommand = - { - platform, project-> - - commonGroovy.runTestCommand(platform, project, true) - } - - def packageCommand = - { - platform, project-> - - commonGroovy.runPackageCommand(platform, project) - } - - buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) -} - -ci: { - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) - - def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] - propertyList = auxiliary.appendPropertyList(propertyList) - - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] - jobNameList = auxiliary.appendJobNameList(jobNameList) - - auxiliary.registerDependencyBranchParameter(["rocPRIM"]) - - propertyList.each - { - jobName, property-> - if (urlJobName == jobName) - properties(auxiliary.addCommonProperties(property)) - } - - Set seenJobNames = [] - jobNameList.each - { - jobName, nodeDetails-> - seenJobNames.add(jobName) - if (urlJobName == jobName) - runCI(nodeDetails, jobName) - } - - // For url job names that are outside of the standardJobNameSet i.e. compute-rocm-dkms-no-npi-1901 - if(!seenJobNames.contains(urlJobName)) - { - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) - runCI([ubuntu16:['gfx906']], urlJobName) - } -} - diff --git a/projects/hipcub/.jenkins/staticanalysis.groovy b/projects/hipcub/.jenkins/staticanalysis.groovy deleted file mode 100644 index 4e2237ab1a8..00000000000 --- a/projects/hipcub/.jenkins/staticanalysis.groovy +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env groovy -// This shared library is available at https://github.com/ROCm/rocJENKINS/ -@Library('rocJenkins@pong') _ - -// This is file for internal AMD use. -// If you are interested in running your own Jenkins, please raise a github issue for assistance. - -import com.amd.project.* -import com.amd.docker.* -import java.nio.file.Path - -def runCompileCommand(platform, project, jobName, boolean debug=false) -{ - project.paths.construct_build_prefix() -} - -def runCI = -{ - nodeDetails, jobName-> - - def prj = new rocProject('hipCUB', 'StaticAnalysis') - - // Define test architectures, optional rocm version argument is available - def nodes = new dockerNodes(nodeDetails, jobName, prj) - - boolean formatCheck = false - boolean staticAnalysis = true - - def compileCommand = - { - platform, project-> - - runCompileCommand(platform, project, jobName, false) - } - - buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis) -} - -ci: { - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) - - def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], - "rocm-docker":[]] - propertyList = auxiliary.appendPropertyList(propertyList) - - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":[]] - jobNameList = auxiliary.appendJobNameList(jobNameList) - - propertyList.each - { - jobName, property-> - if (urlJobName == jobName) - properties(auxiliary.addCommonProperties(property)) - } - - jobNameList.each - { - jobName, nodeDetails-> - if (urlJobName == jobName) - stage(jobName) { - runCI(nodeDetails, jobName) - } - } -} diff --git a/projects/hipcub/.jenkins/staticlibrary.groovy b/projects/hipcub/.jenkins/staticlibrary.groovy deleted file mode 100644 index 549913d8cbb..00000000000 --- a/projects/hipcub/.jenkins/staticlibrary.groovy +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env groovy -@Library('rocJenkins@pong') _ -import com.amd.project.* -import com.amd.docker.* -import java.nio.file.Path; - -def runCI = -{ - nodeDetails, jobName-> - - def prj = new rocProject('hipCUB', 'Static Library PreCheckin') - - def nodes = new dockerNodes(nodeDetails, jobName, prj) - - def commonGroovy - - boolean formatCheck = false - - def compileCommand = - { - platform, project-> - - commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" - commonGroovy.runCompileCommand(platform, project, jobName, false, true) - } - - - def testCommand = - { - platform, project-> - - commonGroovy.runTestCommand(platform, project) - } - - def packageCommand = - { - platform, project-> - - commonGroovy.runPackageCommand(platform, project) - } - - buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) -} - -ci: { - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) - - def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], - "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], - "rocm-docker":[]] - propertyList = auxiliary.appendPropertyList(propertyList) - - def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]), - "compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]), - "rocm-docker":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']])] - jobNameList = auxiliary.appendJobNameList(jobNameList) - - propertyList.each - { - jobName, property-> - if (urlJobName == jobName) - properties(auxiliary.addCommonProperties(property)) - } - - jobNameList.each - { - jobName, nodeDetails-> - if (urlJobName == jobName) - stage(jobName) { - runCI(nodeDetails, jobName) - } - } - - // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 - if(!jobNameList.keySet().contains(urlJobName)) - { - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) - stage(urlJobName) { - runCI([ubuntu16:['gfx906']], urlJobName) - } - } -} From 32ae8082fecdf682e6d0c073fba5bd185f6042b3 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 6 Aug 2025 12:07:11 -0700 Subject: [PATCH 06/10] implemented fall back implementation for std::exclusive scane for gcc < 9 (debian10) --- .../hipcub/test_hipcub_block_radix_rank.cpp | 32 ++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp index cca727699d2..14748a17ff2 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp @@ -605,6 +605,21 @@ void rank_with_prefix_sum_kernel(const KeyType* keys_input, } } +#if defined(_GLIBCXX_RELEASE) && (GLIBCXX_RELEASE < 9) + +template +void exclusive_scan(It first, It last, OutIt out, T init) +{ + // Fallback implementation for exclusive scan if gcc version is < 9 + for (; first != last; ++first) + { + *out++ = init; + init += *first; + } +} + +#endif + template void test_radix_rank_with_prefix_sum_output() { @@ -703,10 +718,19 @@ void test_radix_rank_with_prefix_sum_output() ++histogram[bit_rep]; } - std::exclusive_scan(histogram.begin(), - histogram.end(), - pfs_expected.begin() + pfs_offset, - 0); + + #if defined(_GLIBCXX_RELEASE) && (GLIBCXX_RELEASE >= 9) + std::exclusive_scan(histogram.begin(), + histogram.end(), + pfs_expected.begin() + pfs_offset, + 0); + #else + exclusive_scan(histogram.begin(), + histogram.end(), + pfs_expected.begin() + pfs_offset, + 0); + #endif + } // Preparing device From eb079869ae1692fafbbc11808a53f90139249f51 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 6 Aug 2025 12:14:15 -0700 Subject: [PATCH 07/10] brought back rocprim .jenkins (these should get deleted in rocprim PR not hipcub) --- projects/rocprim/.jenkins/common.groovy | 105 ++++++++++++++++++++ projects/rocprim/.jenkins/precheckin.groovy | 81 +++++++++++++++ projects/rocprim/.jenkins/static.groovy | 82 +++++++++++++++ 3 files changed, 268 insertions(+) create mode 100644 projects/rocprim/.jenkins/common.groovy create mode 100644 projects/rocprim/.jenkins/precheckin.groovy create mode 100644 projects/rocprim/.jenkins/static.groovy diff --git a/projects/rocprim/.jenkins/common.groovy b/projects/rocprim/.jenkins/common.groovy new file mode 100644 index 00000000000..0ffd1dee600 --- /dev/null +++ b/projects/rocprim/.jenkins/common.groovy @@ -0,0 +1,105 @@ +// This file is for internal AMD use. +// If you are interested in running your own Jenkins, please raise a github issue for assistance. + +def runCompileCommand(platform, project, jobName, boolean debug=false, boolean staticLibrary=false) +{ + project.paths.construct_build_prefix() + + String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release' + String buildStatic = staticLibrary ? '-DBUILD_SHARED_LIBS=OFF' : '-DBUILD_SHARED_LIBS=ON' + String buildTypeDir = debug ? 'debug' : 'release' + String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' + //Set CI node's gfx arch as target if PR, otherwise use default targets of the library + String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' + + def command = """#!/usr/bin/env bash + set -x + cd ${project.paths.project_build_prefix} + mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} + ${auxiliary.gfxTargetParser()} + ${cmake} --toolchain=toolchain-linux.cmake ${buildTypeArg} ${buildStatic} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../.. + make -j\$(nproc) + """ + + platform.runCommand(this, command) +} + + +def runTestCommand (platform, project, boolean rocmExamples=false) +{ + String sudo = auxiliary.sudo(platform.jenkinsLabel) + + def testCommand = "ctest --output-on-failure " + def testCommandExcludeRegex = /(rocprim.block_histogram)/ + def testCommandExclude = "--exclude-regex \"${testCommandExcludeRegex}\"" + def hmmExcludeRegex = '' + def hmmTestCommandExclude = "--exclude-regex \"${hmmExcludeRegex}\"" + def hmmTestCommand = '' + if (platform.jenkinsLabel.contains('gfx90a')) + { + echo("HMM TESTS DISABLED") + /*hmmTestCommand = """ + export HSA_XNACK=1 + export ROCPRIM_USE_HMM=1 + ${testCommand} ${hmmTestCommandExclude} + """*/ + } + echo(env.JOB_NAME) + if (env.JOB_NAME.contains('bleeding-edge')) + { + testCommand = '' + testCommandExclude = '' + hmmTestCommand = '' + echo("TESTS DISABLED") + } + def command = """#!/usr/bin/env bash + set -x + cd ${project.paths.project_build_prefix} + cd ${project.testDirectory} + ${testCommand} ${testCommandExclude} + if (( \$? != 0 )); then + exit 1 + fi + ${hmmTestCommand} + """ + platform.runCommand(this, command) + //ROCM Examples + if (rocmExamples){ + String buildString = "" + if (platform.os.contains("ubuntu")){ + buildString += "sudo dpkg -i *.deb" + } + else { + buildString += "sudo rpm -i *.rpm" + } + testCommand = """#!/usr/bin/env bash + set -ex + cd ${project.paths.project_build_prefix}/build/release/package + ls + ${buildString} + cd ../../.. + testDirs=("Libraries/rocPRIM") + git clone https://github.com/ROCm/rocm-examples.git + rocm_examples_dir=\$(readlink -f rocm-examples) + for testDir in \${testDirs[@]}; do + cd \${rocm_examples_dir}/\${testDir} + cmake -S . -B build + cmake --build build + cd ./build + ctest --output-on-failure + done + """ + platform.runCommand(this, testCommand, "ROCM Examples") + + } +} + +def runPackageCommand(platform, project) +{ + def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release") + + platform.runCommand(this, packageHelper[0]) + platform.archiveArtifacts(this, packageHelper[1]) +} + +return this diff --git a/projects/rocprim/.jenkins/precheckin.groovy b/projects/rocprim/.jenkins/precheckin.groovy new file mode 100644 index 00000000000..bbb8274743c --- /dev/null +++ b/projects/rocprim/.jenkins/precheckin.groovy @@ -0,0 +1,81 @@ +#!/usr/bin/env groovy +@Library('rocJenkins@pong') _ +import com.amd.project.* +import com.amd.docker.* +import java.nio.file.Path; + +def runCI = +{ + nodeDetails, jobName-> + + def prj = new rocProject('rocPRIM', 'PreCheckin') + prj.paths.build_command = './install -c' + prj.timeout.compile = 600 + + def nodes = new dockerNodes(nodeDetails, jobName, prj) + + def commonGroovy + + boolean formatCheck = false + + def compileCommand = + { + platform, project-> + + commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" + commonGroovy.runCompileCommand(platform, project, jobName) + } + + def testCommand = + { + platform, project-> + + commonGroovy.runTestCommand(platform, project, true) + } + + def packageCommand = + { + platform, project-> + + commonGroovy.runPackageCommand(platform, project) + } + + buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) +} + +ci: { + String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) + + def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], + "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], + "rocm-docker":[]] + propertyList = auxiliary.appendPropertyList(propertyList) + + def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] + jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM') + + propertyList.each + { + jobName, property-> + if (urlJobName == jobName) + properties(auxiliary.addCommonProperties(property)) + } + + jobNameList.each + { + jobName, nodeDetails-> + if (urlJobName == jobName) + stage(jobName) { + runCI(nodeDetails, jobName) + } + } + + // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 + if(!jobNameList.keySet().contains(urlJobName)) + { + properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) + stage(urlJobName) { + runCI([ubuntu16:['gfx906']], urlJobName) + } + } +} diff --git a/projects/rocprim/.jenkins/static.groovy b/projects/rocprim/.jenkins/static.groovy new file mode 100644 index 00000000000..75606419fdf --- /dev/null +++ b/projects/rocprim/.jenkins/static.groovy @@ -0,0 +1,82 @@ +#!/usr/bin/env groovy +@Library('rocJenkins@pong') _ +import com.amd.project.* +import com.amd.docker.* +import java.nio.file.Path; + +def runCI = +{ + nodeDetails, jobName-> + + def prj = new rocProject('rocPRIM', 'static') + prj.paths.build_command = './install -c -s' + prj.timeout.compile = 600 + prj.timeout.packaging = 120 + + def nodes = new dockerNodes(nodeDetails, jobName, prj) + + def commonGroovy + + boolean formatCheck = false + + def compileCommand = + { + platform, project-> + + commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" + commonGroovy.runCompileCommand(platform, project, jobName, debug=false, staticLibrary=true) + } + + def testCommand = + { + platform, project-> + + commonGroovy.runTestCommand(platform, project) + } + + def packageCommand = + { + platform, project-> + + commonGroovy.runPackageCommand(platform, project) + } + + buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) +} + +ci: { + String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) + + def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], + "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], + "rocm-docker":[]] + propertyList = auxiliary.appendPropertyList(propertyList) + + def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] + jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM') + + propertyList.each + { + jobName, property-> + if (urlJobName == jobName) + properties(auxiliary.addCommonProperties(property)) + } + + jobNameList.each + { + jobName, nodeDetails-> + if (urlJobName == jobName) + stage(jobName) { + runCI(nodeDetails, jobName) + } + } + + // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 + if(!jobNameList.keySet().contains(urlJobName)) + { + properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) + stage(urlJobName) { + runCI([ubuntu16:['gfx906']], urlJobName) + } + } +} From 4128a344e5098b5a498cea47491b155fb8ff44d6 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 6 Aug 2025 12:14:56 -0700 Subject: [PATCH 08/10] removed .jenkins again (to accomodate the rocprim fix) --- projects/rocprim/.jenkins/common.groovy | 105 -------------------- projects/rocprim/.jenkins/precheckin.groovy | 81 --------------- projects/rocprim/.jenkins/static.groovy | 82 --------------- 3 files changed, 268 deletions(-) delete mode 100644 projects/rocprim/.jenkins/common.groovy delete mode 100644 projects/rocprim/.jenkins/precheckin.groovy delete mode 100644 projects/rocprim/.jenkins/static.groovy diff --git a/projects/rocprim/.jenkins/common.groovy b/projects/rocprim/.jenkins/common.groovy deleted file mode 100644 index 0ffd1dee600..00000000000 --- a/projects/rocprim/.jenkins/common.groovy +++ /dev/null @@ -1,105 +0,0 @@ -// This file is for internal AMD use. -// If you are interested in running your own Jenkins, please raise a github issue for assistance. - -def runCompileCommand(platform, project, jobName, boolean debug=false, boolean staticLibrary=false) -{ - project.paths.construct_build_prefix() - - String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release' - String buildStatic = staticLibrary ? '-DBUILD_SHARED_LIBS=OFF' : '-DBUILD_SHARED_LIBS=ON' - String buildTypeDir = debug ? 'debug' : 'release' - String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' - //Set CI node's gfx arch as target if PR, otherwise use default targets of the library - String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' - - def command = """#!/usr/bin/env bash - set -x - cd ${project.paths.project_build_prefix} - mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} - ${auxiliary.gfxTargetParser()} - ${cmake} --toolchain=toolchain-linux.cmake ${buildTypeArg} ${buildStatic} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../.. - make -j\$(nproc) - """ - - platform.runCommand(this, command) -} - - -def runTestCommand (platform, project, boolean rocmExamples=false) -{ - String sudo = auxiliary.sudo(platform.jenkinsLabel) - - def testCommand = "ctest --output-on-failure " - def testCommandExcludeRegex = /(rocprim.block_histogram)/ - def testCommandExclude = "--exclude-regex \"${testCommandExcludeRegex}\"" - def hmmExcludeRegex = '' - def hmmTestCommandExclude = "--exclude-regex \"${hmmExcludeRegex}\"" - def hmmTestCommand = '' - if (platform.jenkinsLabel.contains('gfx90a')) - { - echo("HMM TESTS DISABLED") - /*hmmTestCommand = """ - export HSA_XNACK=1 - export ROCPRIM_USE_HMM=1 - ${testCommand} ${hmmTestCommandExclude} - """*/ - } - echo(env.JOB_NAME) - if (env.JOB_NAME.contains('bleeding-edge')) - { - testCommand = '' - testCommandExclude = '' - hmmTestCommand = '' - echo("TESTS DISABLED") - } - def command = """#!/usr/bin/env bash - set -x - cd ${project.paths.project_build_prefix} - cd ${project.testDirectory} - ${testCommand} ${testCommandExclude} - if (( \$? != 0 )); then - exit 1 - fi - ${hmmTestCommand} - """ - platform.runCommand(this, command) - //ROCM Examples - if (rocmExamples){ - String buildString = "" - if (platform.os.contains("ubuntu")){ - buildString += "sudo dpkg -i *.deb" - } - else { - buildString += "sudo rpm -i *.rpm" - } - testCommand = """#!/usr/bin/env bash - set -ex - cd ${project.paths.project_build_prefix}/build/release/package - ls - ${buildString} - cd ../../.. - testDirs=("Libraries/rocPRIM") - git clone https://github.com/ROCm/rocm-examples.git - rocm_examples_dir=\$(readlink -f rocm-examples) - for testDir in \${testDirs[@]}; do - cd \${rocm_examples_dir}/\${testDir} - cmake -S . -B build - cmake --build build - cd ./build - ctest --output-on-failure - done - """ - platform.runCommand(this, testCommand, "ROCM Examples") - - } -} - -def runPackageCommand(platform, project) -{ - def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release") - - platform.runCommand(this, packageHelper[0]) - platform.archiveArtifacts(this, packageHelper[1]) -} - -return this diff --git a/projects/rocprim/.jenkins/precheckin.groovy b/projects/rocprim/.jenkins/precheckin.groovy deleted file mode 100644 index bbb8274743c..00000000000 --- a/projects/rocprim/.jenkins/precheckin.groovy +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env groovy -@Library('rocJenkins@pong') _ -import com.amd.project.* -import com.amd.docker.* -import java.nio.file.Path; - -def runCI = -{ - nodeDetails, jobName-> - - def prj = new rocProject('rocPRIM', 'PreCheckin') - prj.paths.build_command = './install -c' - prj.timeout.compile = 600 - - def nodes = new dockerNodes(nodeDetails, jobName, prj) - - def commonGroovy - - boolean formatCheck = false - - def compileCommand = - { - platform, project-> - - commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" - commonGroovy.runCompileCommand(platform, project, jobName) - } - - def testCommand = - { - platform, project-> - - commonGroovy.runTestCommand(platform, project, true) - } - - def packageCommand = - { - platform, project-> - - commonGroovy.runPackageCommand(platform, project) - } - - buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) -} - -ci: { - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) - - def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], - "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], - "rocm-docker":[]] - propertyList = auxiliary.appendPropertyList(propertyList) - - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] - jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM') - - propertyList.each - { - jobName, property-> - if (urlJobName == jobName) - properties(auxiliary.addCommonProperties(property)) - } - - jobNameList.each - { - jobName, nodeDetails-> - if (urlJobName == jobName) - stage(jobName) { - runCI(nodeDetails, jobName) - } - } - - // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 - if(!jobNameList.keySet().contains(urlJobName)) - { - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) - stage(urlJobName) { - runCI([ubuntu16:['gfx906']], urlJobName) - } - } -} diff --git a/projects/rocprim/.jenkins/static.groovy b/projects/rocprim/.jenkins/static.groovy deleted file mode 100644 index 75606419fdf..00000000000 --- a/projects/rocprim/.jenkins/static.groovy +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env groovy -@Library('rocJenkins@pong') _ -import com.amd.project.* -import com.amd.docker.* -import java.nio.file.Path; - -def runCI = -{ - nodeDetails, jobName-> - - def prj = new rocProject('rocPRIM', 'static') - prj.paths.build_command = './install -c -s' - prj.timeout.compile = 600 - prj.timeout.packaging = 120 - - def nodes = new dockerNodes(nodeDetails, jobName, prj) - - def commonGroovy - - boolean formatCheck = false - - def compileCommand = - { - platform, project-> - - commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" - commonGroovy.runCompileCommand(platform, project, jobName, debug=false, staticLibrary=true) - } - - def testCommand = - { - platform, project-> - - commonGroovy.runTestCommand(platform, project) - } - - def packageCommand = - { - platform, project-> - - commonGroovy.runPackageCommand(platform, project) - } - - buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) -} - -ci: { - String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) - - def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], - "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], - "rocm-docker":[]] - propertyList = auxiliary.appendPropertyList(propertyList) - - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] - jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM') - - propertyList.each - { - jobName, property-> - if (urlJobName == jobName) - properties(auxiliary.addCommonProperties(property)) - } - - jobNameList.each - { - jobName, nodeDetails-> - if (urlJobName == jobName) - stage(jobName) { - runCI(nodeDetails, jobName) - } - } - - // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 - if(!jobNameList.keySet().contains(urlJobName)) - { - properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) - stage(urlJobName) { - runCI([ubuntu16:['gfx906']], urlJobName) - } - } -} From 53124d95a28390be4ced80e8cfbe56d106d7fbec Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 6 Aug 2025 22:04:56 +0000 Subject: [PATCH 09/10] changed fall back exclusive_scan name to be less ambiguous --- projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp index 14748a17ff2..bb018014a88 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp @@ -608,7 +608,7 @@ void rank_with_prefix_sum_kernel(const KeyType* keys_input, #if defined(_GLIBCXX_RELEASE) && (GLIBCXX_RELEASE < 9) template -void exclusive_scan(It first, It last, OutIt out, T init) +void fall_back_exclusive_scan(It first, It last, OutIt out, T init) { // Fallback implementation for exclusive scan if gcc version is < 9 for (; first != last; ++first) @@ -725,7 +725,7 @@ void test_radix_rank_with_prefix_sum_output() pfs_expected.begin() + pfs_offset, 0); #else - exclusive_scan(histogram.begin(), + fall_back_exclusive_scan(histogram.begin(), histogram.end(), pfs_expected.begin() + pfs_offset, 0); From b6184c0fff30d18df5707f0eda46c9a6ccedd0a2 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 6 Aug 2025 22:21:25 +0000 Subject: [PATCH 10/10] commented on name change --- projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp index bb018014a88..67575363526 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp @@ -607,6 +607,10 @@ void rank_with_prefix_sum_kernel(const KeyType* keys_input, #if defined(_GLIBCXX_RELEASE) && (GLIBCXX_RELEASE < 9) +/** + * name this function fall_back_exclusive_scan to prevent + * ambiguous name error + */ template void fall_back_exclusive_scan(It first, It last, OutIt out, T init) {