From 74fa48daaa1b554835af929abe0a09398c070ea9 Mon Sep 17 00:00:00 2001
From: NguyenNhuDi <zee.nguyen@amd.com>
Date: Fri, 1 Aug 2025 12:01:34 -0700
Subject: [PATCH 01/10] chery-pick syntax fix from fork

---
 .../include/rocprim/block/block_shuffle.hpp   | 54 ++++++++++---------
 1 file changed, 29 insertions(+), 25 deletions(-)
diff --git a/projects/rocprim/rocprim/include/rocprim/block/block_shuffle.hpp b/projects/rocprim/rocprim/include/rocprim/block/block_shuffle.hpp
index 0db3ea34fd0..c1278bb8f28 100644
--- a/projects/rocprim/rocprim/include/rocprim/block/block_shuffle.hpp
+++ b/projects/rocprim/rocprim/include/rocprim/block/block_shuffle.hpp
@@ -168,8 +168,8 @@ class block_shuffle
     /// \param [out] output reference to a output value, that receives data from another thread
     /// \param [in] distance The input threadId + distance = output threadId.
     /// \param [in] storage reference to a temporary storage object of type storage_type.
-    ROCPRIM_DEVICE ROCPRIM_INLINE void
-        offset(const size_t& flat_id, T input, T& output, int distance, storage_type& storage)
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void offset(const size_t& flat_id, T input, T& output, int distance, storage_type& storage)
     {
         storage.buffer.emplace(flat_id, input);
 
@@ -243,8 +243,8 @@ class block_shuffle
     /// \param [out] output reference to a output value, that receives data from another thread
     /// \param [in] distance The input threadId + distance = output threadId.
     /// \param [in] storage reference to a temporary storage object of type storage_type.
-    ROCPRIM_DEVICE ROCPRIM_INLINE void
-        rotate(const size_t& flat_id, T input, T& output, int distance, storage_type& storage)
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void rotate(const size_t& flat_id, T input, T& output, int distance, storage_type& storage)
     {
         storage.buffer.emplace(flat_id, input);
 
@@ -320,10 +320,11 @@ class block_shuffle
     /// \param [in] storage reference to a temporary storage object of type storage_type.
     /// The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
     template<unsigned int ItemsPerThread>
-    ROCPRIM_DEVICE ROCPRIM_INLINE void up(const size_t& flat_id,
-                                          T (&input)[ItemsPerThread],
-                                          T (&prev)[ItemsPerThread],
-                                          storage_type& storage)
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void up(const size_t& flat_id,
+            T (&input)[ItemsPerThread],
+            T (&prev)[ItemsPerThread],
+            storage_type& storage)
     {
         storage.buffer.emplace(flat_id, input[ItemsPerThread - 1]);
 
@@ -390,16 +391,17 @@ class block_shuffle
     /// <em>thread</em><sub><tt>BlockSize-1</tt></sub>, provided to all threads
     /// \param [in] storage reference to a temporary storage object of type storage_type.
     template<int ItemsPerThread>
-    ROCPRIM_DEVICE ROCPRIM_INLINE void up(const size_t& flat_id,
-                                          T (&input)[ItemsPerThread],
-                                          T (&prev)[ItemsPerThread],
-                                          T&            block_suffix,
-                                          storage_type& storage)
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void up(const size_t& flat_id,
+            T (&input)[ItemsPerThread],
+            T (&prev)[ItemsPerThread],
+            T&            block_suffix,
+            storage_type& storage)
     {
         up(flat_id, input, prev, storage);
 
         // Update block prefix
-        block_suffix = storage->buffer.get_unsafe_array()[BlockSize - 1];
+        block_suffix = storage.buffer.get_unsafe_array()[BlockSize - 1];
     }
 
     /// \brief The thread block rotates a blocked arrange of input items,
@@ -458,10 +460,11 @@ class block_shuffle
     /// The item \p prev[0] is not updated for <em>thread</em><sub>BlockSize - 1</sub>.
     /// \param [in] storage reference to a temporary storage object of type storage_type.
     template<unsigned int ItemsPerThread>
-    ROCPRIM_DEVICE ROCPRIM_INLINE void down(const size_t& flat_id,
-                                            T (&input)[ItemsPerThread],
-                                            T (&next)[ItemsPerThread],
-                                            storage_type& storage)
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void down(const size_t& flat_id,
+              T (&input)[ItemsPerThread],
+              T (&next)[ItemsPerThread],
+              storage_type& storage)
     {
         storage.buffer.emplace(flat_id, input[0]);
 
@@ -525,16 +528,17 @@ class block_shuffle
     /// \param [out] block_prefix  The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
     /// \param [in] storage reference to a temporary storage object of type storage_type.
     template<unsigned int ItemsPerThread>
-    ROCPRIM_DEVICE ROCPRIM_INLINE void down(const size_t& flat_id,
-                                            T (&input)[ItemsPerThread],
-                                            T (&next)[ItemsPerThread],
-                                            T&            block_prefix,
-                                            storage_type& storage)
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void down(const size_t& flat_id,
+              T (&input)[ItemsPerThread],
+              T (&next)[ItemsPerThread],
+              T&            block_prefix,
+              storage_type& storage)
     {
         this->down(flat_id, input, next, storage);
 
         // Update block prefixstorage_->
-        block_prefix = storage->next[0];
+        block_prefix = storage.buffer.get_unsafe_array()[0];
     }
 };
 
@@ -543,4 +547,4 @@ END_ROCPRIM_NAMESPACE
 /// @}
 // end of group blockmodule
 
-#endif // ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
+#endif // ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
\ No newline at end of file

From ee7fe6c0b7bd1c2f9935890fbe8421dff7219a24 Mon Sep 17 00:00:00 2001
From: NguyenNhuDi <zee.nguyen@amd.com>
Date: Fri, 1 Aug 2025 12:50:56 -0700
Subject: [PATCH 02/10] deleted .jenkins to get correct math-ci configuration

---
 projects/rocprim/.jenkins/common.groovy     | 105 --------------------
 projects/rocprim/.jenkins/precheckin.groovy |  81 ---------------
 projects/rocprim/.jenkins/static.groovy     |  82 ---------------
 3 files changed, 268 deletions(-)
 delete mode 100644 projects/rocprim/.jenkins/common.groovy
 delete mode 100644 projects/rocprim/.jenkins/precheckin.groovy
 delete mode 100644 projects/rocprim/.jenkins/static.groovy

diff --git a/projects/rocprim/.jenkins/common.groovy b/projects/rocprim/.jenkins/common.groovy
deleted file mode 100644
index 0ffd1dee600..00000000000
--- a/projects/rocprim/.jenkins/common.groovy
+++ /dev/null
@@ -1,105 +0,0 @@
-// This file is for internal AMD use.
-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
-
-def runCompileCommand(platform, project, jobName, boolean debug=false, boolean staticLibrary=false)
-{
-    project.paths.construct_build_prefix()
-
-    String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release'
-    String buildStatic = staticLibrary ? '-DBUILD_SHARED_LIBS=OFF' : '-DBUILD_SHARED_LIBS=ON'
-    String buildTypeDir = debug ? 'debug' : 'release'
-    String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'
-    //Set CI node's gfx arch as target if PR, otherwise use default targets of the library
-    String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
-
-    def command = """#!/usr/bin/env bash
-                set -x
-                cd ${project.paths.project_build_prefix}
-                mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir}
-                ${auxiliary.gfxTargetParser()}
-                ${cmake} --toolchain=toolchain-linux.cmake ${buildTypeArg} ${buildStatic} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../..
-                make -j\$(nproc)
-                """
-
-    platform.runCommand(this, command)
-}
-
-
-def runTestCommand (platform, project, boolean rocmExamples=false)
-{
-    String sudo = auxiliary.sudo(platform.jenkinsLabel)
-
-    def testCommand = "ctest --output-on-failure "
-    def testCommandExcludeRegex = /(rocprim.block_histogram)/
-    def testCommandExclude = "--exclude-regex \"${testCommandExcludeRegex}\""
-    def hmmExcludeRegex = ''
-    def hmmTestCommandExclude = "--exclude-regex \"${hmmExcludeRegex}\""
-    def hmmTestCommand = ''
-    if (platform.jenkinsLabel.contains('gfx90a'))
-    {
-        echo("HMM TESTS DISABLED")
-        /*hmmTestCommand = """
-                            export HSA_XNACK=1
-                            export ROCPRIM_USE_HMM=1
-                            ${testCommand} ${hmmTestCommandExclude}
-                         """*/
-    }
-    echo(env.JOB_NAME)
-    if (env.JOB_NAME.contains('bleeding-edge'))
-    {
-        testCommand = ''
-        testCommandExclude = ''
-        hmmTestCommand = ''
-        echo("TESTS DISABLED")
-    }
-    def command = """#!/usr/bin/env bash
-                set -x
-                cd ${project.paths.project_build_prefix}
-                cd ${project.testDirectory}
-                ${testCommand} ${testCommandExclude}
-                if (( \$? != 0 )); then
-                    exit 1
-                fi
-                ${hmmTestCommand}
-            """
-    platform.runCommand(this, command)
-    //ROCM Examples
-    if (rocmExamples){
-        String buildString = ""
-        if (platform.os.contains("ubuntu")){
-            buildString += "sudo dpkg -i *.deb"
-        }
-        else {
-            buildString += "sudo rpm -i *.rpm"
-        }
-        testCommand = """#!/usr/bin/env bash
-                    set -ex
-                    cd ${project.paths.project_build_prefix}/build/release/package
-                    ls
-                    ${buildString}
-                    cd ../../..
-                    testDirs=("Libraries/rocPRIM")
-                    git clone https://github.com/ROCm/rocm-examples.git
-                    rocm_examples_dir=\$(readlink -f rocm-examples)
-                    for testDir in \${testDirs[@]}; do
-                        cd \${rocm_examples_dir}/\${testDir}
-                        cmake -S . -B build
-                        cmake --build build
-                        cd ./build
-                        ctest --output-on-failure
-                    done
-                """
-        platform.runCommand(this, testCommand, "ROCM Examples")  
-
-    }
-}
-
-def runPackageCommand(platform, project)
-{
-    def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release")
-
-    platform.runCommand(this, packageHelper[0])
-        platform.archiveArtifacts(this, packageHelper[1])
-}
-
-return this
diff --git a/projects/rocprim/.jenkins/precheckin.groovy b/projects/rocprim/.jenkins/precheckin.groovy
deleted file mode 100644
index bbb8274743c..00000000000
--- a/projects/rocprim/.jenkins/precheckin.groovy
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/usr/bin/env groovy
-@Library('rocJenkins@pong') _
-import com.amd.project.*
-import com.amd.docker.*
-import java.nio.file.Path;
-
-def runCI = 
-{
-    nodeDetails, jobName->
-
-    def prj = new rocProject('rocPRIM', 'PreCheckin')
-    prj.paths.build_command = './install -c'
-    prj.timeout.compile = 600
-
-    def nodes = new dockerNodes(nodeDetails, jobName, prj)
-
-    def commonGroovy
-
-    boolean formatCheck = false
-     
-    def compileCommand =
-    {
-        platform, project->
-
-        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
-        commonGroovy.runCompileCommand(platform, project, jobName)
-    }
-
-    def testCommand =
-    {
-        platform, project->
-
-        commonGroovy.runTestCommand(platform, project, true)
-    }
-
-    def packageCommand =
-    {
-        platform, project->
-
-        commonGroovy.runPackageCommand(platform, project)
-    }
-
-    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
-}
-
-ci: { 
-    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
-
-    def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], 
-                        "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
-                        "rocm-docker":[]]
-    propertyList = auxiliary.appendPropertyList(propertyList)
-
-    def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])]
-    jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM')
-
-    propertyList.each 
-    {
-        jobName, property->
-        if (urlJobName == jobName)
-            properties(auxiliary.addCommonProperties(property))
-    }
-    
-    jobNameList.each 
-    {
-        jobName, nodeDetails->
-        if (urlJobName == jobName)
-            stage(jobName) {
-                runCI(nodeDetails, jobName)
-            }
-    }
-
-    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
-    if(!jobNameList.keySet().contains(urlJobName))
-    {
-        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
-        stage(urlJobName) {
-            runCI([ubuntu16:['gfx906']], urlJobName)
-        }
-    }
-}
diff --git a/projects/rocprim/.jenkins/static.groovy b/projects/rocprim/.jenkins/static.groovy
deleted file mode 100644
index 75606419fdf..00000000000
--- a/projects/rocprim/.jenkins/static.groovy
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/usr/bin/env groovy
-@Library('rocJenkins@pong') _
-import com.amd.project.*
-import com.amd.docker.*
-import java.nio.file.Path;
-
-def runCI = 
-{
-    nodeDetails, jobName->
-
-    def prj = new rocProject('rocPRIM', 'static')
-    prj.paths.build_command = './install -c -s'
-    prj.timeout.compile = 600
-    prj.timeout.packaging = 120
-
-    def nodes = new dockerNodes(nodeDetails, jobName, prj)
-
-    def commonGroovy
-
-    boolean formatCheck = false
-     
-    def compileCommand =
-    {
-        platform, project->
-
-        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
-        commonGroovy.runCompileCommand(platform, project, jobName, debug=false, staticLibrary=true)
-    }
-
-    def testCommand =
-    {
-        platform, project->
-
-        commonGroovy.runTestCommand(platform, project)
-    }
-
-    def packageCommand =
-    {
-        platform, project->
-
-        commonGroovy.runPackageCommand(platform, project)
-    }
-
-    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
-}
-
-ci: { 
-    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
-
-    def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], 
-                        "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
-                        "rocm-docker":[]]
-    propertyList = auxiliary.appendPropertyList(propertyList)
-
-    def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])]
-    jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM')
-
-    propertyList.each 
-    {
-        jobName, property->
-        if (urlJobName == jobName)
-            properties(auxiliary.addCommonProperties(property))
-    }
-    
-    jobNameList.each 
-    {
-        jobName, nodeDetails->
-        if (urlJobName == jobName)
-            stage(jobName) {
-                runCI(nodeDetails, jobName)
-            }
-    }
-
-    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
-    if(!jobNameList.keySet().contains(urlJobName))
-    {
-        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
-        stage(urlJobName) {
-            runCI([ubuntu16:['gfx906']], urlJobName)
-        }
-    }
-}

From ba1ca085bbe07bb0a3ed98bd5f450f45360b5d00 Mon Sep 17 00:00:00 2001
From: Di Nguyen <dinguyennhu@gmail.com>
Date: Wed, 16 Jul 2025 09:23:54 -0600
Subject: [PATCH 03/10] [hipCUB][Code Coverage] Increase Code Coverage (#411)

Re making PR #172 to merge to develop instead of release-staging
---
 projects/hipcub/CHANGELOG.md                  |    7 +
 .../test_hipcub_block_discontinuity.cpp       |  361 ++-
 .../hipcub/test_hipcub_block_exchange.cpp     | 1618 +++++++++++--
 .../hipcub/test_hipcub_block_merge_sort.cpp   |  903 ++++++-
 .../hipcub/test_hipcub_block_radix_rank.cpp   |  524 ++++-
 .../hipcub/test_hipcub_block_radix_sort.cpp   |  443 ++--
 .../test/hipcub/test_hipcub_block_reduce.cpp  |  572 +++--
 .../test_hipcub_block_run_length_decode.cpp   |  194 +-
 .../test/hipcub/test_hipcub_block_scan.cpp    | 2072 ++++++++++++++---
 .../test/hipcub/test_hipcub_block_shuffle.cpp |  512 ++--
 10 files changed, 5764 insertions(+), 1442 deletions(-)

diff --git a/projects/hipcub/CHANGELOG.md b/projects/hipcub/CHANGELOG.md
index a388d28f69d..bc353989653 100644
--- a/projects/hipcub/CHANGELOG.md
+++ b/projects/hipcub/CHANGELOG.md
@@ -19,6 +19,13 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project
 * `UnrolledThreadLoad`, `UnrolledCopy`, and `ThreadLoadVolatilePointer` were added to align hipCUB with CUB.
 * `ThreadStoreVolatilePtr` and the `IterateThreadStore` struct were added to align hipCUB with CUB.
 * Added `hipcub::InclusiveScanInit` for CUB parity.
+* Additional Unit Tests for:
+  * block_exchange
+  * block_merge_sort
+  * block_radix_rank
+  * block_radix_sort
+  * block_reduce
+  * block_shuffle
 
 ### Removed
 
diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp
index 29e07b759a3..827d34451f5 100644
--- a/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp
+++ b/projects/hipcub/test/hipcub/test_hipcub_block_discontinuity.cpp
@@ -24,28 +24,23 @@
 
 // hipcub API
 #include "hipcub/block/block_discontinuity.hpp"
-#include "hipcub/thread/thread_operators.hpp"
 #include "hipcub/block/block_load.hpp"
 #include "hipcub/block/block_store.hpp"
+#include "hipcub/thread/thread_operators.hpp"
 
-template<
-    class T,
-    class Flag,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    class FlagOp
->
+template<class T, class Flag, unsigned int BlockSize, unsigned int ItemsPerThread, class FlagOp>
 struct params
 {
-    using type = T;
-    using flag_type = Flag;
-    static constexpr unsigned int block_size = BlockSize;
+    using type                                     = T;
+    using flag_type                                = Flag;
+    static constexpr unsigned int block_size       = BlockSize;
     static constexpr unsigned int items_per_thread = ItemsPerThread;
-    using flag_op_type = FlagOp;
+    using flag_op_type                             = FlagOp;
 };
 
 template<class Params>
-class HipcubBlockDiscontinuity : public ::testing::Test {
+class HipcubBlockDiscontinuity : public ::testing::Test
+{
 public:
     using params = Params;
 };
@@ -111,20 +106,18 @@ using Params = ::testing::Types<
 
 TYPED_TEST_SUITE(HipcubBlockDiscontinuity, Params);
 
-template<
-    class Type,
-    class FlagType,
-    class FlagOpType,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread
->
+template<class Type,
+         class FlagType,
+         class FlagOpType,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread>
 __global__
 __launch_bounds__(BlockSize)
 void flag_heads_kernel(Type* device_input, long long* device_heads)
 {
-    const unsigned int lid = hipThreadIdx_x;
+    const unsigned int lid             = hipThreadIdx_x;
     const unsigned int items_per_block = BlockSize * ItemsPerThread;
-    const unsigned int block_offset = hipBlockIdx_x * items_per_block;
+    const unsigned int block_offset    = hipBlockIdx_x * items_per_block;
 
     Type input[ItemsPerThread];
     hipcub::LoadDirectBlocked(lid, device_input + block_offset, input);
@@ -154,17 +147,16 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeads)
     using type = typename TestFixture::params::type;
     // std::vector<bool> is a special case that will cause an error in hipMemcpy
     using stored_flag_type = typename std::conditional<
-                               std::is_same<bool, typename TestFixture::params::flag_type>::value,
-                               int,
-                               typename TestFixture::params::flag_type
-                           >::type;
-    using flag_type = typename TestFixture::params::flag_type;
-    using flag_op_type = typename TestFixture::params::flag_op_type;
-    constexpr size_t block_size = TestFixture::params::block_size;
+        std::is_same<bool, typename TestFixture::params::flag_type>::value,
+        int,
+        typename TestFixture::params::flag_type>::type;
+    using flag_type                   = typename TestFixture::params::flag_type;
+    using flag_op_type                = typename TestFixture::params::flag_op_type;
+    constexpr size_t block_size       = TestFixture::params::block_size;
     constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
-    constexpr size_t items_per_block = block_size * items_per_thread;
-    const size_t size = items_per_block * 2048;
-    constexpr size_t grid_size = size / items_per_block;
+    constexpr size_t items_per_block  = block_size * items_per_thread;
+    const size_t     size             = items_per_block * 2048;
+    constexpr size_t grid_size        = size / items_per_block;
 
     // Given block size not supported
     if(block_size > test_utils::get_max_block_size())
@@ -172,10 +164,10 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeads)
         return;
     }
 
-
-    for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
-        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
         SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
 
         // Generate data
@@ -188,7 +180,7 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeads)
 
         // Calculate expected results on host
         std::vector<stored_flag_type> expected_heads(size);
-        flag_op_type flag_op;
+        flag_op_type                  flag_op;
         for(size_t bi = 0; bi < size / items_per_block; bi++)
         {
             for(size_t ii = 0; ii < items_per_block; ii++)
@@ -196,9 +188,8 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeads)
                 const size_t i = bi * items_per_block + ii;
                 if(ii == 0)
                 {
-                    expected_heads[i] = bi % 2 == 1
-                        ? apply(flag_op, input[i - 1], input[i], ii)
-                        : flag_type(true);
+                    expected_heads[i] = bi % 2 == 1 ? apply(flag_op, input[i - 1], input[i], ii)
+                                                    : flag_type(true);
                 }
                 else
                 {
@@ -209,40 +200,38 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeads)
 
         // Preparing Device
         type* device_input;
-        HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type)));
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_input,
+            input.size() * sizeof(typename decltype(input)::value_type)));
         long long* device_heads;
-        HIP_CHECK(test_common_utils::hipMallocHelper(&device_heads, heads.size() * sizeof(typename decltype(heads)::value_type)));
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_heads,
+            heads.size() * sizeof(typename decltype(heads)::value_type)));
 
-        HIP_CHECK(
-            hipMemcpy(
-                device_input, input.data(),
-                input.size() * sizeof(type),
-                hipMemcpyHostToDevice
-            )
-        );
+        HIP_CHECK(hipMemcpy(device_input,
+                            input.data(),
+                            input.size() * sizeof(type),
+                            hipMemcpyHostToDevice));
 
+        HIP_CHECK(hipGetLastError());
         // Running kernel
         hipLaunchKernelGGL(
             HIP_KERNEL_NAME(
-                flag_heads_kernel<
-                    type, flag_type, flag_op_type,
-                    block_size, items_per_thread
-                >
-            ),
-            dim3(grid_size), dim3(block_size), 0, 0,
-            device_input, device_heads
-        );
-        HIP_CHECK(hipPeekAtLastError());
+                flag_heads_kernel<type, flag_type, flag_op_type, block_size, items_per_thread>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_input,
+            device_heads);
+        HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
 
         // Reading results
-        HIP_CHECK(
-            hipMemcpy(
-                heads.data(), device_heads,
-                heads.size() * sizeof(typename decltype(heads)::value_type),
-                hipMemcpyDeviceToHost
-            )
-        );
+        HIP_CHECK(hipMemcpy(heads.data(),
+                            device_heads,
+                            heads.size() * sizeof(typename decltype(heads)::value_type),
+                            hipMemcpyDeviceToHost));
 
         // Validating results
         for(size_t i = 0; i < size; i++)
@@ -255,20 +244,18 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeads)
     }
 }
 
-template<
-    class Type,
-    class FlagType,
-    class FlagOpType,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread
->
+template<class Type,
+         class FlagType,
+         class FlagOpType,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread>
 __global__
 __launch_bounds__(BlockSize)
 void flag_tails_kernel(Type* device_input, long long* device_tails)
 {
-    const unsigned int lid = hipThreadIdx_x;
+    const unsigned int lid             = hipThreadIdx_x;
     const unsigned int items_per_block = BlockSize * ItemsPerThread;
-    const unsigned int block_offset = hipBlockIdx_x * items_per_block;
+    const unsigned int block_offset    = hipBlockIdx_x * items_per_block;
 
     Type input[ItemsPerThread];
     hipcub::LoadDirectBlocked(lid, device_input + block_offset, input);
@@ -298,17 +285,16 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagTails)
     using type = typename TestFixture::params::type;
     // std::vector<bool> is a special case that will cause an error in hipMemcpy
     using stored_flag_type = typename std::conditional<
-                               std::is_same<bool, typename TestFixture::params::flag_type>::value,
-                               int,
-                               typename TestFixture::params::flag_type
-                           >::type;
-    using flag_type = typename TestFixture::params::flag_type;
-    using flag_op_type = typename TestFixture::params::flag_op_type;
-    constexpr size_t block_size = TestFixture::params::block_size;
+        std::is_same<bool, typename TestFixture::params::flag_type>::value,
+        int,
+        typename TestFixture::params::flag_type>::type;
+    using flag_type                   = typename TestFixture::params::flag_type;
+    using flag_op_type                = typename TestFixture::params::flag_op_type;
+    constexpr size_t block_size       = TestFixture::params::block_size;
     constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
-    constexpr size_t items_per_block = block_size * items_per_thread;
-    const size_t size = items_per_block * 2048;
-    constexpr size_t grid_size = size / items_per_block;
+    constexpr size_t items_per_block  = block_size * items_per_thread;
+    const size_t     size             = items_per_block * 2048;
+    constexpr size_t grid_size        = size / items_per_block;
 
     // Given block size not supported
     if(block_size > test_utils::get_max_block_size())
@@ -316,9 +302,10 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagTails)
         return;
     }
 
-    for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
-        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
         SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
 
         // Generate data
@@ -331,7 +318,7 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagTails)
 
         // Calculate expected results on host
         std::vector<stored_flag_type> expected_tails(size);
-        flag_op_type flag_op;
+        flag_op_type                  flag_op;
         for(size_t bi = 0; bi < size / items_per_block; bi++)
         {
             for(size_t ii = 0; ii < items_per_block; ii++)
@@ -339,9 +326,8 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagTails)
                 const size_t i = bi * items_per_block + ii;
                 if(ii == items_per_block - 1)
                 {
-                    expected_tails[i] = bi % 2 == 0
-                        ? apply(flag_op, input[i], input[i + 1], ii + 1)
-                        : flag_type(true);
+                    expected_tails[i] = bi % 2 == 0 ? apply(flag_op, input[i], input[i + 1], ii + 1)
+                                                    : flag_type(true);
                 }
                 else
                 {
@@ -352,40 +338,39 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagTails)
 
         // Preparing Device
         type* device_input;
-        HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type)));
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_input,
+            input.size() * sizeof(typename decltype(input)::value_type)));
         long long* device_tails;
-        HIP_CHECK(test_common_utils::hipMallocHelper(&device_tails, tails.size() * sizeof(typename decltype(tails)::value_type)));
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_tails,
+            tails.size() * sizeof(typename decltype(tails)::value_type)));
 
-        HIP_CHECK(
-            hipMemcpy(
-                device_input, input.data(),
-                input.size() * sizeof(type),
-                hipMemcpyHostToDevice
-            )
-        );
+        HIP_CHECK(hipMemcpy(device_input,
+                            input.data(),
+                            input.size() * sizeof(type),
+                            hipMemcpyHostToDevice));
 
+        HIP_CHECK(hipGetLastError());
         // Running kernel
         hipLaunchKernelGGL(
             HIP_KERNEL_NAME(
-                flag_tails_kernel<
-                    type, flag_type, flag_op_type,
-                    block_size, items_per_thread
-                >
-            ),
-            dim3(grid_size), dim3(block_size), 0, 0,
-            device_input, device_tails
-        );
-        HIP_CHECK(hipPeekAtLastError());
+                flag_tails_kernel<type, flag_type, flag_op_type, block_size, items_per_thread>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_input,
+            device_tails);
+        HIP_CHECK(hipGetLastError());
+
         HIP_CHECK(hipDeviceSynchronize());
 
         // Reading results
-        HIP_CHECK(
-            hipMemcpy(
-                tails.data(), device_tails,
-                tails.size() * sizeof(typename decltype(tails)::value_type),
-                hipMemcpyDeviceToHost
-            )
-        );
+        HIP_CHECK(hipMemcpy(tails.data(),
+                            device_tails,
+                            tails.size() * sizeof(typename decltype(tails)::value_type),
+                            hipMemcpyDeviceToHost));
 
         // Validating results
         for(size_t i = 0; i < size; i++)
@@ -398,20 +383,20 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagTails)
     }
 }
 
-template<
-    class Type,
-    class FlagType,
-    class FlagOpType,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread
->
+template<class Type,
+         class FlagType,
+         class FlagOpType,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread>
 __global__
 __launch_bounds__(BlockSize)
-void flag_heads_and_tails_kernel(Type* device_input, long long* device_heads, long long* device_tails)
+void flag_heads_and_tails_kernel(Type*      device_input,
+                                 long long* device_heads,
+                                 long long* device_tails)
 {
-    const unsigned int lid = hipThreadIdx_x;
+    const unsigned int lid             = hipThreadIdx_x;
     const unsigned int items_per_block = BlockSize * ItemsPerThread;
-    const unsigned int block_offset = hipBlockIdx_x * items_per_block;
+    const unsigned int block_offset    = hipBlockIdx_x * items_per_block;
 
     Type input[ItemsPerThread];
     hipcub::LoadDirectBlocked(lid, device_input + block_offset, input);
@@ -423,18 +408,31 @@ void flag_heads_and_tails_kernel(Type* device_input, long long* device_heads, lo
     if(hipBlockIdx_x % 4 == 0)
     {
         const Type tile_successor_item = device_input[block_offset + items_per_block];
-        bdiscontinuity.FlagHeadsAndTails(head_flags, tail_flags, tile_successor_item, input, FlagOpType());
+        bdiscontinuity.FlagHeadsAndTails(head_flags,
+                                         tail_flags,
+                                         tile_successor_item,
+                                         input,
+                                         FlagOpType());
     }
     else if(hipBlockIdx_x % 4 == 1)
     {
         const Type tile_predecessor_item = device_input[block_offset - 1];
-        const Type tile_successor_item = device_input[block_offset + items_per_block];
-        bdiscontinuity.FlagHeadsAndTails(head_flags, tile_predecessor_item, tail_flags, tile_successor_item, input, FlagOpType());
+        const Type tile_successor_item   = device_input[block_offset + items_per_block];
+        bdiscontinuity.FlagHeadsAndTails(head_flags,
+                                         tile_predecessor_item,
+                                         tail_flags,
+                                         tile_successor_item,
+                                         input,
+                                         FlagOpType());
     }
     else if(hipBlockIdx_x % 4 == 2)
     {
         const Type tile_predecessor_item = device_input[block_offset - 1];
-        bdiscontinuity.FlagHeadsAndTails(head_flags, tile_predecessor_item, tail_flags, input, FlagOpType());
+        bdiscontinuity.FlagHeadsAndTails(head_flags,
+                                         tile_predecessor_item,
+                                         tail_flags,
+                                         input,
+                                         FlagOpType());
     }
     else if(hipBlockIdx_x % 4 == 3)
     {
@@ -454,17 +452,16 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeadsAndTails)
     using type = typename TestFixture::params::type;
     // std::vector<bool> is a special case that will cause an error in hipMemcpy
     using stored_flag_type = typename std::conditional<
-                               std::is_same<bool, typename TestFixture::params::flag_type>::value,
-                               int,
-                               typename TestFixture::params::flag_type
-                           >::type;
-    using flag_type = typename TestFixture::params::flag_type;
-    using flag_op_type = typename TestFixture::params::flag_op_type;
-    constexpr size_t block_size = TestFixture::params::block_size;
+        std::is_same<bool, typename TestFixture::params::flag_type>::value,
+        int,
+        typename TestFixture::params::flag_type>::type;
+    using flag_type                   = typename TestFixture::params::flag_type;
+    using flag_op_type                = typename TestFixture::params::flag_op_type;
+    constexpr size_t block_size       = TestFixture::params::block_size;
     constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
-    constexpr size_t items_per_block = block_size * items_per_thread;
-    const size_t size = items_per_block * 2048;
-    constexpr size_t grid_size = size / items_per_block;
+    constexpr size_t items_per_block  = block_size * items_per_thread;
+    const size_t     size             = items_per_block * 2048;
+    constexpr size_t grid_size        = size / items_per_block;
 
     // Given block size not supported
     if(block_size > test_utils::get_max_block_size())
@@ -472,10 +469,10 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeadsAndTails)
         return;
     }
 
-
-    for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
-        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
         SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
 
         // Generate data
@@ -490,7 +487,7 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeadsAndTails)
         // Calculate expected results on host
         std::vector<stored_flag_type> expected_heads(size);
         std::vector<stored_flag_type> expected_tails(size);
-        flag_op_type flag_op;
+        flag_op_type                  flag_op;
         for(size_t bi = 0; bi < size / items_per_block; bi++)
         {
             for(size_t ii = 0; ii < items_per_block; ii++)
@@ -499,8 +496,8 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeadsAndTails)
                 if(ii == 0)
                 {
                     expected_heads[i] = (bi % 4 == 1 || bi % 4 == 2)
-                        ? apply(flag_op, input[i - 1], input[i], ii)
-                        : flag_type(true);
+                                            ? apply(flag_op, input[i - 1], input[i], ii)
+                                            : flag_type(true);
                 }
                 else
                 {
@@ -509,8 +506,8 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeadsAndTails)
                 if(ii == items_per_block - 1)
                 {
                     expected_tails[i] = (bi % 4 == 0 || bi % 4 == 1)
-                        ? apply(flag_op, input[i], input[i + 1], ii + 1)
-                        : flag_type(true);
+                                            ? apply(flag_op, input[i], input[i + 1], ii + 1)
+                                            : flag_type(true);
                 }
                 else
                 {
@@ -521,50 +518,50 @@ TYPED_TEST(HipcubBlockDiscontinuity, FlagHeadsAndTails)
 
         // Preparing Device
         type* device_input;
-        HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type)));
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_input,
+            input.size() * sizeof(typename decltype(input)::value_type)));
         long long* device_heads;
-        HIP_CHECK(test_common_utils::hipMallocHelper(&device_heads, tails.size() * sizeof(typename decltype(heads)::value_type)));
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_heads,
+            tails.size() * sizeof(typename decltype(heads)::value_type)));
         long long* device_tails;
-        HIP_CHECK(test_common_utils::hipMallocHelper(&device_tails, tails.size() * sizeof(typename decltype(tails)::value_type)));
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_tails,
+            tails.size() * sizeof(typename decltype(tails)::value_type)));
 
-        HIP_CHECK(
-            hipMemcpy(
-                device_input, input.data(),
-                input.size() * sizeof(type),
-                hipMemcpyHostToDevice
-            )
-        );
+        HIP_CHECK(hipMemcpy(device_input,
+                            input.data(),
+                            input.size() * sizeof(type),
+                            hipMemcpyHostToDevice));
 
+        HIP_CHECK(hipGetLastError());
         // Running kernel
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(
-                flag_heads_and_tails_kernel<
-                    type, flag_type, flag_op_type,
-                    block_size, items_per_thread
-                >
-            ),
-            dim3(grid_size), dim3(block_size), 0, 0,
-            device_input, device_heads, device_tails
-        );
-        HIP_CHECK(hipPeekAtLastError());
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(flag_heads_and_tails_kernel<type,
+                                                                       flag_type,
+                                                                       flag_op_type,
+                                                                       block_size,
+                                                                       items_per_thread>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_input,
+                           device_heads,
+                           device_tails);
+        HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
 
         // Reading results
-        HIP_CHECK(
-            hipMemcpy(
-                heads.data(), device_heads,
-                heads.size() * sizeof(typename decltype(heads)::value_type),
-                hipMemcpyDeviceToHost
-            )
-        );
-
-        HIP_CHECK(
-            hipMemcpy(
-                tails.data(), device_tails,
-                tails.size() * sizeof(typename decltype(tails)::value_type),
-                hipMemcpyDeviceToHost
-            )
-        );
+        HIP_CHECK(hipMemcpy(heads.data(),
+                            device_heads,
+                            heads.size() * sizeof(typename decltype(heads)::value_type),
+                            hipMemcpyDeviceToHost));
+
+        HIP_CHECK(hipMemcpy(tails.data(),
+                            device_tails,
+                            tails.size() * sizeof(typename decltype(tails)::value_type),
+                            hipMemcpyDeviceToHost));
 
         // Validating results
         for(size_t i = 0; i < size; i++)
diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_exchange.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_exchange.cpp
index 4c14811ed2d..08833ccfb80 100644
--- a/projects/hipcub/test/hipcub/test_hipcub_block_exchange.cpp
+++ b/projects/hipcub/test/hipcub/test_hipcub_block_exchange.cpp
@@ -26,22 +26,18 @@
 #include "hipcub/block/block_load.hpp"
 #include "hipcub/block/block_store.hpp"
 
-template<
-    class T,
-    class U,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread
->
+template<class T, class U, unsigned int BlockSize, unsigned int ItemsPerThread>
 struct params
 {
-    using type = T;
-    using output_type = U;
-    static constexpr unsigned int block_size = BlockSize;
+    using type                                     = T;
+    using output_type                              = U;
+    static constexpr unsigned int block_size       = BlockSize;
     static constexpr unsigned int items_per_thread = ItemsPerThread;
 };
 
 template<class Params>
-class HipcubBlockExchangeTests : public ::testing::Test {
+class HipcubBlockExchangeTests : public ::testing::Test
+{
 public:
     using params = Params;
 };
@@ -58,8 +54,8 @@ struct dummy
     dummy() = default;
 
     template<class U>
-    HIPCUB_HOST_DEVICE
-    dummy(U a) : x(a + 1), y(a * 2) { }
+    HIPCUB_HOST_DEVICE dummy(U a) : x(a + 1), y(a * 2)
+    {}
 
     HIPCUB_HOST_DEVICE
     bool operator==(const dummy& rhs) const
@@ -96,21 +92,16 @@ using Params = ::testing::Types<
 
 TYPED_TEST_SUITE(HipcubBlockExchangeTests, Params);
 
-template<
-    class Type,
-    class OutputType,
-    unsigned int ItemsPerBlock,
-    unsigned int ItemsPerThread
->
+template<class Type, class OutputType, unsigned int ItemsPerBlock, unsigned int ItemsPerThread>
 __global__
 __launch_bounds__(512)
 void blocked_to_striped_kernel(Type* device_input, OutputType* device_output)
 {
-    constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread);
-    const unsigned int lid = hipThreadIdx_x;
-    const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock;
+    constexpr unsigned int block_size   = (ItemsPerBlock / ItemsPerThread);
+    const unsigned int     lid          = hipThreadIdx_x;
+    const unsigned int     block_offset = hipBlockIdx_x * ItemsPerBlock;
 
-    Type input[ItemsPerThread];
+    Type       input[ItemsPerThread];
     OutputType output[ItemsPerThread];
     hipcub::LoadDirectBlocked(lid, device_input + block_offset, input);
 
@@ -141,7 +132,7 @@ TYPED_TEST(HipcubBlockExchangeTests, BlockedToStriped)
 
     const size_t size = items_per_block * 113;
     // Generate data
-    std::vector<type> input(size);
+    std::vector<type>        input(size);
     std::vector<output_type> expected(size);
     std::vector<output_type> output(size, test_utils::convert_to_device<output_type>(0));
 
@@ -165,36 +156,37 @@ TYPED_TEST(HipcubBlockExchangeTests, BlockedToStriped)
 
     // Preparing device
     type* device_input;
-    HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type)));
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_input,
+        input.size() * sizeof(typename decltype(input)::value_type)));
     output_type* device_output;
-    HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(typename decltype(output)::value_type)));
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_output,
+        output.size() * sizeof(typename decltype(output)::value_type)));
 
     HIP_CHECK(
-        hipMemcpy(
-            device_input, input.data(),
-            input.size() * sizeof(type),
-            hipMemcpyHostToDevice
-        )
-    );
+        hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice));
 
+    HIP_CHECK(hipGetLastError());
     // Running kernel
     constexpr unsigned int grid_size = (size / items_per_block);
     hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(blocked_to_striped_kernel<type, output_type, items_per_block, items_per_thread>),
-        dim3(grid_size), dim3(block_size), 0, 0,
-        device_input, device_output
-    );
-    HIP_CHECK(hipPeekAtLastError());
+        HIP_KERNEL_NAME(
+            blocked_to_striped_kernel<type, output_type, items_per_block, items_per_thread>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input,
+        device_output);
+    HIP_CHECK(hipGetLastError());
     HIP_CHECK(hipDeviceSynchronize());
 
     // Reading results
-    HIP_CHECK(
-        hipMemcpy(
-            output.data(), device_output,
-            output.size() * sizeof(typename decltype(output)::value_type),
-            hipMemcpyDeviceToHost
-        )
-    );
+    HIP_CHECK(hipMemcpy(output.data(),
+                        device_output,
+                        output.size() * sizeof(typename decltype(output)::value_type),
+                        hipMemcpyDeviceToHost));
 
     for(size_t i = 0; i < size; i++)
     {
@@ -206,21 +198,16 @@ TYPED_TEST(HipcubBlockExchangeTests, BlockedToStriped)
     HIP_CHECK(hipFree(device_output));
 }
 
-template<
-    class Type,
-    class OutputType,
-    unsigned int ItemsPerBlock,
-    unsigned int ItemsPerThread
->
+template<class Type, class OutputType, unsigned int ItemsPerBlock, unsigned int ItemsPerThread>
 __global__
 __launch_bounds__(512)
 void striped_to_blocked_kernel(Type* device_input, OutputType* device_output)
 {
-    constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread);
-    const unsigned int lid = hipThreadIdx_x;
-    const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock;
+    constexpr unsigned int block_size   = (ItemsPerBlock / ItemsPerThread);
+    const unsigned int     lid          = hipThreadIdx_x;
+    const unsigned int     block_offset = hipBlockIdx_x * ItemsPerBlock;
 
-    Type input[ItemsPerThread];
+    Type       input[ItemsPerThread];
     OutputType output[ItemsPerThread];
     hipcub::LoadDirectBlocked(lid, device_input + block_offset, input);
 
@@ -251,7 +238,7 @@ TYPED_TEST(HipcubBlockExchangeTests, StripedToBlocked)
 
     const size_t size = items_per_block * 113;
     // Generate data
-    std::vector<type> input(size);
+    std::vector<type>        input(size);
     std::vector<output_type> expected(size);
     std::vector<output_type> output(size, test_utils::convert_to_device<output_type>(0));
 
@@ -275,36 +262,36 @@ TYPED_TEST(HipcubBlockExchangeTests, StripedToBlocked)
 
     // Preparing device
     type* device_input;
-    HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type)));
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_input,
+        input.size() * sizeof(typename decltype(input)::value_type)));
     output_type* device_output;
-    HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(typename decltype(output)::value_type)));
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_output,
+        output.size() * sizeof(typename decltype(output)::value_type)));
 
     HIP_CHECK(
-        hipMemcpy(
-            device_input, input.data(),
-            input.size() * sizeof(type),
-            hipMemcpyHostToDevice
-        )
-    );
+        hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice));
 
     // Running kernel
     constexpr unsigned int grid_size = (size / items_per_block);
     hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(striped_to_blocked_kernel<type, output_type, items_per_block, items_per_thread>),
-        dim3(grid_size), dim3(block_size), 0, 0,
-        device_input, device_output
-    );
-    HIP_CHECK(hipPeekAtLastError());
+        HIP_KERNEL_NAME(
+            striped_to_blocked_kernel<type, output_type, items_per_block, items_per_thread>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input,
+        device_output);
+    HIP_CHECK(hipGetLastError());
     HIP_CHECK(hipDeviceSynchronize());
 
     // Reading results
-    HIP_CHECK(
-        hipMemcpy(
-            output.data(), device_output,
-            output.size() * sizeof(typename decltype(output)::value_type),
-            hipMemcpyDeviceToHost
-        )
-    );
+    HIP_CHECK(hipMemcpy(output.data(),
+                        device_output,
+                        output.size() * sizeof(typename decltype(output)::value_type),
+                        hipMemcpyDeviceToHost));
 
     for(size_t i = 0; i < size; i++)
     {
@@ -316,21 +303,16 @@ TYPED_TEST(HipcubBlockExchangeTests, StripedToBlocked)
     HIP_CHECK(hipFree(device_output));
 }
 
-template<
-    class Type,
-    class OutputType,
-    unsigned int ItemsPerBlock,
-    unsigned int ItemsPerThread
->
+template<class Type, class OutputType, unsigned int ItemsPerBlock, unsigned int ItemsPerThread>
 __global__
 __launch_bounds__(512)
 void blocked_to_warp_striped_kernel(Type* device_input, OutputType* device_output)
 {
-    constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread);
-    const unsigned int lid = hipThreadIdx_x;
-    const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock;
+    constexpr unsigned int block_size   = (ItemsPerBlock / ItemsPerThread);
+    const unsigned int     lid          = hipThreadIdx_x;
+    const unsigned int     block_offset = hipBlockIdx_x * ItemsPerBlock;
 
-    Type input[ItemsPerThread];
+    Type       input[ItemsPerThread];
     OutputType output[ItemsPerThread];
     hipcub::LoadDirectBlocked(lid, device_input + block_offset, input);
 
@@ -372,14 +354,16 @@ TYPED_TEST(HipcubBlockExchangeTests, BlockedToWarpStriped)
 
     const size_t size = items_per_block * 113;
     // Generate data
-    std::vector<type> input(size);
+    std::vector<type>        input(size);
     std::vector<output_type> expected(size);
     std::vector<output_type> output(size, test_utils::convert_to_device<output_type>(0));
 
-    constexpr size_t warp_size_32 = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_32));
-    constexpr size_t warp_size_64 = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_64));
-    constexpr size_t warps_no_32 = (block_size + warp_size_32 - 1) / warp_size_32;
-    constexpr size_t warps_no_64 = (block_size + warp_size_64 - 1) / warp_size_64;
+    constexpr size_t warp_size_32
+        = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_32));
+    constexpr size_t warp_size_64
+        = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_64));
+    constexpr size_t warps_no_32       = (block_size + warp_size_32 - 1) / warp_size_32;
+    constexpr size_t warps_no_64       = (block_size + warp_size_64 - 1) / warp_size_64;
     constexpr size_t items_per_warp_32 = warp_size_32 * items_per_thread;
     constexpr size_t items_per_warp_64 = warp_size_64 * items_per_thread;
 
@@ -387,24 +371,28 @@ TYPED_TEST(HipcubBlockExchangeTests, BlockedToWarpStriped)
     std::vector<fundemental_type> values(size);
     std::iota(values.begin(), values.end(), 0);
 
-    const size_t warps_no = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warps_no_32 : warps_no_64;
-    const size_t warp_size = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warp_size_32 : warp_size_64;
-    const size_t items_per_warp = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? items_per_warp_32 : items_per_warp_64;
+    const size_t warps_no
+        = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warps_no_32 : warps_no_64;
+    const size_t warp_size
+        = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warp_size_32 : warp_size_64;
+    const size_t items_per_warp
+        = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? items_per_warp_32 : items_per_warp_64;
 
     for(size_t bi = 0; bi < size / items_per_block; bi++)
     {
         for(size_t wi = 0; wi < warps_no; wi++)
         {
-            const size_t current_warp_size = wi == warps_no - 1
-                ? (block_size % warp_size != 0 ? block_size % warp_size : warp_size)
-                : warp_size;
+            const size_t current_warp_size
+                = wi == warps_no - 1
+                      ? (block_size % warp_size != 0 ? block_size % warp_size : warp_size)
+                      : warp_size;
             for(size_t li = 0; li < current_warp_size; li++)
             {
                 for(size_t ii = 0; ii < items_per_thread; ii++)
                 {
                     const size_t offset = bi * items_per_block + wi * items_per_warp;
-                    const size_t i0 = offset + li * items_per_thread + ii;
-                    const size_t i1 = offset + ii * current_warp_size + li;
+                    const size_t i0     = offset + li * items_per_thread + ii;
+                    const size_t i1     = offset + ii * current_warp_size + li;
                     input[i1]           = test_utils::convert_to_device<type>(values[i1]);
                     expected[i0]        = test_utils::convert_to_device<type>(values[i1]);
                 }
@@ -414,38 +402,36 @@ TYPED_TEST(HipcubBlockExchangeTests, BlockedToWarpStriped)
 
     // Preparing device
     type* device_input;
-    HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type)));
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_input,
+        input.size() * sizeof(typename decltype(input)::value_type)));
     output_type* device_output;
-    HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(typename decltype(output)::value_type)));
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_output,
+        output.size() * sizeof(typename decltype(output)::value_type)));
 
     HIP_CHECK(
-        hipMemcpy(
-            device_input, input.data(),
-            input.size() * sizeof(type),
-            hipMemcpyHostToDevice
-        )
-    );
+        hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice));
 
     // Running kernel
     constexpr unsigned int grid_size = (size / items_per_block);
     hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(blocked_to_warp_striped_kernel<
-                type, output_type, items_per_block, items_per_thread
-        >),
-        dim3(grid_size), dim3(block_size), 0, 0,
-        device_input, device_output
-    );
-    HIP_CHECK(hipPeekAtLastError());
+        HIP_KERNEL_NAME(
+            blocked_to_warp_striped_kernel<type, output_type, items_per_block, items_per_thread>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input,
+        device_output);
+    HIP_CHECK(hipGetLastError());
     HIP_CHECK(hipDeviceSynchronize());
 
     // Reading results
-    HIP_CHECK(
-        hipMemcpy(
-            output.data(), device_output,
-            output.size() * sizeof(typename decltype(output)::value_type),
-            hipMemcpyDeviceToHost
-        )
-    );
+    HIP_CHECK(hipMemcpy(output.data(),
+                        device_output,
+                        output.size() * sizeof(typename decltype(output)::value_type),
+                        hipMemcpyDeviceToHost));
 
     for(size_t i = 0; i < size; i++)
     {
@@ -457,21 +443,16 @@ TYPED_TEST(HipcubBlockExchangeTests, BlockedToWarpStriped)
     HIP_CHECK(hipFree(device_output));
 }
 
-template<
-    class Type,
-    class OutputType,
-    unsigned int ItemsPerBlock,
-    unsigned int ItemsPerThread
->
+template<class Type, class OutputType, unsigned int ItemsPerBlock, unsigned int ItemsPerThread>
 __global__
 __launch_bounds__(512)
 void warp_striped_to_blocked_kernel(Type* device_input, OutputType* device_output)
 {
-    constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread);
-    const unsigned int lid = hipThreadIdx_x;
-    const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock;
+    constexpr unsigned int block_size   = (ItemsPerBlock / ItemsPerThread);
+    const unsigned int     lid          = hipThreadIdx_x;
+    const unsigned int     block_offset = hipBlockIdx_x * ItemsPerBlock;
 
-    Type input[ItemsPerThread];
+    Type       input[ItemsPerThread];
     OutputType output[ItemsPerThread];
     hipcub::LoadDirectBlocked(lid, device_input + block_offset, input);
 
@@ -513,14 +494,16 @@ TYPED_TEST(HipcubBlockExchangeTests, WarpStripedToBlocked)
 
     const size_t size = items_per_block * 113;
     // Generate data
-    std::vector<type> input(size);
+    std::vector<type>        input(size);
     std::vector<output_type> expected(size);
     std::vector<output_type> output(size, test_utils::convert_to_device<output_type>(0));
 
-    constexpr size_t warp_size_32 = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_32));
-    constexpr size_t warp_size_64 = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_64));
-    constexpr size_t warps_no_32 = (block_size + warp_size_32 - 1) / warp_size_32;
-    constexpr size_t warps_no_64 = (block_size + warp_size_64 - 1) / warp_size_64;
+    constexpr size_t warp_size_32
+        = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_32));
+    constexpr size_t warp_size_64
+        = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_64));
+    constexpr size_t warps_no_32       = (block_size + warp_size_32 - 1) / warp_size_32;
+    constexpr size_t warps_no_64       = (block_size + warp_size_64 - 1) / warp_size_64;
     constexpr size_t items_per_warp_32 = warp_size_32 * items_per_thread;
     constexpr size_t items_per_warp_64 = warp_size_64 * items_per_thread;
 
@@ -528,17 +511,21 @@ TYPED_TEST(HipcubBlockExchangeTests, WarpStripedToBlocked)
     std::vector<fundemental_type> values(size);
     std::iota(values.begin(), values.end(), 0);
 
-    const size_t warps_no = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warps_no_32 : warps_no_64;
-    const size_t warp_size = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warp_size_32 : warp_size_64;
-    const size_t items_per_warp = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? items_per_warp_32 : items_per_warp_64;
+    const size_t warps_no
+        = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warps_no_32 : warps_no_64;
+    const size_t warp_size
+        = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warp_size_32 : warp_size_64;
+    const size_t items_per_warp
+        = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? items_per_warp_32 : items_per_warp_64;
 
     for(size_t bi = 0; bi < size / items_per_block; bi++)
     {
         for(size_t wi = 0; wi < warps_no; wi++)
         {
-            const size_t current_warp_size = wi == warps_no - 1
-                ? (block_size % warp_size != 0 ? block_size % warp_size : warp_size)
-                : warp_size;
+            const size_t current_warp_size
+                = wi == warps_no - 1
+                      ? (block_size % warp_size != 0 ? block_size % warp_size : warp_size)
+                      : warp_size;
             for(size_t li = 0; li < current_warp_size; li++)
             {
                 for(size_t ii = 0; ii < items_per_thread; ii++)
@@ -555,36 +542,36 @@ TYPED_TEST(HipcubBlockExchangeTests, WarpStripedToBlocked)
 
     // Preparing device
     type* device_input;
-    HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type)));
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_input,
+        input.size() * sizeof(typename decltype(input)::value_type)));
     output_type* device_output;
-    HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(typename decltype(output)::value_type)));
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_output,
+        output.size() * sizeof(typename decltype(output)::value_type)));
 
     HIP_CHECK(
-        hipMemcpy(
-            device_input, input.data(),
-            input.size() * sizeof(type),
-            hipMemcpyHostToDevice
-        )
-    );
+        hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice));
 
     // Running kernel
     constexpr unsigned int grid_size = (size / items_per_block);
     hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(warp_striped_to_blocked_kernel<type, output_type, items_per_block, items_per_thread>),
-        dim3(grid_size), dim3(block_size), 0, 0,
-        device_input, device_output
-    );
-    HIP_CHECK(hipPeekAtLastError());
+        HIP_KERNEL_NAME(
+            warp_striped_to_blocked_kernel<type, output_type, items_per_block, items_per_thread>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input,
+        device_output);
+    HIP_CHECK(hipGetLastError());
     HIP_CHECK(hipDeviceSynchronize());
 
     // Reading results
-    HIP_CHECK(
-        hipMemcpy(
-            output.data(), device_output,
-            output.size() * sizeof(typename decltype(output)::value_type),
-            hipMemcpyDeviceToHost
-        )
-    );
+    HIP_CHECK(hipMemcpy(output.data(),
+                        device_output,
+                        output.size() * sizeof(typename decltype(output)::value_type),
+                        hipMemcpyDeviceToHost));
 
     for(size_t i = 0; i < size; i++)
     {
@@ -596,22 +583,19 @@ TYPED_TEST(HipcubBlockExchangeTests, WarpStripedToBlocked)
     HIP_CHECK(hipFree(device_output));
 }
 
-template<
-    class Type,
-    class OutputType,
-    unsigned int ItemsPerBlock,
-    unsigned int ItemsPerThread
->
+template<class Type, class OutputType, unsigned int ItemsPerBlock, unsigned int ItemsPerThread>
 __global__
 __launch_bounds__(512)
-void scatter_to_blocked_kernel(Type* device_input, OutputType* device_output, unsigned int* device_ranks)
+void scatter_to_blocked_kernel(Type*         device_input,
+                               OutputType*   device_output,
+                               unsigned int* device_ranks)
 {
-    constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread);
-    const unsigned int lid = hipThreadIdx_x;
-    const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock;
+    constexpr unsigned int block_size   = (ItemsPerBlock / ItemsPerThread);
+    const unsigned int     lid          = hipThreadIdx_x;
+    const unsigned int     block_offset = hipBlockIdx_x * ItemsPerBlock;
 
-    Type input[ItemsPerThread];
-    OutputType output[ItemsPerThread];
+    Type         input[ItemsPerThread];
+    OutputType   output[ItemsPerThread];
     unsigned int ranks[ItemsPerThread];
     hipcub::LoadDirectBlocked(lid, device_input + block_offset, input);
     hipcub::LoadDirectBlocked(lid, device_ranks + block_offset, ranks);
@@ -643,8 +627,8 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToBlocked)
 
     const size_t size = items_per_block * 113;
     // Generate data
-    std::vector<type> input(size);
-    std::vector<output_type> expected(size);
+    std::vector<type>         input(size);
+    std::vector<output_type>  expected(size);
     std::vector<output_type>  output(size, test_utils::convert_to_device<output_type>(0));
     std::vector<unsigned int> ranks(size);
 
@@ -653,7 +637,9 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToBlocked)
     {
         auto block_ranks = ranks.begin() + bi * items_per_block;
         std::iota(block_ranks, block_ranks + items_per_block, 0);
-        std::shuffle(block_ranks, block_ranks + items_per_block, std::mt19937{std::random_device{}()});
+        std::shuffle(block_ranks,
+                     block_ranks + items_per_block,
+                     std::mt19937{std::random_device{}()});
     }
     std::vector<fundemental_type> values(size);
     std::iota(values.begin(), values.end(), 0);
@@ -674,46 +660,46 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToBlocked)
 
     // Preparing device
     type* device_input;
-    HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type)));
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_input,
+        input.size() * sizeof(typename decltype(input)::value_type)));
     output_type* device_output;
-    HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(typename decltype(output)::value_type)));
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_output,
+        output.size() * sizeof(typename decltype(output)::value_type)));
     unsigned int* device_ranks;
-    HIP_CHECK(test_common_utils::hipMallocHelper(&device_ranks, ranks.size() * sizeof(typename decltype(ranks)::value_type)));
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_ranks,
+        ranks.size() * sizeof(typename decltype(ranks)::value_type)));
 
     HIP_CHECK(
-        hipMemcpy(
-            device_input, input.data(),
-            input.size() * sizeof(type),
-            hipMemcpyHostToDevice
-        )
-    );
+        hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice));
 
-    HIP_CHECK(
-        hipMemcpy(
-            device_ranks, ranks.data(),
-            ranks.size() * sizeof(unsigned int),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(device_ranks,
+                        ranks.data(),
+                        ranks.size() * sizeof(unsigned int),
+                        hipMemcpyHostToDevice));
 
     // Running kernel
     constexpr unsigned int grid_size = (size / items_per_block);
     hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(scatter_to_blocked_kernel<type, output_type, items_per_block, items_per_thread>),
-        dim3(grid_size), dim3(block_size), 0, 0,
-        device_input, device_output, device_ranks
-    );
-    HIP_CHECK(hipPeekAtLastError());
+        HIP_KERNEL_NAME(
+            scatter_to_blocked_kernel<type, output_type, items_per_block, items_per_thread>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input,
+        device_output,
+        device_ranks);
+    HIP_CHECK(hipGetLastError());
     HIP_CHECK(hipDeviceSynchronize());
 
     // Reading results
-    HIP_CHECK(
-        hipMemcpy(
-            output.data(), device_output,
-            output.size() * sizeof(typename decltype(output)::value_type),
-            hipMemcpyDeviceToHost
-        )
-    );
+    HIP_CHECK(hipMemcpy(output.data(),
+                        device_output,
+                        output.size() * sizeof(typename decltype(output)::value_type),
+                        hipMemcpyDeviceToHost));
 
     for(size_t i = 0; i < size; i++)
     {
@@ -726,22 +712,19 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToBlocked)
     HIP_CHECK(hipFree(device_ranks));
 }
 
-template<
-    class Type,
-    class OutputType,
-    unsigned int ItemsPerBlock,
-    unsigned int ItemsPerThread
->
+template<class Type, class OutputType, unsigned int ItemsPerBlock, unsigned int ItemsPerThread>
 __global__
 __launch_bounds__(512)
-void scatter_to_striped_kernel(Type* device_input, OutputType* device_output, unsigned int* device_ranks)
+void scatter_to_striped_kernel(Type*         device_input,
+                               OutputType*   device_output,
+                               unsigned int* device_ranks)
 {
-    constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread);
-    const unsigned int lid = hipThreadIdx_x;
-    const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock;
+    constexpr unsigned int block_size   = (ItemsPerBlock / ItemsPerThread);
+    const unsigned int     lid          = hipThreadIdx_x;
+    const unsigned int     block_offset = hipBlockIdx_x * ItemsPerBlock;
 
-    Type input[ItemsPerThread];
-    OutputType output[ItemsPerThread];
+    Type         input[ItemsPerThread];
+    OutputType   output[ItemsPerThread];
     unsigned int ranks[ItemsPerThread];
     hipcub::LoadDirectBlocked(lid, device_input + block_offset, input);
     hipcub::LoadDirectBlocked(lid, device_ranks + block_offset, ranks);
@@ -773,8 +756,8 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToStriped)
 
     const size_t size = items_per_block * 113;
     // Generate data
-    std::vector<type> input(size);
-    std::vector<output_type> expected(size);
+    std::vector<type>         input(size);
+    std::vector<output_type>  expected(size);
     std::vector<output_type>  output(size, test_utils::convert_to_device<output_type>(0));
     std::vector<unsigned int> ranks(size);
 
@@ -783,7 +766,9 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToStriped)
     {
         auto block_ranks = ranks.begin() + bi * items_per_block;
         std::iota(block_ranks, block_ranks + items_per_block, 0);
-        std::shuffle(block_ranks, block_ranks + items_per_block, std::mt19937{std::random_device{}()});
+        std::shuffle(block_ranks,
+                     block_ranks + items_per_block,
+                     std::mt19937{std::random_device{}()});
     }
     std::vector<fundemental_type> values(size);
     std::iota(values.begin(), values.end(), 0);
@@ -794,10 +779,9 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToStriped)
             for(size_t ii = 0; ii < items_per_thread; ii++)
             {
                 const size_t offset = bi * items_per_block;
-                const size_t i0 = offset + ti * items_per_thread + ii;
-                const size_t i1 = offset
-                    + ranks[i0] % block_size * items_per_thread
-                    + ranks[i0] / block_size;
+                const size_t i0     = offset + ti * items_per_thread + ii;
+                const size_t i1
+                    = offset + ranks[i0] % block_size * items_per_thread + ranks[i0] / block_size;
                 input[i0]    = test_utils::convert_to_device<type>(values[i0]);
                 expected[i1] = test_utils::convert_to_device<type>(values[i0]);
             }
@@ -806,46 +790,46 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToStriped)
 
     // Preparing device
     type* device_input;
-    HIP_CHECK(test_common_utils::hipMallocHelper(&device_input, input.size() * sizeof(typename decltype(input)::value_type)));
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_input,
+        input.size() * sizeof(typename decltype(input)::value_type)));
     output_type* device_output;
-    HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(typename decltype(output)::value_type)));
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_output,
+        output.size() * sizeof(typename decltype(output)::value_type)));
     unsigned int* device_ranks;
-    HIP_CHECK(test_common_utils::hipMallocHelper(&device_ranks, ranks.size() * sizeof(typename decltype(ranks)::value_type)));
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_ranks,
+        ranks.size() * sizeof(typename decltype(ranks)::value_type)));
 
     HIP_CHECK(
-        hipMemcpy(
-            device_input, input.data(),
-            input.size() * sizeof(type),
-            hipMemcpyHostToDevice
-        )
-    );
+        hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice));
 
-    HIP_CHECK(
-        hipMemcpy(
-            device_ranks, ranks.data(),
-            ranks.size() * sizeof(unsigned int),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(device_ranks,
+                        ranks.data(),
+                        ranks.size() * sizeof(unsigned int),
+                        hipMemcpyHostToDevice));
 
     // Running kernel
     constexpr unsigned int grid_size = (size / items_per_block);
     hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(scatter_to_striped_kernel<type, output_type, items_per_block, items_per_thread>),
-        dim3(grid_size), dim3(block_size), 0, 0,
-        device_input, device_output, device_ranks
-    );
-    HIP_CHECK(hipPeekAtLastError());
+        HIP_KERNEL_NAME(
+            scatter_to_striped_kernel<type, output_type, items_per_block, items_per_thread>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input,
+        device_output,
+        device_ranks);
+    HIP_CHECK(hipGetLastError());
     HIP_CHECK(hipDeviceSynchronize());
 
     // Reading results
-    HIP_CHECK(
-        hipMemcpy(
-            output.data(), device_output,
-            output.size() * sizeof(typename decltype(output)::value_type),
-            hipMemcpyDeviceToHost
-        )
-    );
+    HIP_CHECK(hipMemcpy(output.data(),
+                        device_output,
+                        output.size() * sizeof(typename decltype(output)::value_type),
+                        hipMemcpyDeviceToHost));
 
     for(size_t i = 0; i < size; i++)
     {
@@ -856,5 +840,1139 @@ TYPED_TEST(HipcubBlockExchangeTests, ScatterToStriped)
     HIP_CHECK(hipFree(device_input));
     HIP_CHECK(hipFree(device_output));
     HIP_CHECK(hipFree(device_ranks));
+}
+
+template<typename T, size_t items_per_thread, size_t block_size>
+__global__
+void scatter_to_stripped_guarded_kernel(T* device_input, T* device_output, int* device_ranks)
+{
+    const size_t items_per_block = items_per_thread * block_size;
+    const size_t offset          = (blockIdx.x * items_per_block) + threadIdx.x * items_per_thread;
+
+    T   input[items_per_thread];
+    T   output[items_per_thread];
+    int ranks[items_per_thread];
+
+    for(size_t i = 0; i < items_per_thread; i++)
+    {
+        input[i] = device_input[offset + i];
+        ranks[i] = device_ranks[offset + i];
+    }
+    hipcub::BlockExchange<T, block_size, items_per_thread> exchange;
+    exchange.ScatterToStripedGuarded(input, output, ranks);
+
+    for(size_t i = 0; i < items_per_thread; i++)
+    {
+        device_output[offset + i] = (i == items_per_thread - 1) && (threadIdx.x == block_size - 1)
+                                        ? static_cast<T>(0)
+                                        : output[i];
+    }
+}
+
+TYPED_TEST(HipcubBlockExchangeTests, ScatterToStripedGuarded)
+{
+    using type                        = typename TestFixture::params::type;
+    constexpr size_t block_size       = TestFixture::params::block_size;
+    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    constexpr size_t grid_size        = 113;
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size            = grid_size * items_per_block;
+
+    type* host_input    = new type[size];
+    type* host_expected = new type[size];
+    int*  host_ranks    = new int[size];
+
+    std::iota(host_input, host_input + size, 0);
+    for(size_t i = 0; i < grid_size; i++)
+    {
+        size_t offset = i * items_per_block;
+        std::iota(host_ranks + offset, host_ranks + offset + items_per_block - 1, 0);
+        std::shuffle(host_ranks + offset,
+                     host_ranks + offset + items_per_block - 1,
+                     std::mt19937{std::random_device{}()});
+    }
+    for(size_t i = items_per_block - 1; i < size; i += items_per_block)
+    {
+        host_ranks[i]    = -1;
+        host_expected[i] = static_cast<type>(0);
+    }
+
+    for(size_t bi = 0; bi < size / items_per_block; bi++)
+    {
+        for(size_t ti = 0; ti < block_size; ti++)
+        {
+            for(size_t ii = 0; ii < items_per_thread; ii++)
+            {
+                const size_t offset = bi * items_per_block;
+                const size_t i0     = offset + ti * items_per_thread + ii;
+                const size_t i1     = offset + host_ranks[i0] % block_size * items_per_thread
+                                  + host_ranks[i0] / block_size;
+                if(i1 >= 0 && i1 < size)
+                    host_expected[i1] = host_input[i0];
+            }
+        }
+    }
+
+    type* device_input;
+    type* device_output;
+    int*  device_ranks;
+
+    HIP_CHECK(hipMalloc(&device_input, sizeof(type) * size));
+    HIP_CHECK(hipMalloc(&device_output, sizeof(type) * size));
+    HIP_CHECK(hipMalloc(&device_ranks, sizeof(int) * size));
+
+    HIP_CHECK(hipMemcpy(device_input, host_input, sizeof(type) * size, hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(device_ranks, host_ranks, sizeof(int) * size, hipMemcpyHostToDevice));
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(scatter_to_stripped_guarded_kernel<type, items_per_thread, block_size>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input,
+        device_output,
+        device_ranks);
+
+    type* host_output = new type[size];
+    HIP_CHECK(hipMemcpy(host_output, device_output, sizeof(type) * size, hipMemcpyDeviceToHost));
+
+    for(size_t i = 0; i < size; i++)
+        ASSERT_EQ(host_output[i], host_expected[i]);
+
+    delete[] host_input;
+    delete[] host_expected;
+    delete[] host_ranks;
+    delete[] host_output;
+
+    HIP_CHECK(hipFree(device_input));
+    HIP_CHECK(hipFree(device_output));
+    HIP_CHECK(hipFree(device_ranks));
+}
+
+template<typename T, size_t items_per_thread, size_t block_size>
+__global__
+void scatter_to_stripped_flagged_kernel(T*    device_input,
+                                        T*    device_output,
+                                        int*  device_ranks,
+                                        bool* device_flags)
+{
+    const size_t items_per_block = items_per_thread * block_size;
+    const size_t offset          = (blockIdx.x * items_per_block) + threadIdx.x * items_per_thread;
+
+    T    input[items_per_thread];
+    T    output[items_per_thread];
+    int  ranks[items_per_thread];
+    bool flags[items_per_thread];
+
+    for(size_t i = 0; i < items_per_thread; i++)
+    {
+        input[i] = device_input[offset + i];
+        ranks[i] = device_ranks[offset + i];
+        flags[i] = device_flags[offset + i];
+    }
+    hipcub::BlockExchange<T, block_size, items_per_thread> exchange;
+    exchange.ScatterToStripedFlagged(input, output, ranks, flags);
+
+    for(size_t i = 0; i < items_per_thread; i++)
+    {
+        device_output[offset + i] = (i == items_per_thread - 1) && (threadIdx.x == block_size - 1)
+                                        ? static_cast<T>(0)
+                                        : output[i];
+    }
+}
+
+TYPED_TEST(HipcubBlockExchangeTests, ScatterToStripedFlagged)
+{
+    using type                        = typename TestFixture::params::type;
+    constexpr size_t block_size       = TestFixture::params::block_size;
+    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    constexpr size_t grid_size        = 113;
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size            = grid_size * items_per_block;
+
+    type* host_input    = new type[size];
+    type* host_expected = new type[size];
+    int*  host_ranks    = new int[size];
+    bool* host_flags    = new bool[size];
+
+    std::iota(host_input, host_input + size, 0);
+    for(size_t i = 0; i < grid_size; i++)
+    {
+        size_t offset = i * items_per_block;
+        std::iota(host_ranks + offset, host_ranks + offset + items_per_block - 1, 0);
+        std::shuffle(host_ranks + offset,
+                     host_ranks + offset + items_per_block - 1,
+                     std::mt19937{std::random_device{}()});
+    }
+
+    for(size_t i = items_per_block - 1; i < size; i += items_per_block)
+    {
+        host_ranks[i]    = -1;
+        host_expected[i] = static_cast<type>(0);
+    }
+
+    for(size_t bi = 0; bi < size / items_per_block; bi++)
+    {
+        for(size_t ti = 0; ti < block_size; ti++)
+        {
+            for(size_t ii = 0; ii < items_per_thread; ii++)
+            {
+                const size_t offset = bi * items_per_block;
+                const size_t i0     = offset + ti * items_per_thread + ii;
+                const size_t i1     = offset + host_ranks[i0] % block_size * items_per_thread
+                                  + host_ranks[i0] / block_size;
+                if(i1 >= 0 && i1 < size)
+                    host_expected[i1] = host_input[i0];
+                host_flags[i0]
+                    = (ti == block_size - 1) && (ii == items_per_thread - 1) ? false : true;
+            }
+        }
+    }
+
+    for(size_t i = items_per_block - 1; i < size; i += items_per_block)
+        host_ranks[i] = 5;
+
+    type* device_input;
+    type* device_output;
+    int*  device_ranks;
+    bool* device_flags;
+
+    HIP_CHECK(hipMalloc(&device_input, sizeof(type) * size));
+    HIP_CHECK(hipMalloc(&device_output, sizeof(type) * size));
+    HIP_CHECK(hipMalloc(&device_ranks, sizeof(int) * size));
+    HIP_CHECK(hipMalloc(&device_flags, sizeof(bool) * size));
+
+    HIP_CHECK(hipMemcpy(device_input, host_input, sizeof(type) * size, hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(device_ranks, host_ranks, sizeof(int) * size, hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(device_flags, host_flags, sizeof(bool) * size, hipMemcpyHostToDevice));
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(scatter_to_stripped_flagged_kernel<type, items_per_thread, block_size>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input,
+        device_output,
+        device_ranks,
+        device_flags);
+
+    type* host_output = new type[size];
+    HIP_CHECK(hipMemcpy(host_output, device_output, sizeof(type) * size, hipMemcpyDeviceToHost));
+
+    for(size_t i = 0; i < size; i++)
+        ASSERT_EQ(host_output[i], host_expected[i]);
+
+    delete[] host_input;
+    delete[] host_expected;
+    delete[] host_ranks;
+    delete[] host_output;
+    delete[] host_flags;
+
+    HIP_CHECK(hipFree(device_input));
+    HIP_CHECK(hipFree(device_output));
+    HIP_CHECK(hipFree(device_ranks));
+    HIP_CHECK(hipFree(device_flags));
+}
 
+template<class Type, unsigned int ItemsPerBlock, unsigned int ItemsPerThread>
+__global__
+__launch_bounds__(512)
+void striped_to_blocked_one_param_kernel(Type* device_input)
+{
+    constexpr unsigned int block_size   = (ItemsPerBlock / ItemsPerThread);
+    const unsigned int     lid          = hipThreadIdx_x;
+    const unsigned int     block_offset = hipBlockIdx_x * ItemsPerBlock;
+
+    Type input[ItemsPerThread];
+    hipcub::LoadDirectBlocked(lid, device_input + block_offset, input);
+
+    hipcub::BlockExchange<Type, block_size, ItemsPerThread> exchange;
+    exchange.StripedToBlocked(input);
+
+    hipcub::StoreDirectBlocked(lid, device_input + block_offset, input);
+}
+
+TYPED_TEST(HipcubBlockExchangeTests, StripedToBlockedOneParam)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using type        = typename TestFixture::params::type;
+    using output_type = typename TestFixture::params::output_type;
+
+    constexpr size_t block_size       = TestFixture::params::block_size;
+    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    constexpr size_t items_per_block  = block_size * items_per_thread;
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t size = items_per_block * 113;
+    // Generate data
+    type* input    = new type[size];
+    type* expected = new type[size];
+
+    // Calculate input and expected results on host
+    type* values = new type[size];
+    std::iota(values, values + size, 0);
+
+    for(size_t bi = 0; bi < size / items_per_block; bi++)
+    {
+        for(size_t ti = 0; ti < block_size; ti++)
+        {
+            for(size_t ii = 0; ii < items_per_thread; ii++)
+            {
+                const size_t offset = bi * items_per_block;
+                const size_t i0     = offset + ti * items_per_thread + ii;
+                const size_t i1     = offset + ii * block_size + ti;
+                input[i0]           = values[i1];
+                expected[i1]        = values[i1];
+            }
+        }
+    }
+
+    // Preparing device
+    type* device_input;
+    HIP_CHECK(hipMalloc(&device_input, sizeof(type) * size));
+
+    HIP_CHECK(hipMemcpy(device_input, input, sizeof(type) * size, hipMemcpyHostToDevice));
+
+    // Running kernel
+    constexpr unsigned int grid_size = (size / items_per_block);
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(
+            striped_to_blocked_one_param_kernel<type, items_per_block, items_per_thread>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input);
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // Reading results
+    HIP_CHECK(hipMemcpy(input, device_input, sizeof(type) * size, hipMemcpyDeviceToHost));
+
+    for(size_t i = 0; i < size; i++)
+    {
+        ASSERT_EQ(input[i], expected[i]);
+    }
+
+    HIP_CHECK(hipFree(device_input));
+    delete[] input;
+    delete[] expected;
+    delete[] values;
+}
+
+template<class Type, unsigned int ItemsPerBlock, unsigned int ItemsPerThread>
+__global__
+__launch_bounds__(512)
+void blocked_to_striped_one_param_kernel(Type* device_input)
+{
+    constexpr unsigned int block_size   = (ItemsPerBlock / ItemsPerThread);
+    const unsigned int     lid          = hipThreadIdx_x;
+    const unsigned int     block_offset = hipBlockIdx_x * ItemsPerBlock;
+
+    Type input[ItemsPerThread];
+    hipcub::LoadDirectBlocked(lid, device_input + block_offset, input);
+
+    hipcub::BlockExchange<Type, block_size, ItemsPerThread> exchange;
+    exchange.BlockedToStriped(input);
+
+    hipcub::StoreDirectBlocked(lid, device_input + block_offset, input);
+}
+
+TYPED_TEST(HipcubBlockExchangeTests, BlockedToStripedOneParam)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using type        = typename TestFixture::params::type;
+    using output_type = typename TestFixture::params::output_type;
+
+    constexpr size_t block_size       = TestFixture::params::block_size;
+    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    constexpr size_t items_per_block  = block_size * items_per_thread;
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t size = items_per_block * 113;
+    // Generate data
+    type* input    = new type[size];
+    type* expected = new type[size];
+
+    // Calculate input and expected results on host
+    type* values = new type[size];
+    std::iota(values, values + size, 0);
+
+    for(size_t bi = 0; bi < size / items_per_block; bi++)
+    {
+        for(size_t ti = 0; ti < block_size; ti++)
+        {
+            for(size_t ii = 0; ii < items_per_thread; ii++)
+            {
+                const size_t offset = bi * items_per_block;
+                const size_t i0     = offset + ti * items_per_thread + ii;
+                const size_t i1     = offset + ii * block_size + ti;
+                input[i1]           = values[i1];
+                expected[i0]        = values[i1];
+            }
+        }
+    }
+
+    // Preparing device
+    type* device_input;
+    HIP_CHECK(hipMalloc(&device_input, sizeof(type) * size));
+
+    HIP_CHECK(hipMemcpy(device_input, input, sizeof(type) * size, hipMemcpyHostToDevice));
+
+    // Running kernel
+    constexpr unsigned int grid_size = (size / items_per_block);
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(
+            blocked_to_striped_one_param_kernel<type, items_per_block, items_per_thread>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input);
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // Reading results
+    HIP_CHECK(hipMemcpy(input, device_input, sizeof(type) * size, hipMemcpyDeviceToHost));
+
+    for(size_t i = 0; i < size; i++)
+    {
+        ASSERT_EQ(input[i], expected[i]);
+    }
+
+    HIP_CHECK(hipFree(device_input));
+    delete[] input;
+    delete[] expected;
+    delete[] values;
 }
+
+template<class Type, unsigned int ItemsPerBlock, unsigned int ItemsPerThread>
+__global__
+__launch_bounds__(512)
+void warp_striped_to_blocked_one_param_kernel(Type* device_input)
+{
+    constexpr unsigned int block_size   = (ItemsPerBlock / ItemsPerThread);
+    const unsigned int     lid          = hipThreadIdx_x;
+    const unsigned int     block_offset = hipBlockIdx_x * ItemsPerBlock;
+
+    Type input[ItemsPerThread];
+    hipcub::LoadDirectBlocked(lid, device_input + block_offset, input);
+
+    hipcub::BlockExchange<Type, block_size, ItemsPerThread> exchange;
+    exchange.WarpStripedToBlocked(input);
+
+    hipcub::StoreDirectBlocked(lid, device_input + block_offset, input);
+}
+
+TYPED_TEST(HipcubBlockExchangeTests, WarpStripedToBlockedOneParam)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using type = typename TestFixture::params::type;
+
+    constexpr size_t block_size       = TestFixture::params::block_size;
+    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    constexpr size_t items_per_block  = block_size * items_per_thread;
+
+    const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS;
+    // Given block size not supported
+    bool is_block_size_unsupported = block_size > test_utils::get_max_block_size();
+#ifdef HIPCUB_CUB_API
+    // CUB does not support exchanges to/from warp-striped arrangements
+    // for incomplete blocks (not divisible by warp size)
+    // Workaround for nvcc warning: "dynamic initialization in unreachable code"
+    // (not a simple if with compile-time expression)
+    is_block_size_unsupported |= block_size % current_device_warp_size != 0;
+#endif
+    if(is_block_size_unsupported)
+    {
+        printf("Unsupported test block size: %zu.     Skipping test\n", block_size);
+        GTEST_SKIP();
+    }
+
+    const size_t size = items_per_block * 113;
+    // Generate data
+    type* input    = new type[size];
+    type* expected = new type[size];
+
+    constexpr size_t warp_size_32
+        = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_32));
+    constexpr size_t warp_size_64
+        = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_64));
+    constexpr size_t warps_no_32       = (block_size + warp_size_32 - 1) / warp_size_32;
+    constexpr size_t warps_no_64       = (block_size + warp_size_64 - 1) / warp_size_64;
+    constexpr size_t items_per_warp_32 = warp_size_32 * items_per_thread;
+    constexpr size_t items_per_warp_64 = warp_size_64 * items_per_thread;
+
+    // Calculate input and expected results on host
+    type* values = new type[size];
+    std::iota(values, values + size, 0);
+
+    const size_t warps_no
+        = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warps_no_32 : warps_no_64;
+    const size_t warp_size
+        = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warp_size_32 : warp_size_64;
+    const size_t items_per_warp
+        = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? items_per_warp_32 : items_per_warp_64;
+
+    for(size_t bi = 0; bi < size / items_per_block; bi++)
+    {
+        for(size_t wi = 0; wi < warps_no; wi++)
+        {
+            const size_t current_warp_size
+                = wi == warps_no - 1
+                      ? (block_size % warp_size != 0 ? block_size % warp_size : warp_size)
+                      : warp_size;
+            for(size_t li = 0; li < current_warp_size; li++)
+            {
+                for(size_t ii = 0; ii < items_per_thread; ii++)
+                {
+                    const size_t offset = bi * items_per_block + wi * items_per_warp;
+                    const size_t i0     = offset + li * items_per_thread + ii;
+                    const size_t i1     = offset + ii * current_warp_size + li;
+                    input[i0]           = values[i1];
+                    expected[i1]        = values[i1];
+                }
+            }
+        }
+    }
+
+    // Preparing device
+    type* device_input;
+    HIP_CHECK(hipMalloc(&device_input, sizeof(type) * size));
+
+    HIP_CHECK(hipMemcpy(device_input, input, sizeof(type) * size, hipMemcpyHostToDevice));
+
+    // Running kernel
+    constexpr unsigned int grid_size = (size / items_per_block);
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(
+            warp_striped_to_blocked_one_param_kernel<type, items_per_block, items_per_thread>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input);
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // Reading results
+    HIP_CHECK(hipMemcpy(input, device_input, sizeof(type) * size, hipMemcpyDeviceToHost));
+
+    for(size_t i = 0; i < size; i++)
+        ASSERT_EQ(input[i], expected[i]);
+
+    HIP_CHECK(hipFree(device_input));
+    delete[] input;
+    delete[] expected;
+    delete[] values;
+}
+
+template<class Type, unsigned int ItemsPerBlock, unsigned int ItemsPerThread>
+__global__
+__launch_bounds__(512)
+void blocked_to_warp_striped_one_param_kernel(Type* device_input)
+{
+    constexpr unsigned int block_size   = (ItemsPerBlock / ItemsPerThread);
+    const unsigned int     lid          = hipThreadIdx_x;
+    const unsigned int     block_offset = hipBlockIdx_x * ItemsPerBlock;
+
+    Type input[ItemsPerThread];
+    hipcub::LoadDirectBlocked(lid, device_input + block_offset, input);
+
+    hipcub::BlockExchange<Type, block_size, ItemsPerThread> exchange;
+    exchange.BlockedToWarpStriped(input);
+
+    hipcub::StoreDirectBlocked(lid, device_input + block_offset, input);
+}
+
+TYPED_TEST(HipcubBlockExchangeTests, BlockedToWarpStripedOneParam)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using type = typename TestFixture::params::type;
+
+    constexpr size_t block_size       = TestFixture::params::block_size;
+    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    constexpr size_t items_per_block  = block_size * items_per_thread;
+
+    const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS;
+    // Given block size not supported
+    bool is_block_size_unsupported = block_size > test_utils::get_max_block_size();
+#ifdef HIPCUB_CUB_API
+    // CUB does not support exchanges to/from warp-striped arrangements
+    // for incomplete blocks (not divisible by warp size)
+    // Workaround for nvcc warning: "dynamic initialization in unreachable code"
+    // (not a simple if with compile-time expression)
+    is_block_size_unsupported |= block_size % current_device_warp_size != 0;
+#endif
+    if(is_block_size_unsupported)
+    {
+        printf("Unsupported test block size: %zu.     Skipping test\n", block_size);
+        GTEST_SKIP();
+    }
+
+    const size_t size = items_per_block * 113;
+    // Generate data
+    std::vector<type> input(size);
+    std::vector<type> expected(size);
+
+    constexpr size_t warp_size_32
+        = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_32));
+    constexpr size_t warp_size_64
+        = test_utils::get_min_warp_size(block_size, size_t(HIPCUB_WARP_SIZE_64));
+    constexpr size_t warps_no_32       = (block_size + warp_size_32 - 1) / warp_size_32;
+    constexpr size_t warps_no_64       = (block_size + warp_size_64 - 1) / warp_size_64;
+    constexpr size_t items_per_warp_32 = warp_size_32 * items_per_thread;
+    constexpr size_t items_per_warp_64 = warp_size_64 * items_per_thread;
+
+    // Calculate input and expected results on host
+    std::vector<type> values(size);
+    std::iota(values.begin(), values.end(), 0);
+
+    const size_t warps_no
+        = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warps_no_32 : warps_no_64;
+    const size_t warp_size
+        = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? warp_size_32 : warp_size_64;
+    const size_t items_per_warp
+        = current_device_warp_size == HIPCUB_WARP_SIZE_32 ? items_per_warp_32 : items_per_warp_64;
+
+    for(size_t bi = 0; bi < size / items_per_block; bi++)
+    {
+        for(size_t wi = 0; wi < warps_no; wi++)
+        {
+            const size_t current_warp_size
+                = wi == warps_no - 1
+                      ? (block_size % warp_size != 0 ? block_size % warp_size : warp_size)
+                      : warp_size;
+            for(size_t li = 0; li < current_warp_size; li++)
+            {
+                for(size_t ii = 0; ii < items_per_thread; ii++)
+                {
+                    const size_t offset = bi * items_per_block + wi * items_per_warp;
+                    const size_t i0     = offset + li * items_per_thread + ii;
+                    const size_t i1     = offset + ii * current_warp_size + li;
+                    input[i1]           = test_utils::convert_to_device<type>(values[i1]);
+                    expected[i0]        = test_utils::convert_to_device<type>(values[i1]);
+                }
+            }
+        }
+    }
+
+    // Preparing device
+    type* device_input;
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_input,
+        input.size() * sizeof(typename decltype(input)::value_type)));
+
+    HIP_CHECK(
+        hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice));
+
+    // Running kernel
+    constexpr unsigned int grid_size = (size / items_per_block);
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(
+            blocked_to_warp_striped_one_param_kernel<type, items_per_block, items_per_thread>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input);
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // Reading results
+    HIP_CHECK(hipMemcpy(input.data(),
+                        device_input,
+                        input.size() * sizeof(typename decltype(input)::value_type),
+                        hipMemcpyDeviceToHost));
+
+    for(size_t i = 0; i < size; i++)
+    {
+        ASSERT_EQ(test_utils::convert_to_native(input[i]),
+                  test_utils::convert_to_native(expected[i]));
+    }
+
+    HIP_CHECK(hipFree(device_input));
+}
+
+template<class Type, unsigned int ItemsPerBlock, unsigned int ItemsPerThread>
+__global__
+__launch_bounds__(512)
+void scatter_to_blocked_no_output_param_kernel(Type* device_input, unsigned int* device_ranks)
+{
+    constexpr unsigned int block_size   = (ItemsPerBlock / ItemsPerThread);
+    const unsigned int     lid          = hipThreadIdx_x;
+    const unsigned int     block_offset = hipBlockIdx_x * ItemsPerBlock;
+
+    Type         input[ItemsPerThread];
+    unsigned int ranks[ItemsPerThread];
+    hipcub::LoadDirectBlocked(lid, device_input + block_offset, input);
+    hipcub::LoadDirectBlocked(lid, device_ranks + block_offset, ranks);
+
+    hipcub::BlockExchange<Type, block_size, ItemsPerThread> exchange;
+    exchange.ScatterToBlocked(input, ranks);
+
+    hipcub::StoreDirectBlocked(lid, device_input + block_offset, input);
+}
+
+TYPED_TEST(HipcubBlockExchangeTests, ScatterToBlockedNoOutputParam)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using type = typename TestFixture::params::type;
+
+    constexpr size_t block_size       = TestFixture::params::block_size;
+    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    constexpr size_t items_per_block  = block_size * items_per_thread;
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t size = items_per_block * 113;
+    // Generate data
+    std::vector<type>         input(size);
+    std::vector<type>         expected(size);
+    std::vector<unsigned int> ranks(size);
+
+    // Calculate input and expected results on host
+    for(size_t bi = 0; bi < size / items_per_block; bi++)
+    {
+        auto block_ranks = ranks.begin() + bi * items_per_block;
+        std::iota(block_ranks, block_ranks + items_per_block, 0);
+        std::shuffle(block_ranks,
+                     block_ranks + items_per_block,
+                     std::mt19937{std::random_device{}()});
+    }
+    std::vector<type> values(size);
+    std::iota(values.begin(), values.end(), 0);
+    for(size_t bi = 0; bi < size / items_per_block; bi++)
+    {
+        for(size_t ti = 0; ti < block_size; ti++)
+        {
+            for(size_t ii = 0; ii < items_per_thread; ii++)
+            {
+                const size_t offset = bi * items_per_block;
+                const size_t i0     = offset + ti * items_per_thread + ii;
+                const size_t i1     = offset + ranks[i0];
+                input[i0]           = test_utils::convert_to_device<type>(values[i0]);
+                expected[i1]        = test_utils::convert_to_device<type>(values[i0]);
+            }
+        }
+    }
+
+    // Preparing device
+    type* device_input;
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_input,
+        input.size() * sizeof(typename decltype(input)::value_type)));
+    unsigned int* device_ranks;
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_ranks,
+        ranks.size() * sizeof(typename decltype(ranks)::value_type)));
+
+    HIP_CHECK(
+        hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice));
+
+    HIP_CHECK(hipMemcpy(device_ranks,
+                        ranks.data(),
+                        ranks.size() * sizeof(unsigned int),
+                        hipMemcpyHostToDevice));
+
+    // Running kernel
+    constexpr unsigned int grid_size = (size / items_per_block);
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(
+            scatter_to_blocked_no_output_param_kernel<type, items_per_block, items_per_thread>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input,
+        device_ranks);
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // Reading results
+    HIP_CHECK(hipMemcpy(input.data(),
+                        device_input,
+                        input.size() * sizeof(typename decltype(input)::value_type),
+                        hipMemcpyDeviceToHost));
+
+    for(size_t i = 0; i < size; i++)
+    {
+        ASSERT_EQ(test_utils::convert_to_native(input[i]),
+                  test_utils::convert_to_native(expected[i]));
+    }
+
+    HIP_CHECK(hipFree(device_input));
+    HIP_CHECK(hipFree(device_ranks));
+}
+
+template<class Type, unsigned int ItemsPerBlock, unsigned int ItemsPerThread>
+__global__
+__launch_bounds__(512)
+void scatter_to_striped_no_output_param_kernel(Type* device_input, unsigned int* device_ranks)
+{
+    constexpr unsigned int block_size   = (ItemsPerBlock / ItemsPerThread);
+    const unsigned int     lid          = hipThreadIdx_x;
+    const unsigned int     block_offset = hipBlockIdx_x * ItemsPerBlock;
+
+    Type         input[ItemsPerThread];
+    unsigned int ranks[ItemsPerThread];
+    hipcub::LoadDirectBlocked(lid, device_input + block_offset, input);
+    hipcub::LoadDirectBlocked(lid, device_ranks + block_offset, ranks);
+
+    hipcub::BlockExchange<Type, block_size, ItemsPerThread> exchange;
+    exchange.ScatterToStriped(input, ranks);
+
+    hipcub::StoreDirectBlocked(lid, device_input + block_offset, input);
+}
+
+TYPED_TEST(HipcubBlockExchangeTests, ScatterToStripedNoOutputParam)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using type = typename TestFixture::params::type;
+
+    constexpr size_t block_size       = TestFixture::params::block_size;
+    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    constexpr size_t items_per_block  = block_size * items_per_thread;
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t size = items_per_block * 113;
+    // Generate data
+    std::vector<type>         input(size);
+    std::vector<type>         expected(size);
+    std::vector<unsigned int> ranks(size);
+
+    // Calculate input and expected results on host
+    for(size_t bi = 0; bi < size / items_per_block; bi++)
+    {
+        auto block_ranks = ranks.begin() + bi * items_per_block;
+        std::iota(block_ranks, block_ranks + items_per_block, 0);
+        std::shuffle(block_ranks,
+                     block_ranks + items_per_block,
+                     std::mt19937{std::random_device{}()});
+    }
+    std::vector<type> values(size);
+    std::iota(values.begin(), values.end(), 0);
+    for(size_t bi = 0; bi < size / items_per_block; bi++)
+    {
+        for(size_t ti = 0; ti < block_size; ti++)
+        {
+            for(size_t ii = 0; ii < items_per_thread; ii++)
+            {
+                const size_t offset = bi * items_per_block;
+                const size_t i0     = offset + ti * items_per_thread + ii;
+                const size_t i1
+                    = offset + ranks[i0] % block_size * items_per_thread + ranks[i0] / block_size;
+                input[i0]    = test_utils::convert_to_device<type>(values[i0]);
+                expected[i1] = test_utils::convert_to_device<type>(values[i0]);
+            }
+        }
+    }
+
+    // Preparing device
+    type* device_input;
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_input,
+        input.size() * sizeof(typename decltype(input)::value_type)));
+    unsigned int* device_ranks;
+    HIP_CHECK(test_common_utils::hipMallocHelper(
+        &device_ranks,
+        ranks.size() * sizeof(typename decltype(ranks)::value_type)));
+
+    HIP_CHECK(
+        hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice));
+
+    HIP_CHECK(hipMemcpy(device_ranks,
+                        ranks.data(),
+                        ranks.size() * sizeof(unsigned int),
+                        hipMemcpyHostToDevice));
+
+    // Running kernel
+    constexpr unsigned int grid_size = (size / items_per_block);
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(
+            scatter_to_striped_no_output_param_kernel<type, items_per_block, items_per_thread>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input,
+        device_ranks);
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // Reading results
+    HIP_CHECK(hipMemcpy(input.data(),
+                        device_input,
+                        input.size() * sizeof(typename decltype(input)::value_type),
+                        hipMemcpyDeviceToHost));
+
+    for(size_t i = 0; i < size; i++)
+    {
+        ASSERT_EQ(test_utils::convert_to_native(input[i]),
+                  test_utils::convert_to_native(expected[i]));
+    }
+
+    HIP_CHECK(hipFree(device_input));
+    HIP_CHECK(hipFree(device_ranks));
+}
+
+template<typename T, size_t items_per_thread, size_t block_size>
+__global__
+void scatter_to_stripped_guarded_no_output_param_kernel(T* device_input, int* device_ranks)
+{
+    const size_t items_per_block = items_per_thread * block_size;
+    const size_t offset          = (blockIdx.x * items_per_block) + threadIdx.x * items_per_thread;
+
+    T   input[items_per_thread];
+    int ranks[items_per_thread];
+
+    for(size_t i = 0; i < items_per_thread; i++)
+    {
+        input[i] = device_input[offset + i];
+        ranks[i] = device_ranks[offset + i];
+    }
+    hipcub::BlockExchange<T, block_size, items_per_thread> exchange;
+    exchange.ScatterToStripedGuarded(input, ranks);
+
+    for(size_t i = 0; i < items_per_thread; i++)
+    {
+        device_input[offset + i] = (i == items_per_thread - 1) && (threadIdx.x == block_size - 1)
+                                       ? static_cast<T>(0)
+                                       : input[i];
+    }
+}
+
+TYPED_TEST(HipcubBlockExchangeTests, ScatterToStripedGuardedNoOutputParam)
+{
+    using type                        = typename TestFixture::params::type;
+    constexpr size_t block_size       = TestFixture::params::block_size;
+    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    constexpr size_t grid_size        = 113;
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size            = grid_size * items_per_block;
+
+    type* host_input    = new type[size];
+    type* host_expected = new type[size];
+    int*  host_ranks    = new int[size];
+
+    std::iota(host_input, host_input + size, 0);
+    for(size_t i = 0; i < grid_size; i++)
+    {
+        size_t offset = i * items_per_block;
+        std::iota(host_ranks + offset, host_ranks + offset + items_per_block - 1, 0);
+        std::shuffle(host_ranks + offset,
+                     host_ranks + offset + items_per_block - 1,
+                     std::mt19937{std::random_device{}()});
+    }
+    for(size_t i = items_per_block - 1; i < size; i += items_per_block)
+    {
+        host_ranks[i]    = -1;
+        host_expected[i] = static_cast<type>(0);
+    }
+
+    for(size_t bi = 0; bi < size / items_per_block; bi++)
+    {
+        for(size_t ti = 0; ti < block_size; ti++)
+        {
+            for(size_t ii = 0; ii < items_per_thread; ii++)
+            {
+                const size_t offset = bi * items_per_block;
+                const size_t i0     = offset + ti * items_per_thread + ii;
+                const size_t i1     = offset + host_ranks[i0] % block_size * items_per_thread
+                                  + host_ranks[i0] / block_size;
+                if(i1 >= 0 && i1 < size)
+                    host_expected[i1] = host_input[i0];
+            }
+        }
+    }
+
+    type* device_input;
+    int*  device_ranks;
+
+    HIP_CHECK(hipMalloc(&device_input, sizeof(type) * size));
+    HIP_CHECK(hipMalloc(&device_ranks, sizeof(int) * size));
+
+    HIP_CHECK(hipMemcpy(device_input, host_input, sizeof(type) * size, hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(device_ranks, host_ranks, sizeof(int) * size, hipMemcpyHostToDevice));
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(
+            scatter_to_stripped_guarded_no_output_param_kernel<type, items_per_thread, block_size>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input,
+        device_ranks);
+
+    type* host_output = new type[size];
+    HIP_CHECK(hipMemcpy(host_input, device_input, sizeof(type) * size, hipMemcpyDeviceToHost));
+
+    for(size_t i = 0; i < size; i++)
+        ASSERT_EQ(host_input[i], host_expected[i]);
+
+    delete[] host_input;
+    delete[] host_expected;
+    delete[] host_ranks;
+
+    HIP_CHECK(hipFree(device_input));
+    HIP_CHECK(hipFree(device_ranks));
+}
+
+template<typename T, size_t items_per_thread, size_t block_size>
+__global__
+void scatter_to_stripped_flagged_no_output_param_kernel(T*    device_input,
+                                                        int*  device_ranks,
+                                                        bool* device_flags)
+{
+    const size_t items_per_block = items_per_thread * block_size;
+    const size_t offset          = (blockIdx.x * items_per_block) + threadIdx.x * items_per_thread;
+
+    T    input[items_per_thread];
+    int  ranks[items_per_thread];
+    bool flags[items_per_thread];
+
+    for(size_t i = 0; i < items_per_thread; i++)
+    {
+        input[i] = device_input[offset + i];
+        ranks[i] = device_ranks[offset + i];
+        flags[i] = device_flags[offset + i];
+    }
+    hipcub::BlockExchange<T, block_size, items_per_thread> exchange;
+    exchange.ScatterToStripedFlagged(input, ranks, flags);
+
+    for(size_t i = 0; i < items_per_thread; i++)
+    {
+        device_input[offset + i] = (i == items_per_thread - 1) && (threadIdx.x == block_size - 1)
+                                       ? static_cast<T>(0)
+                                       : input[i];
+    }
+}
+
+TYPED_TEST(HipcubBlockExchangeTests, ScatterToStripedFlaggedNoOutputParam)
+{
+    using type                        = typename TestFixture::params::type;
+    constexpr size_t block_size       = TestFixture::params::block_size;
+    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    constexpr size_t grid_size        = 113;
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size            = grid_size * items_per_block;
+
+    type* host_input    = new type[size];
+    type* host_expected = new type[size];
+    int*  host_ranks    = new int[size];
+    bool* host_flags    = new bool[size];
+
+    std::iota(host_input, host_input + size, 0);
+    for(size_t i = 0; i < grid_size; i++)
+    {
+        size_t offset = i * items_per_block;
+        std::iota(host_ranks + offset, host_ranks + offset + items_per_block - 1, 0);
+        std::shuffle(host_ranks + offset,
+                     host_ranks + offset + items_per_block - 1,
+                     std::mt19937{std::random_device{}()});
+    }
+
+    for(size_t i = items_per_block - 1; i < size; i += items_per_block)
+    {
+        host_ranks[i]    = -1;
+        host_expected[i] = static_cast<type>(0);
+    }
+
+    for(size_t bi = 0; bi < size / items_per_block; bi++)
+    {
+        for(size_t ti = 0; ti < block_size; ti++)
+        {
+            for(size_t ii = 0; ii < items_per_thread; ii++)
+            {
+                const size_t offset = bi * items_per_block;
+                const size_t i0     = offset + ti * items_per_thread + ii;
+                const size_t i1     = offset + host_ranks[i0] % block_size * items_per_thread
+                                  + host_ranks[i0] / block_size;
+                if(i1 >= 0 && i1 < size)
+                    host_expected[i1] = host_input[i0];
+                host_flags[i0]
+                    = (ti == block_size - 1) && (ii == items_per_thread - 1) ? false : true;
+            }
+        }
+    }
+
+    for(size_t i = items_per_block - 1; i < size; i += items_per_block)
+        host_ranks[i] = 5;
+
+    type* device_input;
+    int*  device_ranks;
+    bool* device_flags;
+
+    HIP_CHECK(hipMalloc(&device_input, sizeof(type) * size));
+    HIP_CHECK(hipMalloc(&device_ranks, sizeof(int) * size));
+    HIP_CHECK(hipMalloc(&device_flags, sizeof(bool) * size));
+
+    HIP_CHECK(hipMemcpy(device_input, host_input, sizeof(type) * size, hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(device_ranks, host_ranks, sizeof(int) * size, hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(device_flags, host_flags, sizeof(bool) * size, hipMemcpyHostToDevice));
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(
+            scatter_to_stripped_flagged_no_output_param_kernel<type, items_per_thread, block_size>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input,
+        device_ranks,
+        device_flags);
+
+    HIP_CHECK(hipMemcpy(host_input, device_input, sizeof(type) * size, hipMemcpyDeviceToHost));
+
+    for(size_t i = 0; i < size; i++)
+        ASSERT_EQ(host_input[i], host_expected[i]);
+
+    delete[] host_input;
+    delete[] host_expected;
+    delete[] host_ranks;
+    delete[] host_flags;
+
+    HIP_CHECK(hipFree(device_input));
+    HIP_CHECK(hipFree(device_ranks));
+    HIP_CHECK(hipFree(device_flags));
+}
\ No newline at end of file
diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp
index e3f91c10e10..598aea81d13 100644
--- a/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp
+++ b/projects/hipcub/test/hipcub/test_hipcub_block_merge_sort.cpp
@@ -23,32 +23,34 @@
 #include "common_test_header.hpp"
 
 // hipcub API
-#include "hipcub/block/block_merge_sort.hpp"
 #include "hipcub/block/block_load.hpp"
+#include "hipcub/block/block_merge_sort.hpp"
 #include "hipcub/block/block_store.hpp"
 
+#include <algorithm>
+#include <string>
 
+#define ull unsigned long long
 
-template<
-    class Key,
-    class Value,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    class CompareFunction = test_utils::less,
-    bool ToStriped = false
->
+template<class Key,
+         class Value,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         class CompareFunction = test_utils::less,
+         bool ToStriped        = false>
 struct params
 {
-    using key_type = Key;
-    using value_type = Value;
-    static constexpr unsigned int block_size = BlockSize;
+    using key_type                                 = Key;
+    using value_type                               = Value;
+    static constexpr unsigned int block_size       = BlockSize;
     static constexpr unsigned int items_per_thread = ItemsPerThread;
-    using compare_function = CompareFunction;
-    static constexpr bool to_striped = ToStriped;
+    using compare_function                         = CompareFunction;
+    static constexpr bool to_striped               = ToStriped;
 };
 
 template<class Params>
-class HipcubBlockMergeSort : public ::testing::Test {
+class HipcubBlockMergeSort : public ::testing::Test
+{
 public:
     using params = Params;
 };
@@ -75,21 +77,14 @@ using Params = ::testing::Types<
 
 TYPED_TEST_SUITE(HipcubBlockMergeSort, Params);
 
-template<
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    class key_type,
-    typename CompareOp
->
+template<unsigned int BlockSize, unsigned int ItemsPerThread, class key_type, typename CompareOp>
 __global__
 __launch_bounds__(BlockSize)
-void sort_key_kernel(
-    key_type* device_keys_output,
-    CompareOp compare_op)
+void sort_key_kernel(key_type* device_keys_output, CompareOp compare_op)
 {
     constexpr unsigned int items_per_block = BlockSize * ItemsPerThread;
-    const unsigned int lid = hipThreadIdx_x;
-    const unsigned int block_offset = hipBlockIdx_x * items_per_block;
+    const unsigned int     lid             = hipThreadIdx_x;
+    const unsigned int     block_offset    = hipBlockIdx_x * items_per_block;
 
     key_type keys[ItemsPerThread];
     hipcub::LoadDirectBlocked(lid, device_keys_output + block_offset, keys);
@@ -106,23 +101,24 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeys)
     SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
     HIP_CHECK(hipSetDevice(device_id));
 
-    using key_type = typename TestFixture::params::key_type;
-    constexpr size_t block_size = TestFixture::params::block_size;
+    using key_type                    = typename TestFixture::params::key_type;
+    constexpr size_t block_size       = TestFixture::params::block_size;
     constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
-    using compare_function = typename TestFixture::params::compare_function;
-    constexpr size_t items_per_block = block_size * items_per_thread;
+    using compare_function            = typename TestFixture::params::compare_function;
+    constexpr size_t items_per_block  = block_size * items_per_thread;
     // Given block size not supported
     if(block_size > test_utils::get_max_block_size())
     {
         GTEST_SKIP();
     }
 
-    const size_t size = items_per_block * 1134;
+    const size_t size      = items_per_block * 1134;
     const size_t grid_size = size / items_per_block;
 
-    for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
-        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
         SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
 
         // Generate data
@@ -136,40 +132,35 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeys)
         std::vector<key_type> expected(keys_output);
         for(size_t i = 0; i < size / items_per_block; i++)
         {
-            std::stable_sort(
-                expected.begin() + (i * items_per_block),
-                expected.begin() + ((i + 1) * items_per_block),
-                compare_function()
-            );
+            std::stable_sort(expected.begin() + (i * items_per_block),
+                             expected.begin() + ((i + 1) * items_per_block),
+                             compare_function());
         }
 
         // Preparing device
         key_type* device_keys_output;
-        HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output, keys_output.size() * sizeof(key_type)));
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output,
+                                                     keys_output.size() * sizeof(key_type)));
 
-        HIP_CHECK(
-            hipMemcpy(
-                device_keys_output, keys_output.data(),
-                keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
-                hipMemcpyHostToDevice
-            )
-        );
+        HIP_CHECK(hipMemcpy(device_keys_output,
+                            keys_output.data(),
+                            keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
+                            hipMemcpyHostToDevice));
 
         // Running kernel
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(sort_key_kernel<block_size, items_per_thread, key_type>),
-            dim3(grid_size), dim3(block_size), 0, 0,
-            device_keys_output, compare_function()
-        );
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_key_kernel<block_size, items_per_thread, key_type>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_keys_output,
+                           compare_function());
 
         // Getting results to host
-        HIP_CHECK(
-            hipMemcpy(
-                keys_output.data(), device_keys_output,
-                keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
-                hipMemcpyDeviceToHost
-            )
-        );
+        HIP_CHECK(hipMemcpy(keys_output.data(),
+                            device_keys_output,
+                            keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
+                            hipMemcpyDeviceToHost));
 
         // Verifying results
         for(size_t i = 0; i < size; i++)
@@ -181,25 +172,146 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeys)
         HIP_CHECK(hipFree(device_keys_output));
     }
 }
-template<
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    class key_type,
-    class value_type,
-    class CompareOp
-    >
+
+template<typename T, size_t items_per_thread, size_t block_size, class CompareOp>
+__global__
+void sort_key_with_valid_items_kernel(T*        device_input,
+                                      CompareOp compare_op,
+                                      int       valid_items,
+                                      T         default_val)
+{
+    constexpr size_t items_per_block = items_per_thread * block_size;
+    const size_t     offset = (blockIdx.x * items_per_block) + (threadIdx.x * items_per_thread);
+
+    T input[items_per_thread];
+
+    for(size_t i = 0; i < items_per_thread; i++)
+        input[i] = device_input[offset + i];
+
+    hipcub::BlockMergeSort<T, block_size, items_per_thread> bsort;
+
+    bsort.Sort(input, compare_op, valid_items, default_val);
+
+    for(size_t i = 0; i < items_per_thread; i++)
+        device_input[offset + i] = input[i];
+}
+
+TYPED_TEST(HipcubBlockMergeSort, SortKeysWithValidItems)
+{
+
+    constexpr size_t block_size       = TestFixture::params::block_size;
+    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    using compare_function            = typename TestFixture::params::compare_function;
+    using T                           = typename TestFixture::params::key_type;
+    constexpr int items_per_block     = items_per_thread * block_size;
+    constexpr int grid_size           = 113;
+
+    auto compare_op = compare_function();
+
+    if(block_size > test_utils::get_max_block_size())
+    {
+        GTEST_SKIP();
+    }
+
+    constexpr size_t size = grid_size * items_per_block;
+
+    // minus|plus two to prevent overflow weirdness
+    const T mini = std::numeric_limits<T>::min() + static_cast<T>(2);
+    const T maxi = std::numeric_limits<T>::max() - static_cast<T>(2);
+
+    const T   default_val        = static_cast<T>(compare_op(mini, maxi) ? maxi : mini);
+    const int valid_items_arr[8] = {items_per_block / 2,
+                                    items_per_block / 3,
+                                    items_per_block / 4,
+                                    items_per_block / 5,
+                                    items_per_block - 10,
+                                    items_per_block - 5,
+                                    items_per_block - 2,
+                                    items_per_block - 1};
+
+    T* host_keys_input    = new T[size];
+    T* host_keys_output   = new T[size];
+    T* host_keys_expected = new T[size];
+
+    T* device_keys_input;
+    HIP_CHECK(hipMalloc(&device_keys_input, sizeof(T) * size));
+
+    for(size_t it = 0; it < 8; it++)
+    {
+        int valid_items = valid_items_arr[it];
+
+        // need to cast the 0 because of __half and bfloat16 types
+        T elem = static_cast<T>(0);
+        for(size_t i = 0; i < size; i++)
+        {
+            if(elem > maxi)
+                elem = static_cast<T>(0);
+            host_keys_input[i] = host_keys_expected[i] = elem++;
+        }
+
+        // filling in the default_val
+        for(size_t bI = 0; bI < grid_size; bI++)
+        {
+            size_t offset = (bI * items_per_block);
+            for(size_t i = valid_items; i < items_per_block; i++)
+                host_keys_expected[offset + i] = default_val;
+        }
+
+        // sorting the values
+        for(size_t bI = 0; bI < grid_size; bI++)
+        {
+            size_t offset = (bI * items_per_block);
+            std::sort(host_keys_expected + offset,
+                      host_keys_expected + offset + items_per_block,
+                      compare_op);
+        }
+
+        HIP_CHECK(
+            hipMemcpy(device_keys_input, host_keys_input, sizeof(T) * size, hipMemcpyHostToDevice));
+
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(sort_key_with_valid_items_kernel<T, items_per_thread, block_size>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_keys_input,
+            compare_op,
+            valid_items,
+            default_val);
+
+        HIP_CHECK(hipMemcpy(host_keys_output,
+                            device_keys_input,
+                            sizeof(T) * size,
+                            hipMemcpyDeviceToHost));
+
+        for(size_t i = 0; i < size; i++)
+            ASSERT_EQ(host_keys_expected[i], host_keys_output[i]);
+    }
+
+    delete[] host_keys_input;
+    delete[] host_keys_output;
+    delete[] host_keys_expected;
+
+    HIP_CHECK(hipFree(device_keys_input));
+}
+
+template<unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         class key_type,
+         class value_type,
+         class CompareOp>
 __global__
     __launch_bounds__(BlockSize)
-        void sort_key_value_kernel(
-            key_type* device_keys_output,
-            value_type* device_values_output,
-            CompareOp compare_op)
+void sort_key_value_kernel(key_type*   device_keys_output,
+                           value_type* device_values_output,
+                           CompareOp   compare_op)
 {
     constexpr unsigned int items_per_block = BlockSize * ItemsPerThread;
-    const unsigned int lid = hipThreadIdx_x;
-    const unsigned int block_offset = hipBlockIdx_x * items_per_block;
+    const unsigned int     lid             = hipThreadIdx_x;
+    const unsigned int     block_offset    = hipBlockIdx_x * items_per_block;
 
-    key_type keys[ItemsPerThread];
+    key_type   keys[ItemsPerThread];
     value_type values[ItemsPerThread];
     hipcub::LoadDirectBlocked(lid, device_keys_output + block_offset, keys);
     hipcub::LoadDirectBlocked(lid, device_values_output + block_offset, values);
@@ -217,24 +329,25 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeysValues)
     SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
     HIP_CHECK(hipSetDevice(device_id));
 
-    using key_type = typename TestFixture::params::key_type;
-    using value_type = typename TestFixture::params::value_type;
-    constexpr size_t block_size = TestFixture::params::block_size;
+    using key_type                    = typename TestFixture::params::key_type;
+    using value_type                  = typename TestFixture::params::value_type;
+    constexpr size_t block_size       = TestFixture::params::block_size;
     constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
-    using compare_function = typename TestFixture::params::compare_function;
-    constexpr size_t items_per_block = block_size * items_per_thread;
+    using compare_function            = typename TestFixture::params::compare_function;
+    constexpr size_t items_per_block  = block_size * items_per_thread;
     // Given block size not supported
     if(block_size > test_utils::get_max_block_size())
     {
         return;
     }
 
-    const size_t size = items_per_block * 1134;
+    const size_t size      = items_per_block * 1134;
     const size_t grid_size = size / items_per_block;
 
-    for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
-        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
         SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
 
         // Generate data
@@ -245,11 +358,11 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeysValues)
                                                             seed_value);
 
         std::vector<value_type> values_output;
-        values_output =
-            test_utils::get_random_data<value_type>(size,
-                                                    std::numeric_limits<value_type>::min(),
-                                                    std::numeric_limits<value_type>::max(),
-                                                    seed_value + seed_value_addition);
+        values_output
+            = test_utils::get_random_data<value_type>(size,
+                                                      std::numeric_limits<value_type>::min(),
+                                                      std::numeric_limits<value_type>::max(),
+                                                      seed_value + seed_value_addition);
 
         using key_value = std::pair<key_type, value_type>;
 
@@ -265,56 +378,292 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeysValues)
         {
             std::stable_sort(expected.begin() + (i * items_per_block),
                              expected.begin() + ((i + 1) * items_per_block),
-                             [compare_op](const key_value & a, const key_value & b)
-                             {
-                                 return compare_op(a.first, b.first);
-                             });
+                             [compare_op](const key_value& a, const key_value& b)
+                             { return compare_op(a.first, b.first); });
         }
 
         key_type* device_keys_output;
-        HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output, keys_output.size() * sizeof(key_type)));
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output,
+                                                     keys_output.size() * sizeof(key_type)));
         value_type* device_values_output;
-        HIP_CHECK(test_common_utils::hipMallocHelper(&device_values_output, values_output.size() * sizeof(value_type)));
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_values_output,
+                                                     values_output.size() * sizeof(value_type)));
 
-        HIP_CHECK(
-            hipMemcpy(
-                device_keys_output, keys_output.data(),
-                keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
-                hipMemcpyHostToDevice
-            )
-        );
+        HIP_CHECK(hipMemcpy(device_keys_output,
+                            keys_output.data(),
+                            keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
+                            hipMemcpyHostToDevice));
 
         HIP_CHECK(
-            hipMemcpy(
-                device_values_output, values_output.data(),
-                values_output.size() * sizeof(typename decltype(values_output)::value_type),
-                hipMemcpyHostToDevice
-            )
-        );
+            hipMemcpy(device_values_output,
+                      values_output.data(),
+                      values_output.size() * sizeof(typename decltype(values_output)::value_type),
+                      hipMemcpyHostToDevice));
 
         // Running kernel
         hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(sort_key_value_kernel<block_size, items_per_thread, key_type, value_type>),
-            dim3(grid_size), dim3(block_size), 0, 0,
-            device_keys_output, device_values_output, compare_op
-        );
+            HIP_KERNEL_NAME(
+                sort_key_value_kernel<block_size, items_per_thread, key_type, value_type>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_keys_output,
+            device_values_output,
+            compare_op);
 
         // Getting results to host
+        HIP_CHECK(hipMemcpy(keys_output.data(),
+                            device_keys_output,
+                            keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
+                            hipMemcpyDeviceToHost));
+
         HIP_CHECK(
-            hipMemcpy(
-                keys_output.data(), device_keys_output,
-                keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
-                hipMemcpyDeviceToHost
-            )
-        );
+            hipMemcpy(values_output.data(),
+                      device_values_output,
+                      values_output.size() * sizeof(typename decltype(values_output)::value_type),
+                      hipMemcpyDeviceToHost));
+
+        for(size_t i = 0; i < size; i++)
+        {
+            ASSERT_EQ(test_utils::convert_to_native(keys_output[i]),
+                      test_utils::convert_to_native(expected[i].first));
+            ASSERT_EQ(test_utils::convert_to_native(values_output[i]),
+                      test_utils::convert_to_native(expected[i].second));
+        }
+
+        HIP_CHECK(hipFree(device_keys_output));
+        HIP_CHECK(hipFree(device_values_output));
+    }
+}
+
+template<typename T, size_t items_per_thread, size_t block_size, class CompareOp>
+__global__
+void stable_sort_kernel(T* device_input, CompareOp compare_op)
+{
+    constexpr size_t items_per_block = items_per_thread * block_size;
+    const size_t     offset = (blockIdx.x * items_per_block) + (threadIdx.x * items_per_thread);
+
+    T input[items_per_thread];
+
+    for(size_t i = 0; i < items_per_thread; i++)
+        input[i] = device_input[offset + i];
+
+    hipcub::BlockMergeSort<T, block_size, items_per_thread, int> bsort;
+
+    bsort.StableSort(input,
+                     [&](const T& lhs, const T& rhs) { return compare_op(lhs.elem, rhs.elem); });
+
+    for(size_t i = 0; i < items_per_thread; i++)
+        device_input[offset + i] = input[i];
+}
+
+TYPED_TEST(HipcubBlockMergeSort, StableSort)
+{
+    constexpr size_t block_size       = TestFixture::params::block_size;
+    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    using T                           = typename TestFixture::params::key_type;
+    using compare_function            = typename TestFixture::params::compare_function;
+    constexpr size_t items_per_block  = items_per_thread * block_size;
+    constexpr size_t grid_size        = 113;
+    const size_t     size             = grid_size * items_per_block;
+
+    auto compare_op = compare_function();
+    if(block_size > test_utils::get_max_block_size())
+    {
+        GTEST_SKIP();
+    }
+    struct custom_type
+    {
+        T      elem;
+        size_t id;
+    };
+
+    custom_type* host_input    = new custom_type[size];
+    custom_type* host_expected = new custom_type[size];
+
+    //populate the inputs
+    for(size_t i = 0; i < size; i++)
+    {
+        if(i % 2)
+            host_expected[i] = host_input[i] = {static_cast<T>(i - 1), i};
+        else
+            host_expected[i] = host_input[i] = {static_cast<T>(i), i};
+    }
+
+    // get the expected result
+    for(size_t bI = 0; bI < grid_size; bI++)
+    {
+        size_t offset = (bI * items_per_block);
+        std::stable_sort(host_expected + offset,
+                         host_expected + offset + items_per_block,
+                         [&](const custom_type& lhs, const custom_type& rhs)
+                         { return compare_op(lhs.elem, rhs.elem); });
+    }
+    custom_type* device_input;
+
+    HIP_CHECK(hipMalloc(&device_input, sizeof(custom_type) * size));
+    HIP_CHECK(
+        hipMemcpy(device_input, host_input, sizeof(custom_type) * size, hipMemcpyHostToDevice));
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(stable_sort_kernel<custom_type, items_per_thread, block_size>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        0,
+        device_input,
+        compare_op);
+
+    HIP_CHECK(
+        hipMemcpy(host_input, device_input, sizeof(custom_type) * size, hipMemcpyDeviceToHost));
+
+    for(size_t i = 0; i < size; i++)
+    {
+        ASSERT_EQ(host_input[i].elem, host_expected[i].elem);
+        ASSERT_EQ(host_input[i].id, host_expected[i].id);
+    }
+
+    delete[] host_input;
+    delete[] host_expected;
+
+    HIP_CHECK(hipFree(device_input));
+}
+
+template<unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         class key_type,
+         class value_type,
+         class CompareOp>
+__global__
+    __launch_bounds__(BlockSize)
+void stable_sort_key_value_kernel(key_type*   device_keys_output,
+                                  value_type* device_values_output,
+                                  CompareOp   compare_op)
+{
+    constexpr unsigned int items_per_block = BlockSize * ItemsPerThread;
+    const unsigned int     lid             = hipThreadIdx_x;
+    const unsigned int     block_offset    = hipBlockIdx_x * items_per_block;
+
+    key_type   keys[ItemsPerThread];
+    value_type values[ItemsPerThread];
+    hipcub::LoadDirectBlocked(lid, device_keys_output + block_offset, keys);
+    hipcub::LoadDirectBlocked(lid, device_values_output + block_offset, values);
+
+    hipcub::BlockMergeSort<key_type, BlockSize, ItemsPerThread, value_type> bsort;
+    bsort.StableSort(keys, values, compare_op);
+
+    hipcub::StoreDirectBlocked(lid, device_keys_output + block_offset, keys);
+    hipcub::StoreDirectBlocked(lid, device_values_output + block_offset, values);
+}
+
+TYPED_TEST(HipcubBlockMergeSort, StableSortKeysValues)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using key_type                    = typename TestFixture::params::key_type;
+    using value_type                  = typename TestFixture::params::value_type;
+    constexpr size_t block_size       = TestFixture::params::block_size;
+    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    using compare_function            = typename TestFixture::params::compare_function;
+    constexpr size_t items_per_block  = block_size * items_per_thread;
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t size      = items_per_block * 1134;
+    const size_t grid_size = size / items_per_block;
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+        // Generate data
+        std::vector<key_type> keys_output;
+        keys_output = test_utils::get_random_data<key_type>(size,
+                                                            std::numeric_limits<key_type>::min(),
+                                                            std::numeric_limits<key_type>::max(),
+                                                            seed_value);
+
+        std::vector<value_type> values_output;
+        values_output
+            = test_utils::get_random_data<value_type>(size,
+                                                      std::numeric_limits<value_type>::min(),
+                                                      std::numeric_limits<value_type>::max(),
+                                                      seed_value + seed_value_addition);
+
+        // Set some keys to be the same, but have different values to test stability
+        for(size_t i = 0; i < 10; i++)
+        {
+            keys_output[i]   = static_cast<key_type>(0);
+            values_output[i] = static_cast<value_type>(i);
+        }
+
+        using key_value = std::pair<key_type, value_type>;
+
+        // Calculate expected results on host
+        std::vector<key_value> expected(size);
+        for(size_t i = 0; i < size; i++)
+        {
+            expected[i] = key_value(keys_output[i], values_output[i]);
+        }
+
+        compare_function compare_op;
+        for(size_t i = 0; i < size / items_per_block; i++)
+        {
+            std::stable_sort(expected.begin() + (i * items_per_block),
+                             expected.begin() + ((i + 1) * items_per_block),
+                             [compare_op](const key_value& a, const key_value& b)
+                             { return compare_op(a.first, b.first); });
+        }
+
+        key_type* device_keys_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output,
+                                                     keys_output.size() * sizeof(key_type)));
+        value_type* device_values_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_values_output,
+                                                     values_output.size() * sizeof(value_type)));
+
+        HIP_CHECK(hipMemcpy(device_keys_output,
+                            keys_output.data(),
+                            keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
+                            hipMemcpyHostToDevice));
 
         HIP_CHECK(
-            hipMemcpy(
-                values_output.data(), device_values_output,
-                values_output.size() * sizeof(typename decltype(values_output)::value_type),
-                hipMemcpyDeviceToHost
-            )
-        );
+            hipMemcpy(device_values_output,
+                      values_output.data(),
+                      values_output.size() * sizeof(typename decltype(values_output)::value_type),
+                      hipMemcpyHostToDevice));
+
+        // Running kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                stable_sort_key_value_kernel<block_size, items_per_thread, key_type, value_type>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_keys_output,
+            device_values_output,
+            compare_op);
+
+        // Getting results to host
+        HIP_CHECK(hipMemcpy(keys_output.data(),
+                            device_keys_output,
+                            keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
+                            hipMemcpyDeviceToHost));
+
+        HIP_CHECK(
+            hipMemcpy(values_output.data(),
+                      device_values_output,
+                      values_output.size() * sizeof(typename decltype(values_output)::value_type),
+                      hipMemcpyDeviceToHost));
 
         for(size_t i = 0; i < size; i++)
         {
@@ -328,3 +677,317 @@ TYPED_TEST(HipcubBlockMergeSort, SortKeysValues)
         HIP_CHECK(hipFree(device_values_output));
     }
 }
+
+template<typename T, size_t items_per_thread, size_t block_size, class CompareOp>
+__global__
+void stable_sort_key_with_valid_items_kernel(T*        device_input,
+                                             CompareOp compare_op,
+                                             int       valid_items,
+                                             T         default_val)
+{
+    constexpr size_t items_per_block = items_per_thread * block_size;
+    const size_t     offset = (blockIdx.x * items_per_block) + (threadIdx.x * items_per_thread);
+
+    T input[items_per_thread];
+
+    for(size_t i = 0; i < items_per_thread; i++)
+        input[i] = device_input[offset + i];
+
+    hipcub::BlockMergeSort<T, block_size, items_per_thread> bsort;
+
+    bsort.StableSort(
+        input,
+        [&](const T& lhs, const T& rhs) { return compare_op(lhs.elem, rhs.elem); },
+        valid_items,
+        default_val);
+
+    for(size_t i = 0; i < items_per_thread; i++)
+        device_input[offset + i] = input[i];
+}
+
+TYPED_TEST(HipcubBlockMergeSort, StableSortKeysWithValidItems)
+{
+
+    constexpr size_t block_size       = TestFixture::params::block_size;
+    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    using compare_function            = typename TestFixture::params::compare_function;
+    using T                           = typename TestFixture::params::key_type;
+    constexpr int items_per_block     = items_per_thread * block_size;
+    constexpr int grid_size           = 113;
+
+    auto compare_op = compare_function();
+
+    if(block_size > test_utils::get_max_block_size())
+    {
+        GTEST_SKIP();
+    }
+
+    struct custom_type
+    {
+        T      elem;
+        size_t id;
+    };
+
+    constexpr size_t size = grid_size * items_per_block;
+
+    // minus|plus two to prevent overflow weirdness
+    const T mini = std::numeric_limits<T>::min() + static_cast<T>(2);
+    const T maxi = std::numeric_limits<T>::max() - static_cast<T>(2);
+
+    const custom_type default_val = {static_cast<T>(compare_op(mini, maxi) ? maxi : mini), 0};
+    const int         valid_items_arr[8] = {items_per_block / 2,
+                                            items_per_block / 3,
+                                            items_per_block / 4,
+                                            items_per_block / 5,
+                                            items_per_block - 10,
+                                            items_per_block - 5,
+                                            items_per_block - 2,
+                                            items_per_block - 1};
+
+    custom_type* host_keys_input    = new custom_type[size];
+    custom_type* host_keys_output   = new custom_type[size];
+    custom_type* host_keys_expected = new custom_type[size];
+
+    custom_type* device_keys_input;
+    HIP_CHECK(hipMalloc(&device_keys_input, sizeof(custom_type) * size));
+
+    for(size_t it = 0; it < 8; it++)
+    {
+        int valid_items = valid_items_arr[it];
+
+        // need to cast 0 because of __half and bfloat16 types
+        T elem = static_cast<T>(0);
+        for(size_t i = 0; i < size; i++)
+        {
+            if(elem > maxi)
+                elem = static_cast<T>(0);
+
+            host_keys_input[i] = host_keys_expected[i] = {elem++, i};
+        }
+
+        // filling in the default_val
+        for(size_t bI = 0; bI < grid_size; bI++)
+        {
+            size_t offset = (bI * items_per_block);
+            for(size_t i = valid_items; i < items_per_block; i++)
+            {
+                host_keys_expected[offset + i] = default_val;
+            }
+        }
+
+        // sorting the values
+        for(size_t bI = 0; bI < grid_size; bI++)
+        {
+            size_t offset = (bI * items_per_block);
+            std::stable_sort(host_keys_expected + offset,
+                             host_keys_expected + offset + items_per_block,
+                             [&](const custom_type& lhs, const custom_type& rhs)
+                             { return compare_op(lhs.elem, rhs.elem); });
+        }
+
+        HIP_CHECK(hipMemcpy(device_keys_input,
+                            host_keys_input,
+                            sizeof(custom_type) * size,
+                            hipMemcpyHostToDevice));
+
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                stable_sort_key_with_valid_items_kernel<custom_type, items_per_thread, block_size>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_keys_input,
+            compare_op,
+            valid_items,
+            default_val);
+
+        HIP_CHECK(hipMemcpy(host_keys_output,
+                            device_keys_input,
+                            sizeof(custom_type) * size,
+                            hipMemcpyDeviceToHost));
+
+        for(size_t i = 0; i < size; i++)
+        {
+            ASSERT_EQ(host_keys_expected[i].elem, host_keys_output[i].elem);
+            ASSERT_EQ(host_keys_expected[i].id, host_keys_output[i].id);
+        }
+    }
+
+    delete[] host_keys_input;
+    delete[] host_keys_output;
+    delete[] host_keys_expected;
+
+    HIP_CHECK(hipFree(device_keys_input));
+}
+
+template<typename T, size_t items_per_thread, size_t block_size, class CompareOp>
+__global__
+void stable_sort_key_value_with_valid_items_kernel(T*        device_key_input,
+                                                   T*        device_value_input,
+                                                   CompareOp compare_op,
+                                                   int       valid_items,
+                                                   T         default_val)
+{
+    constexpr size_t items_per_block = items_per_thread * block_size;
+    const size_t     offset = (blockIdx.x * items_per_block) + (threadIdx.x * items_per_thread);
+
+    T key_input[items_per_thread];
+    T value_input[items_per_thread];
+
+    for(size_t i = 0; i < items_per_thread; i++)
+    {
+        key_input[i]   = device_key_input[offset + i];
+        value_input[i] = device_value_input[offset + i];
+    }
+
+    hipcub::BlockMergeSort<T, block_size, items_per_thread, T> bsort;
+
+    bsort.StableSort(key_input, value_input, compare_op, valid_items, default_val);
+
+    for(size_t i = 0; i < items_per_thread; i++)
+    {
+        device_key_input[offset + i]   = key_input[i];
+        device_value_input[offset + i] = value_input[i];
+    }
+}
+
+TYPED_TEST(HipcubBlockMergeSort, StableSortKeysValuesWithValidItems)
+{
+
+    constexpr size_t block_size       = TestFixture::params::block_size;
+    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    using compare_function            = typename TestFixture::params::compare_function;
+    using T                           = typename TestFixture::params::key_type;
+    constexpr int items_per_block     = items_per_thread * block_size;
+    constexpr int grid_size           = 113;
+
+    auto compare_op = compare_function();
+
+    if(block_size > test_utils::get_max_block_size())
+    {
+        GTEST_SKIP();
+    }
+
+    struct custom_type
+    {
+        T key;
+        T value;
+    };
+
+    constexpr size_t size = grid_size * items_per_block;
+
+    // minus|plus two to prevent overflow weirdness
+    const T mini = std::numeric_limits<T>::min() + static_cast<T>(2);
+    const T maxi = std::numeric_limits<T>::max() - static_cast<T>(2);
+
+    T         default_val        = static_cast<T>(compare_op(mini, maxi) ? maxi : mini);
+    const int valid_items_arr[8] = {items_per_block / 2,
+                                    items_per_block / 3,
+                                    items_per_block / 4,
+                                    items_per_block / 5,
+                                    items_per_block - 10,
+                                    items_per_block - 5,
+                                    items_per_block - 2,
+                                    items_per_block - 1};
+
+    custom_type* host_side_sort    = new custom_type[size];
+    T*           host_keys_input   = new T[size];
+    T*           host_values_input = new T[size];
+
+    T* host_keys_expected   = new T[size];
+    T* host_values_expected = new T[size];
+
+    T* device_keys_input;
+    T* device_values_input;
+    HIP_CHECK(hipMalloc(&device_keys_input, sizeof(T) * size));
+    HIP_CHECK(hipMalloc(&device_values_input, sizeof(T) * size));
+
+    std::random_device                     rd;
+    std::mt19937                           gen(rd());
+    std::uniform_real_distribution<double> dis(static_cast<double>(mini) + 2,
+                                               static_cast<double>(maxi) - 2);
+
+    for(size_t it = 0; it < 8; it++)
+    {
+        int valid_items = valid_items_arr[it];
+
+        // need to cast the 0 because of __half and bfloat16 types
+        T rIndex = static_cast<T>(0);
+        for(size_t i = 0; i < size; i++)
+        {
+            if(rIndex > maxi)
+                rIndex = static_cast<T>(0);
+
+            if(i % 2)
+            {
+                T oIndex          = rIndex - static_cast<T>(1);
+                host_side_sort[i] = {oIndex, static_cast<T>(dis(gen))};
+            }
+            else
+                host_side_sort[i] = {rIndex, static_cast<T>(dis(gen))};
+            host_keys_input[i]   = host_side_sort[i].key;
+            host_values_input[i] = host_side_sort[i].value;
+            rIndex++;
+        }
+
+        // filling in the default_val
+        for(size_t bI = 0; bI < grid_size; bI++)
+        {
+            size_t offset = (bI * items_per_block);
+            for(size_t i = valid_items; i < items_per_block; i++)
+            {
+                host_side_sort[offset + i].key = default_val;
+            }
+        }
+
+        // sorting the values
+        for(size_t bI = 0; bI < grid_size; bI++)
+        {
+            size_t offset = (bI * items_per_block);
+            std::stable_sort(host_side_sort + offset,
+                             host_side_sort + offset + items_per_block,
+                             [&](const custom_type& lhs, const custom_type& rhs)
+                             { return compare_op(lhs.key, rhs.key); });
+        }
+
+        HIP_CHECK(
+            hipMemcpy(device_keys_input, host_keys_input, sizeof(T) * size, hipMemcpyHostToDevice));
+        HIP_CHECK(hipMemcpy(device_values_input,
+                            host_values_input,
+                            sizeof(T) * size,
+                            hipMemcpyHostToDevice));
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                stable_sort_key_value_with_valid_items_kernel<T, items_per_thread, block_size>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_keys_input,
+            device_values_input,
+            compare_op,
+            valid_items,
+            default_val);
+
+        HIP_CHECK(
+            hipMemcpy(host_keys_input, device_keys_input, sizeof(T) * size, hipMemcpyDeviceToHost));
+        HIP_CHECK(hipMemcpy(host_values_input,
+                            device_values_input,
+                            sizeof(T) * size,
+                            hipMemcpyDeviceToHost));
+
+        for(size_t i = 0; i < size; i++)
+        {
+            ASSERT_EQ(host_side_sort[i].key, host_keys_input[i]);
+            ASSERT_EQ(host_side_sort[i].value, host_values_input[i]);
+        }
+    }
+
+    delete[] host_keys_input;
+    delete[] host_values_input;
+    delete[] host_side_sort;
+
+    HIP_CHECK(hipFree(device_keys_input));
+    HIP_CHECK(hipFree(device_values_input));
+}
\ No newline at end of file
diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp
index 472135e5620..ad47a54ce3e 100644
--- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp
+++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp
@@ -37,6 +37,8 @@
 #include "hipcub/block/block_store.hpp"
 #include "hipcub/util_type.hpp"
 
+#include <bitset>
+
 template<class Key,
          unsigned int BlockSize,
          unsigned int ItemsPerThread,
@@ -47,10 +49,10 @@ template<class Key,
 struct params
 {
     using key_type                                 = Key;
-    static constexpr unsigned int block_size = BlockSize;
+    static constexpr unsigned int block_size       = BlockSize;
     static constexpr unsigned int items_per_thread = ItemsPerThread;
     static constexpr bool         descending       = Descending;
-    static constexpr unsigned int start_bit = StartBit;
+    static constexpr unsigned int start_bit        = StartBit;
     static constexpr unsigned int max_radix_bits   = MaxRadixBits;
     static constexpr unsigned int radix_bits       = RadixBits;
 };
@@ -64,31 +66,208 @@ class HipcubBlockRadixRank : public ::testing::Test
 
 using Params = ::testing::Types<
     // Power of 2 BlockSize
-    params<unsigned int, 64U, 1>,
-    params<test_utils::half, 128U, 1>,
-    params<test_utils::bfloat16, 128U, 1>,
-    params<float, 256U, 1>,
-    params<unsigned short, 512U, 1, true>,
+    params<unsigned int, 128, 1>,
+    params<char, 128, 1>,
+    params<signed char, 128, 1>,
+    params<unsigned char, 128, 1>,
+    params<short, 128, 1>,
+    params<unsigned short, 128, 1>,
+    params<int, 128, 1>,
+    params<unsigned int, 128, 1>,
+    params<long, 128, 1>,
+    params<unsigned long, 128, 1>,
+    params<long long, 128, 1>,
+    params<unsigned long long, 128, 1>,
+    params<float, 128, 1>,
+    params<double, 128, 1>,
+    params<test_utils::half, 128, 1>,
+    params<test_utils::bfloat16, 128, 1>,
+    params<unsigned int, 128, 1, true>,
+    params<char, 128, 1, true>,
+    params<signed char, 128, 1, true>,
+    params<unsigned char, 128, 1, true>,
+    params<short, 128, 1, true>,
+    params<unsigned short, 128, 1, true>,
+    params<int, 128, 1, true>,
+    params<unsigned int, 128, 1, true>,
+    params<long, 128, 1, true>,
+    params<unsigned long, 128, 1, true>,
+    params<long long, 128, 1, true>,
+    params<unsigned long long, 128, 1, true>,
+    params<float, 128, 1, true>,
+    params<double, 128, 1, true>,
+    params<test_utils::half, 128, 1, true>,
+    params<test_utils::bfloat16, 128, 1, true>,
 
     // Non-power of 2 BlockSize
-    params<double, 65U, 1>,
-    params<float, 37U, 1>,
-    params<long long, 510U, 1, true>,
-    params<unsigned int, 162U, 1, false>,
-    params<unsigned char, 255U, 1>,
-
-    // Power of 2 BlockSize and ItemsPerThread > 1
-    params<unsigned long long, 64U, 2, true>,
-    params<int, 128U, 4>,
-    params<unsigned short, 256U, 7>,
-    params<float, 512U, 2, false>,
-
-    // Non-power of 2 BlockSize and ItemsPerThread > 1
-    params<double, 33U, 5>,
-    params<char, 464U, 2, true>,
-    params<unsigned short, 100U, 3>,
-    params<test_utils::half, 234U, 9>,
-    params<test_utils::bfloat16, 234U, 9>,
+    params<unsigned int, 141u, 1>,
+    params<char, 141u, 1>,
+    params<signed char, 141u, 1>,
+    params<unsigned char, 141u, 1>,
+    params<short, 141u, 1>,
+    params<unsigned short, 141u, 1>,
+    params<int, 141u, 1>,
+    params<unsigned int, 141u, 1>,
+    params<long, 141u, 1>,
+    params<unsigned long, 141u, 1>,
+    params<long long, 141u, 1>,
+    params<unsigned long long, 141u, 1>,
+    params<float, 141u, 1>,
+    params<double, 141u, 1>,
+    params<test_utils::half, 141u, 1>,
+    params<test_utils::bfloat16, 141u, 1>,
+    params<unsigned int, 141u, 1, true>,
+    params<char, 141u, 1, true>,
+    params<signed char, 141u, 1, true>,
+    params<unsigned char, 141u, 1, true>,
+    params<short, 141u, 1, true>,
+    params<unsigned short, 141u, 1, true>,
+    params<int, 141u, 1, true>,
+    params<unsigned int, 141u, 1, true>,
+    params<long, 141u, 1, true>,
+    params<unsigned long, 141u, 1, true>,
+    params<long long, 141u, 1, true>,
+    params<unsigned long long, 141u, 1, true>,
+    params<float, 141u, 1, true>,
+    params<double, 141u, 1, true>,
+    params<test_utils::half, 141u, 1, true>,
+    params<test_utils::bfloat16, 141u, 1, true>,
+
+    // Power of 2 BlockSize and ItemsPerThread > 1 and ItemsPerThread Power of 2
+    params<unsigned int, 64, 8>,
+    params<char, 64, 8>,
+    params<signed char, 64, 8>,
+    params<unsigned char, 64, 8>,
+    params<short, 64, 8>,
+    params<unsigned short, 64, 8>,
+    params<int, 64, 8>,
+    params<unsigned int, 64, 8>,
+    params<long, 64, 8>,
+    params<unsigned long, 64, 8>,
+    params<long long, 64, 8>,
+    params<unsigned long long, 64, 8>,
+    params<float, 64, 8>,
+    params<double, 64, 8>,
+    params<test_utils::half, 64, 8>,
+    params<test_utils::bfloat16, 64, 8>,
+    params<unsigned int, 64, 8, true>,
+    params<char, 64, 8, true>,
+    params<signed char, 64, 8, true>,
+    params<unsigned char, 64, 8, true>,
+    params<short, 64, 8, true>,
+    params<unsigned short, 64, 8, true>,
+    params<int, 64, 8, true>,
+    params<unsigned int, 64, 8, true>,
+    params<long, 64, 8, true>,
+    params<unsigned long, 64, 8, true>,
+    params<long long, 64, 8, true>,
+    params<unsigned long long, 64, 8, true>,
+    params<float, 64, 8, true>,
+    params<double, 64, 8, true>,
+    params<test_utils::half, 64, 8, true>,
+    params<test_utils::bfloat16, 64, 8, true>,
+
+    // Power of 2 BlockSize and ItemsPerThread > 1 and ItemsPerThread Non-power of 2
+    params<unsigned int, 64, 9>,
+    params<char, 64, 9>,
+    params<signed char, 64, 9>,
+    params<unsigned char, 64, 9>,
+    params<short, 64, 9>,
+    params<unsigned short, 64, 9>,
+    params<int, 64, 9>,
+    params<unsigned int, 64, 9>,
+    params<long, 64, 9>,
+    params<unsigned long, 64, 9>,
+    params<long long, 64, 9>,
+    params<unsigned long long, 64, 9>,
+    params<float, 64, 9>,
+    params<double, 64, 9>,
+    params<test_utils::half, 64, 9>,
+    params<test_utils::bfloat16, 64, 9>,
+    params<unsigned int, 64, 9, true>,
+    params<char, 64, 9, true>,
+    params<signed char, 64, 9, true>,
+    params<unsigned char, 64, 9, true>,
+    params<short, 64, 9, true>,
+    params<unsigned short, 64, 9, true>,
+    params<int, 64, 9, true>,
+    params<unsigned int, 64, 9, true>,
+    params<long, 64, 9, true>,
+    params<unsigned long, 64, 9, true>,
+    params<long long, 64, 9, true>,
+    params<unsigned long long, 64, 9, true>,
+    params<float, 64, 9, true>,
+    params<double, 64, 9, true>,
+    params<test_utils::half, 64, 9, true>,
+    params<test_utils::bfloat16, 64, 9, true>,
+
+    // Non-power of 2 BlockSize and ItemsPerThread > 1 and ItemsPerThread Power of 2
+    params<unsigned int, 92U, 8>,
+    params<char, 92U, 8>,
+    params<signed char, 92U, 8>,
+    params<unsigned char, 92U, 8>,
+    params<short, 92U, 8>,
+    params<unsigned short, 92U, 8>,
+    params<int, 92U, 8>,
+    params<unsigned int, 92U, 8>,
+    params<long, 92U, 8>,
+    params<unsigned long, 92U, 8>,
+    params<long long, 92U, 8>,
+    params<unsigned long long, 92U, 8>,
+    params<float, 92U, 8>,
+    params<double, 92U, 8>,
+    params<test_utils::half, 92U, 8>,
+    params<test_utils::bfloat16, 92U, 8>,
+    params<unsigned int, 92U, 8, true>,
+    params<char, 92U, 8, true>,
+    params<signed char, 92U, 8, true>,
+    params<unsigned char, 92U, 8, true>,
+    params<short, 92U, 8, true>,
+    params<unsigned short, 92U, 8, true>,
+    params<int, 92U, 8, true>,
+    params<unsigned int, 92U, 8, true>,
+    params<long, 92U, 8, true>,
+    params<unsigned long, 92U, 8, true>,
+    params<long long, 92U, 8, true>,
+    params<unsigned long long, 92U, 8, true>,
+    params<float, 92U, 8, true>,
+    params<double, 92U, 8, true>,
+    params<test_utils::half, 92U, 8, true>,
+    params<test_utils::bfloat16, 92U, 8, true>,
+
+    // Non-power of 2 BlockSize and ItemsPerThread > 1 and ItemsPerThread Non-power of 2
+    params<unsigned int, 92U, 5>,
+    params<char, 92U, 5>,
+    params<signed char, 92U, 5>,
+    params<unsigned char, 92U, 5>,
+    params<short, 92U, 5>,
+    params<unsigned short, 92U, 5>,
+    params<int, 92U, 5>,
+    params<unsigned int, 92U, 5>,
+    params<long, 92U, 5>,
+    params<unsigned long, 92U, 5>,
+    params<long long, 92U, 5>,
+    params<unsigned long long, 92U, 5>,
+    params<float, 92U, 5>,
+    params<double, 92U, 5>,
+    params<test_utils::half, 92U, 5>,
+    params<test_utils::bfloat16, 92U, 5>,
+    params<unsigned int, 92U, 5, true>,
+    params<char, 92U, 5, true>,
+    params<signed char, 92U, 5, true>,
+    params<unsigned char, 92U, 5, true>,
+    params<short, 92U, 5, true>,
+    params<unsigned short, 92U, 5, true>,
+    params<int, 92U, 5, true>,
+    params<unsigned int, 92U, 5, true>,
+    params<long, 92U, 5, true>,
+    params<unsigned long, 92U, 5, true>,
+    params<long long, 92U, 5, true>,
+    params<unsigned long long, 92U, 5, true>,
+    params<float, 92U, 5, true>,
+    params<double, 92U, 5, true>,
+    params<test_utils::half, 92U, 5, true>,
+    params<test_utils::bfloat16, 92U, 5, true>,
 
     // StartBit and MaxRadixBits
     params<unsigned long long, 64U, 1, false, 8, 5>,
@@ -115,10 +294,11 @@ template<unsigned int       BlockSize,
          bool               Descending,
          RadixRankAlgorithm Algorithm,
          typename KeyType>
-__global__ __launch_bounds__(BlockSize) void rank_kernel(const KeyType* keys_input,
-                                                         int*           ranks_output,
-                                                         unsigned int   start_bit,
-                                                         unsigned int   radix_bits)
+__global__ __launch_bounds__(BlockSize)
+void rank_kernel(const KeyType* keys_input,
+                 int*           ranks_output,
+                 unsigned int   start_bit,
+                 unsigned int   radix_bits)
 {
     constexpr bool warp_striped = Algorithm == RadixRankAlgorithm::RADIX_RANK_MATCH;
 
@@ -189,15 +369,15 @@ void test_radix_rank()
     SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
     HIP_CHECK(hipSetDevice(device_id));
 
-    using key_type = typename TestFixture::params::key_type;
-    constexpr size_t block_size = TestFixture::params::block_size;
-    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
-    constexpr bool descending = TestFixture::params::descending;
+    using key_type                          = typename TestFixture::params::key_type;
+    constexpr size_t       block_size       = TestFixture::params::block_size;
+    constexpr size_t       items_per_thread = TestFixture::params::items_per_thread;
+    constexpr bool         descending       = TestFixture::params::descending;
     constexpr unsigned int max_radix_bits   = TestFixture::params::max_radix_bits;
-    constexpr unsigned int start_bit = TestFixture::params::start_bit;
+    constexpr unsigned int start_bit        = TestFixture::params::start_bit;
     constexpr unsigned int radix_bits       = TestFixture::params::radix_bits;
     constexpr unsigned     end_bit          = start_bit + radix_bits;
-    constexpr size_t items_per_block = block_size * items_per_thread;
+    constexpr size_t       items_per_block  = block_size * items_per_thread;
 
     static_assert(radix_bits <= max_radix_bits,
                   "radix_bits must be less than or equal to max_radix_bits");
@@ -209,13 +389,15 @@ void test_radix_rank()
     }
 
     const size_t grid_size = 42;
-    const size_t size = items_per_block * grid_size;
+    const size_t size      = items_per_block * grid_size;
 
-    SCOPED_TRACE(testing::Message() << "with items_per_block= " << items_per_block << " size=" << size);
+    SCOPED_TRACE(testing::Message()
+                 << "with items_per_block= " << items_per_block << " size=" << size);
 
-    for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
-        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
         SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
 
         // Generate data
@@ -334,3 +516,267 @@ TYPED_TEST(HipcubBlockRadixRank, BlockRadixRankMatch)
 
     test_radix_rank<TestFixture, RadixRankAlgorithm::RADIX_RANK_MATCH>();
 }
+
+template<unsigned int       BlockSize,
+         unsigned int       ItemsPerThread,
+         unsigned int       RadixBits,
+         bool               Descending,
+         RadixRankAlgorithm Algorithm,
+         typename KeyType>
+__global__ __launch_bounds__(BlockSize)
+void rank_with_prefix_sum_kernel(const KeyType* keys_input,
+                                 int*           ranks_output,
+                                 int*           prefix_sum_output,
+                                 unsigned int   start_bit)
+{
+    constexpr bool warp_striped = Algorithm == RadixRankAlgorithm::RADIX_RANK_MATCH;
+
+    using KeyTraits      = hipcub::Traits<KeyType>;
+    using UnsignedBits   = typename KeyTraits::UnsignedBits;
+    using DigitExtractor = hipcub::BFEDigitExtractor<KeyType>;
+    using RankType       = std::conditional_t<
+        Algorithm == RadixRankAlgorithm::RADIX_RANK_MATCH,
+        hipcub::BlockRadixRankMatch<BlockSize, RadixBits, Descending>,
+        hipcub::BlockRadixRank<BlockSize,
+                               RadixBits,
+                               Descending,
+                               Algorithm == RadixRankAlgorithm::RADIX_RANK_MEMOIZE>>;
+
+    using KeyExchangeType  = hipcub::BlockExchange<KeyType, BlockSize, ItemsPerThread>;
+    using RankExchangeType = hipcub::BlockExchange<int, BlockSize, ItemsPerThread>;
+
+    constexpr unsigned int items_per_block = BlockSize * ItemsPerThread;
+    const unsigned int     lid             = hipThreadIdx_x;
+    const unsigned int     block_offset    = hipBlockIdx_x * items_per_block;
+
+    __shared__ union
+    {
+        typename KeyExchangeType::TempStorage  key_exchange;
+        typename RankType::TempStorage         rank;
+        typename RankExchangeType::TempStorage rank_exchange;
+    } storage;
+
+    KeyType keys[ItemsPerThread];
+    hipcub::LoadDirectBlocked(lid, keys_input + block_offset, keys);
+
+    if(warp_striped)
+    {
+        KeyExchangeType exchange(storage.key_exchange);
+        exchange.BlockedToWarpStriped(keys, keys);
+        __syncthreads();
+    }
+
+    UnsignedBits(&unsigned_keys)[ItemsPerThread]
+        = reinterpret_cast<UnsignedBits(&)[ItemsPerThread]>(keys);
+
+#pragma unroll
+    for(unsigned int key = 0; key < ItemsPerThread; key++)
+    {
+        unsigned_keys[key] = KeyTraits::TwiddleIn(unsigned_keys[key]);
+    }
+
+    RankType             rank(storage.rank);
+    const auto           bins_tracked_per_thread = rank.BINS_TRACKED_PER_THREAD;
+    const DigitExtractor digit_extractor(start_bit, RadixBits);
+    int                  ranks[ItemsPerThread];
+
+    int prefix_sum_storage[bins_tracked_per_thread];
+
+    rank.RankKeys(unsigned_keys, ranks, digit_extractor, prefix_sum_storage);
+
+    if(warp_striped)
+    {
+        __syncthreads();
+        RankExchangeType exchange(storage.rank_exchange);
+        exchange.WarpStripedToBlocked(ranks, ranks);
+    }
+
+    hipcub::StoreDirectBlocked(lid, ranks_output + block_offset, ranks);
+
+    const size_t pfs_size       = (1 << RadixBits);
+    const size_t pfs_offset     = (blockIdx.x * pfs_size) + (threadIdx.x * bins_tracked_per_thread);
+    const size_t pfs_total_size = pfs_size * blockDim.x;
+
+    for(size_t i = 0; i < bins_tracked_per_thread; i++)
+    {
+        if((threadIdx.x * bins_tracked_per_thread) + i < pfs_size)
+            prefix_sum_output[pfs_offset + i] = prefix_sum_storage[i];
+    }
+}
+
+template<typename TestFixture, RadixRankAlgorithm Algorithm>
+void test_radix_rank_with_prefix_sum_output()
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using key_type                          = typename TestFixture::params::key_type;
+    constexpr size_t       block_size       = TestFixture::params::block_size;
+    constexpr size_t       items_per_thread = TestFixture::params::items_per_thread;
+    constexpr bool         descending       = TestFixture::params::descending;
+    constexpr unsigned int start_bit        = TestFixture::params::start_bit;
+    constexpr unsigned int radix_bits       = TestFixture::params::max_radix_bits;
+    constexpr unsigned     end_bit          = start_bit + radix_bits;
+    constexpr size_t       items_per_block  = block_size * items_per_thread;
+
+    if constexpr(std::is_same_v<key_type, unsigned long long>)
+    {
+
+        // Given block size not supported
+        if(block_size > test_utils::get_max_block_size())
+        {
+            return;
+        }
+
+        const size_t grid_size           = 42;
+        const size_t pfs_items_per_block = (1 << radix_bits);
+        const size_t pfs_size            = pfs_items_per_block * grid_size;
+        const size_t size                = items_per_block * grid_size;
+
+        SCOPED_TRACE(testing::Message()
+                     << "with items_per_block= " << items_per_block << " size=" << size);
+
+        for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+        {
+            unsigned int seed_value
+                = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+            SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+            // Generate data
+            std::vector<key_type> keys_input;
+
+            keys_input = test_utils::get_random_data<key_type>(
+                size,
+                test_utils::numeric_limits<key_type>::min(),
+                test_utils::numeric_limits<key_type>::max(),
+                seed_value);
+
+            test_utils::add_special_values(keys_input, seed_value);
+
+            // Calculate expected results on host
+            union converter
+            {
+                key_type in;
+                uint64_t out;
+            } c;
+            std::vector<int> expected(keys_input.size());
+            std::vector<int> pfs_expected(pfs_size, 0);
+            for(size_t i = 0; i < grid_size; i++)
+            {
+                size_t     block_offset = i * items_per_block;
+                const auto key_cmp
+                    = test_utils::key_comparator<key_type, descending, start_bit, end_bit>();
+
+                // Perform an 'argsort', which gives a sorted sequence of indices into `keys_input`.
+                std::vector<int> indices(items_per_block);
+                std::iota(indices.begin(), indices.end(), 0);
+                std::stable_sort(indices.begin(),
+                                 indices.end(),
+                                 [&](const int& i, const int& j) {
+                                     return key_cmp(keys_input[block_offset + i],
+                                                    keys_input[block_offset + j]);
+                                 });
+
+                // Invert the sorted indices sequence to obtain the ranks.
+                for(size_t j = 0; j < indices.size(); ++j)
+                {
+                    expected[block_offset + indices[j]] = static_cast<int>(j);
+                }
+
+                /* Calculating the prefix sun on host */
+                size_t pfs_offset = i * pfs_items_per_block;
+
+                std::vector<int> histogram(pfs_items_per_block, 0);
+
+                for(size_t ii = 0; ii < items_per_block; ii++)
+                {
+                    c.in             = keys_input[block_offset + ii];
+                    uint64_t bit_rep = c.out;
+
+                    bit_rep >>= start_bit;
+                    bit_rep &= ((1 << radix_bits) - 1);
+
+                    if(descending)
+                        bit_rep = (1 << radix_bits) - (1 + bit_rep); //flip it
+
+                    ++histogram[bit_rep];
+                }
+                std::exclusive_scan(histogram.begin(),
+                                    histogram.end(),
+                                    pfs_expected.begin() + pfs_offset,
+                                    0);
+            }
+
+            // Preparing device
+            key_type* d_keys_input;
+            int*      d_ranks_output;
+            int*      d_prefix_sum_output;
+            HIP_CHECK(hipMalloc(&d_keys_input, keys_input.size() * sizeof(key_type)));
+            HIP_CHECK(hipMalloc(&d_ranks_output, expected.size() * sizeof(int)));
+            HIP_CHECK(hipMalloc(&d_prefix_sum_output, pfs_size * sizeof(int)));
+
+            HIP_CHECK(hipMemcpy(d_keys_input,
+                                keys_input.data(),
+                                keys_input.size() * sizeof(key_type),
+                                hipMemcpyHostToDevice));
+
+            // Running kernel
+            hipLaunchKernelGGL(HIP_KERNEL_NAME(rank_with_prefix_sum_kernel<block_size,
+                                                                           items_per_thread,
+                                                                           radix_bits,
+                                                                           descending,
+                                                                           Algorithm,
+                                                                           key_type>),
+                               dim3(grid_size),
+                               dim3(block_size),
+                               0,
+                               0,
+                               d_keys_input,
+                               d_ranks_output,
+                               d_prefix_sum_output,
+                               start_bit);
+
+            // Getting results to host
+            std::vector<int> ranks_output(expected.size());
+            std::vector<int> prefix_sum_output(pfs_size);
+            HIP_CHECK(hipMemcpy(ranks_output.data(),
+                                d_ranks_output,
+                                ranks_output.size() * sizeof(int),
+                                hipMemcpyDeviceToHost));
+
+            HIP_CHECK(hipMemcpy(prefix_sum_output.data(),
+                                d_prefix_sum_output,
+                                prefix_sum_output.size() * sizeof(int),
+                                hipMemcpyDeviceToHost));
+
+            // Verifying results
+            for(size_t i = 0; i < size; i++)
+            {
+                SCOPED_TRACE(testing::Message() << "with index= " << i);
+                ASSERT_EQ(ranks_output[i], expected[i]);
+
+                if(i < pfs_size)
+                    ASSERT_EQ(prefix_sum_output[i], pfs_expected[i]);
+            }
+
+            HIP_CHECK(hipFree(d_keys_input));
+            HIP_CHECK(hipFree(d_ranks_output));
+        }
+    }
+}
+
+TYPED_TEST(HipcubBlockRadixRank, BlockRadixRankBasicWithPrefixSumOutput)
+{
+    test_radix_rank_with_prefix_sum_output<TestFixture, RadixRankAlgorithm::RADIX_RANK_BASIC>();
+}
+
+TYPED_TEST(HipcubBlockRadixRank, BlockRadixRankMemoizeWithPrefixSumOutput)
+{
+    test_radix_rank_with_prefix_sum_output<TestFixture, RadixRankAlgorithm::RADIX_RANK_MEMOIZE>();
+}
+
+TYPED_TEST(HipcubBlockRadixRank, BlockRadixRankMatchWithPrefixSumOutput)
+{
+    test_radix_rank_with_prefix_sum_output<TestFixture, RadixRankAlgorithm::RADIX_RANK_MATCH>();
+}
\ No newline at end of file
diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp
index 974686945d7..15ee9d9ab8b 100644
--- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp
+++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp
@@ -31,30 +31,29 @@
 
 #include <cstdint>
 
-template<
-    class Key,
-    class Value,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    bool Descending = false,
-    bool ToStriped = false,
-    unsigned int StartBit = 0,
-    unsigned int EndBit = sizeof(Key) * 8
->
+template<class Key,
+         class Value,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         bool         Descending = false,
+         bool         ToStriped  = false,
+         unsigned int StartBit   = 0,
+         unsigned int EndBit     = sizeof(Key) * 8>
 struct params
 {
-    using key_type = Key;
-    using value_type = Value;
-    static constexpr unsigned int block_size = BlockSize;
+    using key_type                                 = Key;
+    using value_type                               = Value;
+    static constexpr unsigned int block_size       = BlockSize;
     static constexpr unsigned int items_per_thread = ItemsPerThread;
-    static constexpr bool descending = Descending;
-    static constexpr bool to_striped = ToStriped;
-    static constexpr unsigned int start_bit = StartBit;
-    static constexpr unsigned int end_bit = EndBit;
+    static constexpr bool         descending       = Descending;
+    static constexpr bool         to_striped       = ToStriped;
+    static constexpr unsigned int start_bit        = StartBit;
+    static constexpr unsigned int end_bit          = EndBit;
 };
 
 template<class Params>
-class HipcubBlockRadixSort : public ::testing::Test {
+class HipcubBlockRadixSort : public ::testing::Test
+{
 public:
     using params = Params;
 };
@@ -65,45 +64,133 @@ using Params = ::testing::Types<
     params<__int128_t, __int128_t, 64U, 1>,
     params<__uint128_t, __uint128_t, 64U, 1>,
 #endif
+    params<char, int, 64U, 1>,
+    params<unsigned char, int, 64U, 1>,
+    params<int, int, 64U, 1>,
     params<unsigned int, int, 64U, 1>,
-    params<int, int, 128U, 1>,
-    params<unsigned int, int, 256U, 1>,
-    params<unsigned short, char, 1024U, 1, true>,
+    params<long, int, 64U, 1>,
+    params<unsigned long, int, 64U, 1>,
+    params<float, int, 64U, 1>,
+    params<double, int, 64U, 1>,
+    params<test_utils::custom_test_type<uint8_t>, float, 64U, 1>,
+    params<test_utils::custom_test_type<int16_t>, int, 64U, 1>,
+    params<test_utils::custom_test_type<float>, int, 64U, 1>,
 
     // Non-power of 2 BlockSize
-    params<double, unsigned int, 65U, 1>,
-    params<float, int, 37U, 1>,
-    params<test_utils::bfloat16, int, 37U, 1>,
-    params<test_utils::half, int, 37U, 1>,
-    params<long long, char, 510U, 1, true>,
-    params<unsigned int, long long, 162U, 1, false, true>,
-    params<unsigned char, float, 255U, 1>,
-
-    // Power of 2 BlockSize and ItemsPerThread > 1
-    params<float, char, 64U, 2, true>,
-    params<int, short, 128U, 4>,
-    params<unsigned short, char, 256U, 7>,
-
-    // Non-power of 2 BlockSize and ItemsPerThread > 1
-    params<double, int, 33U, 5>,
-    params<char, double, 464U, 2, true, true>,
-    params<unsigned short, int, 100U, 3>,
-    params<short, int, 234U, 9>,
-
-    // StartBit and EndBit
-    params<unsigned long long, char, 64U, 1, false, false, 8, 20>,
-    params<unsigned short, int, 102U, 3, true, false, 4, 10>,
-    params<unsigned int, short, 162U, 2, true, true, 3, 12>,
-
-    // Stability (a number of key values is lower than BlockSize * ItemsPerThread: some keys appear
-    // multiple times with different values or key parts outside [StartBit, EndBit))
-    params<unsigned char, int, 512U, 2, false, true>,
-    params<unsigned short, double, 60U, 1, true, false, 8, 11>,
-
-    // Sorting keys of a custom type with a custom decomposer
-    params<test_utils::custom_test_type<int16_t>, int, 128, 4>,
-    params<test_utils::custom_test_type<float>, int, 129, 2, true, false>,
-    params<test_utils::custom_test_type<uint8_t>, float, 255, 1, false, true, 1, 12>>;
+    params<char, int, 63U, 1>,
+    params<unsigned char, int, 63U, 1>,
+    params<int, int, 63U, 1>,
+    params<unsigned int, int, 63U, 1>,
+    params<long, int, 63U, 1>,
+    params<unsigned long, int, 63U, 1>,
+    params<float, int, 63U, 1>,
+    params<double, int, 63U, 1>,
+    params<test_utils::custom_test_type<uint8_t>, float, 63U, 1>,
+    params<test_utils::custom_test_type<int16_t>, int, 63U, 1>,
+    params<test_utils::custom_test_type<float>, int, 63U, 1>,
+
+    // Power of 2 BlockSize and ItemsPerThread > 1 and ItemsPerThread Power of 2
+    params<char, int, 64U, 4>,
+    params<unsigned char, int, 64U, 4>,
+    params<int, int, 64U, 4>,
+    params<unsigned int, int, 64U, 4>,
+    params<long, int, 64U, 4>,
+    params<unsigned long, int, 64U, 4>,
+    params<float, int, 64U, 4>,
+    params<double, int, 64U, 4>,
+    params<test_utils::custom_test_type<uint8_t>, float, 64U, 4>,
+    params<test_utils::custom_test_type<int16_t>, int, 64U, 4>,
+    params<test_utils::custom_test_type<float>, int, 64U, 4>,
+
+    // Power of 2 BlockSize and ItemsPerThread > 1 and ItemsPerThread Non-power of 2
+    params<char, int, 64U, 3>,
+    params<unsigned char, int, 64U, 3>,
+    params<int, int, 64U, 3>,
+    params<unsigned int, int, 64U, 3>,
+    params<long, int, 64U, 3>,
+    params<unsigned long, int, 64U, 3>,
+    params<float, int, 64U, 3>,
+    params<double, int, 64U, 3>,
+    params<test_utils::custom_test_type<uint8_t>, float, 64U, 3>,
+    params<test_utils::custom_test_type<int16_t>, int, 64U, 3>,
+    params<test_utils::custom_test_type<float>, int, 64U, 3>,
+
+    // Non-power of 2 BlockSize and ItemsPerThread > 1 and ItemsPerThread Power of 2
+    params<char, int, 63U, 4>,
+    params<unsigned char, int, 63U, 4>,
+    params<int, int, 63U, 4>,
+    params<unsigned int, int, 63U, 4>,
+    params<long, int, 63U, 4>,
+    params<unsigned long, int, 63U, 4>,
+    params<float, int, 63U, 4>,
+    params<double, int, 63U, 4>,
+    params<test_utils::custom_test_type<uint8_t>, float, 63U, 4>,
+    params<test_utils::custom_test_type<int16_t>, int, 63U, 4>,
+    params<test_utils::custom_test_type<float>, int, 63U, 4>,
+
+    // Non-power of 2 BlockSize and ItemsPerThread > 1 and ItemsPerThread Non-power of 2
+    params<char, int, 63U, 3>,
+    params<unsigned char, int, 63U, 3>,
+    params<int, int, 63U, 3>,
+    params<unsigned int, int, 63U, 3>,
+    params<long, int, 63U, 3>,
+    params<unsigned long, int, 63U, 3>,
+    params<float, int, 63U, 3>,
+    params<double, int, 63U, 3>,
+    params<test_utils::custom_test_type<uint8_t>, float, 63U, 3>,
+    params<test_utils::custom_test_type<int16_t>, int, 63U, 3>,
+    params<test_utils::custom_test_type<float>, int, 63U, 3>,
+
+    // Sort with Striped arangement
+    params<char, int, 64U, 4, false, true>,
+    params<unsigned char, int, 64U, 4, false, true>,
+    params<int, int, 64U, 4, false, true>,
+    params<unsigned int, int, 64U, 4, false, true>,
+    params<long, int, 64U, 4, false, true>,
+    params<unsigned long, int, 64U, 4, false, true>,
+    params<float, int, 64U, 4, false, true>,
+    params<double, int, 64U, 4, false, true>,
+    params<test_utils::custom_test_type<uint8_t>, float, 64U, 4, false, true>,
+    params<test_utils::custom_test_type<int16_t>, int, 64U, 4, false, true>,
+    params<test_utils::custom_test_type<float>, int, 64U, 4, false, true>,
+
+    // Sort in Descending order
+    params<char, int, 64U, 4, true>,
+    params<unsigned char, int, 64U, 4, true>,
+    params<int, int, 64U, 4, true>,
+    params<unsigned int, int, 64U, 4, true>,
+    params<long, int, 64U, 4, true>,
+    params<unsigned long, int, 64U, 4, true>,
+    params<float, int, 64U, 4, true>,
+    params<double, int, 64U, 4, true>,
+    params<test_utils::custom_test_type<uint8_t>, float, 64U, 4, true>,
+    params<test_utils::custom_test_type<int16_t>, int, 64U, 4, true>,
+    params<test_utils::custom_test_type<float>, int, 64U, 4, true>,
+    params<char, int, 64U, 4, true, true>,
+    params<unsigned char, int, 64U, 4, true, true>,
+    params<int, int, 64U, 4, true, true>,
+    params<unsigned int, int, 64U, 4, true, true>,
+    params<long, int, 64U, 4, true, true>,
+    params<unsigned long, int, 64U, 4, true, true>,
+    params<float, int, 64U, 4, true, true>,
+    params<double, int, 64U, 4, true, true>,
+    params<test_utils::custom_test_type<uint8_t>, float, 64U, 4, true, true>,
+    params<test_utils::custom_test_type<int16_t>, int, 64U, 4, true, true>,
+    params<test_utils::custom_test_type<float>, int, 64U, 4, true, true>,
+
+    // Sort with
+    params<test_utils::custom_test_type<uint8_t>, float, 64U, 3, false, false, 1, 7>,
+    params<test_utils::custom_test_type<int16_t>, int, 64U, 3, false, false, 1, 7>,
+    params<test_utils::custom_test_type<float>, int, 64U, 3, false, false, 1, 7>,
+    params<test_utils::custom_test_type<uint8_t>, float, 64U, 4, false, true, 1, 7>,
+    params<test_utils::custom_test_type<int16_t>, int, 64U, 4, false, true, 1, 7>,
+    params<test_utils::custom_test_type<float>, int, 64U, 4, false, true, 1, 7>,
+    params<test_utils::custom_test_type<uint8_t>, float, 64U, 4, true, false, 1, 7>,
+    params<test_utils::custom_test_type<int16_t>, int, 64U, 4, true, false, 1, 7>,
+    params<test_utils::custom_test_type<float>, int, 64U, 4, true, false, 1, 7>,
+    params<test_utils::custom_test_type<uint8_t>, float, 64U, 4, true, true, 1, 7>,
+    params<test_utils::custom_test_type<int16_t>, int, 64U, 4, true, true, 1, 7>,
+    params<test_utils::custom_test_type<float>, int, 64U, 4, true, true, 1, 7>>;
 
 TYPED_TEST_SUITE(HipcubBlockRadixSort, Params);
 
@@ -114,7 +201,8 @@ template<>
 struct SortDispatch<false, false>
 {
     template<class BlockSort, class... Args>
-    __device__ static void sort(BlockSort&& block_sort, Args&&... args)
+    __device__
+    static void sort(BlockSort&& block_sort, Args&&... args)
     {
         block_sort.Sort(std::forward<Args>(args)...);
     }
@@ -124,7 +212,8 @@ template<>
 struct SortDispatch<false, true>
 {
     template<class BlockSort, class... Args>
-    __device__ static void sort(BlockSort&& block_sort, Args&&... args)
+    __device__
+    static void sort(BlockSort&& block_sort, Args&&... args)
     {
         block_sort.SortDescending(std::forward<Args>(args)...);
     }
@@ -134,7 +223,8 @@ template<>
 struct SortDispatch<true, false>
 {
     template<class BlockSort, class... Args>
-    __device__ static void sort(BlockSort&& block_sort, Args&&... args)
+    __device__
+    static void sort(BlockSort&& block_sort, Args&&... args)
     {
         block_sort.SortBlockedToStriped(std::forward<Args>(args)...);
     }
@@ -144,7 +234,8 @@ template<>
 struct SortDispatch<true, true>
 {
     template<class BlockSort, class... Args>
-    __device__ static void sort(BlockSort&& block_sort, Args&&... args)
+    __device__
+    static void sort(BlockSort&& block_sort, Args&&... args)
     {
         block_sort.SortDescendingBlockedToStriped(std::forward<Args>(args)...);
     }
@@ -156,22 +247,25 @@ struct SortOp
     using dispatch_t = SortDispatch<Striped, Descending>;
 
     template<class Key>
-    __device__ void operator()(Key (&keys)[ItemsPerThread], int start_bit, int end_bit) const
+    __device__
+    void operator()(Key (&keys)[ItemsPerThread], int start_bit, int end_bit) const
     {
         hipcub::BlockRadixSort<Key, BlockSize, ItemsPerThread> block_sort;
         if(start_bit == 0 && end_bit == sizeof(Key) * 8)
         {
             dispatch_t::sort(block_sort, keys);
-        } else
+        }
+        else
         {
             dispatch_t::sort(block_sort, keys, start_bit, end_bit);
         }
     }
 
     template<class InnerT>
-    __device__ void operator()(test_utils::custom_test_type<InnerT> (&keys)[ItemsPerThread],
-                               int start_bit,
-                               int end_bit) const
+    __device__
+    void operator()(test_utils::custom_test_type<InnerT> (&keys)[ItemsPerThread],
+                    int start_bit,
+                    int end_bit) const
     {
         using custom_test_t = test_utils::custom_test_type<InnerT>;
         hipcub::BlockRadixSort<custom_test_t, BlockSize, ItemsPerThread> block_sort;
@@ -179,33 +273,37 @@ struct SortOp
         if(start_bit == 0 && end_bit == sizeof(custom_test_t) * 8)
         {
             dispatch_t::sort(block_sort, keys, decomposer);
-        } else
+        }
+        else
         {
             dispatch_t::sort(block_sort, keys, decomposer, start_bit, end_bit);
         }
     }
 
     template<class Key, class Value>
-    __device__ void operator()(Key (&keys)[ItemsPerThread],
-                               Value (&values)[ItemsPerThread],
-                               int start_bit,
-                               int end_bit) const
+    __device__
+    void operator()(Key (&keys)[ItemsPerThread],
+                    Value (&values)[ItemsPerThread],
+                    int start_bit,
+                    int end_bit) const
     {
         hipcub::BlockRadixSort<Key, BlockSize, ItemsPerThread, Value> block_sort;
         if(start_bit == 0 && end_bit == sizeof(Key) * 8)
         {
             dispatch_t::sort(block_sort, keys, values);
-        } else
+        }
+        else
         {
             dispatch_t::sort(block_sort, keys, values, start_bit, end_bit);
         }
     }
 
     template<class InnerT, class Value>
-    __device__ void operator()(test_utils::custom_test_type<InnerT> (&keys)[ItemsPerThread],
-                               Value (&values)[ItemsPerThread],
-                               int start_bit,
-                               int end_bit) const
+    __device__
+    void operator()(test_utils::custom_test_type<InnerT> (&keys)[ItemsPerThread],
+                    Value (&values)[ItemsPerThread],
+                    int start_bit,
+                    int end_bit) const
     {
         using custom_test_t = test_utils::custom_test_type<InnerT>;
         hipcub::BlockRadixSort<custom_test_t, BlockSize, ItemsPerThread, Value> block_sort;
@@ -213,7 +311,8 @@ struct SortOp
         if(start_bit == 0 && end_bit == sizeof(custom_test_t) * 8)
         {
             dispatch_t::sort(block_sort, keys, values, decomposer);
-        } else
+        }
+        else
         {
             dispatch_t::sort(block_sort, keys, values, decomposer, start_bit, end_bit);
         }
@@ -229,17 +328,19 @@ struct StoreOp<BlockSize, ItemsPerThread, false>
     static constexpr unsigned int items_per_block = BlockSize * ItemsPerThread;
 
     template<class Key>
-    __device__ void operator()(Key (&keys)[ItemsPerThread], Key* keys_output) const
+    __device__
+    void operator()(Key (&keys)[ItemsPerThread], Key* keys_output) const
     {
         const unsigned int block_offset = blockIdx.x * items_per_block;
         hipcub::StoreDirectBlocked(threadIdx.x, keys_output + block_offset, keys);
     }
 
     template<class Key, class Value>
-    __device__ void operator()(Key (&keys)[ItemsPerThread],
-                               Value (&values)[ItemsPerThread],
-                               Key*   keys_output,
-                               Value* values_output) const
+    __device__
+    void operator()(Key (&keys)[ItemsPerThread],
+                    Value (&values)[ItemsPerThread],
+                    Key*   keys_output,
+                    Value* values_output) const
     {
         const unsigned int block_offset = blockIdx.x * items_per_block;
         hipcub::StoreDirectBlocked(threadIdx.x, keys_output + block_offset, keys);
@@ -253,17 +354,19 @@ struct StoreOp<BlockSize, ItemsPerThread, true>
     static constexpr unsigned int items_per_block = BlockSize * ItemsPerThread;
 
     template<class Key>
-    __device__ void operator()(Key (&keys)[ItemsPerThread], Key* keys_output) const
+    __device__
+    void operator()(Key (&keys)[ItemsPerThread], Key* keys_output) const
     {
         const unsigned int block_offset = blockIdx.x * items_per_block;
         hipcub::StoreDirectStriped<BlockSize>(threadIdx.x, keys_output + block_offset, keys);
     }
 
     template<class Key, class Value>
-    __device__ void operator()(Key (&keys)[ItemsPerThread],
-                               Value (&values)[ItemsPerThread],
-                               Key*   keys_output,
-                               Value* values_output) const
+    __device__
+    void operator()(Key (&keys)[ItemsPerThread],
+                    Value (&values)[ItemsPerThread],
+                    Key*   keys_output,
+                    Value* values_output) const
     {
         const unsigned int block_offset = blockIdx.x * items_per_block;
         hipcub::StoreDirectStriped<BlockSize>(threadIdx.x, keys_output + block_offset, keys);
@@ -276,9 +379,8 @@ template<unsigned int BlockSize,
          bool         Striped,
          bool         Descending,
          class key_type>
-__global__ __launch_bounds__(BlockSize) void sort_key_kernel(key_type*    device_keys_output,
-                                                             unsigned int start_bit,
-                                                             unsigned int end_bit)
+__global__ __launch_bounds__(BlockSize)
+void sort_key_kernel(key_type* device_keys_output, unsigned int start_bit, unsigned int end_bit)
 {
     constexpr unsigned int items_per_block = BlockSize * ItemsPerThread;
     const unsigned int     block_offset    = blockIdx.x * items_per_block;
@@ -296,8 +398,8 @@ void assert_eq(T a, U b, size_t index)
     // GTest's ASSERT_EQ prints the values if the test fails. On Windows, GTest doesn't currently provide overloads for
     // printing 128 bit types, resulting in linker errors.
     // Check if we're testing with 128 bit types. If so, test using bools so GTest doesn't try to print them on failure.
-    if (test_utils::is_int128<T>::value || test_utils::is_uint128<T>::value ||
-        test_utils::is_int128<U>::value || test_utils::is_uint128<U>::value)
+    if(test_utils::is_int128<T>::value || test_utils::is_uint128<T>::value
+       || test_utils::is_int128<U>::value || test_utils::is_uint128<U>::value)
     {
         const bool values_equal = (a == b);
         ASSERT_EQ(values_equal, true) << "at index: " << index;
@@ -314,26 +416,27 @@ TYPED_TEST(HipcubBlockRadixSort, SortKeys)
     SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
     HIP_CHECK(hipSetDevice(device_id));
 
-    using key_type = typename TestFixture::params::key_type;
-    constexpr size_t block_size = TestFixture::params::block_size;
-    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
-    constexpr bool descending = TestFixture::params::descending;
-    constexpr bool to_striped = TestFixture::params::to_striped;
-    constexpr unsigned int start_bit = TestFixture::params::start_bit;
-    constexpr unsigned int end_bit = TestFixture::params::end_bit;
-    constexpr size_t items_per_block = block_size * items_per_thread;
+    using key_type                          = typename TestFixture::params::key_type;
+    constexpr size_t       block_size       = TestFixture::params::block_size;
+    constexpr size_t       items_per_thread = TestFixture::params::items_per_thread;
+    constexpr bool         descending       = TestFixture::params::descending;
+    constexpr bool         to_striped       = TestFixture::params::to_striped;
+    constexpr unsigned int start_bit        = TestFixture::params::start_bit;
+    constexpr unsigned int end_bit          = TestFixture::params::end_bit;
+    constexpr size_t       items_per_block  = block_size * items_per_thread;
     // Given block size not supported
     if(block_size > test_utils::get_max_block_size())
     {
         return;
     }
 
-    const size_t size = items_per_block * 1134;
+    const size_t size      = items_per_block * 1134;
     const size_t grid_size = size / items_per_block;
 
-    for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
-        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
         SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
 
         // Generate data
@@ -363,40 +466,35 @@ TYPED_TEST(HipcubBlockRadixSort, SortKeys)
             std::stable_sort(
                 expected.begin() + (i * items_per_block),
                 expected.begin() + ((i + 1) * items_per_block),
-                test_utils::key_comparator<key_type, descending, start_bit, end_bit>()
-            );
+                test_utils::key_comparator<key_type, descending, start_bit, end_bit>());
         }
 
         // Preparing device
         key_type* device_keys_output;
-        HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output, keys_output.size() * sizeof(key_type)));
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output,
+                                                     keys_output.size() * sizeof(key_type)));
 
-        HIP_CHECK(
-            hipMemcpy(
-                device_keys_output, keys_output.data(),
-                keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
-                hipMemcpyHostToDevice
-            )
-        );
+        HIP_CHECK(hipMemcpy(device_keys_output,
+                            keys_output.data(),
+                            keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
+                            hipMemcpyHostToDevice));
 
         // Running kernel
         sort_key_kernel<block_size, items_per_thread, to_striped, descending>
             <<<dim3(grid_size), dim3(block_size), 0, 0>>>(device_keys_output, start_bit, end_bit);
 
         // Getting results to host
-        HIP_CHECK(
-            hipMemcpy(
-                keys_output.data(), device_keys_output,
-                keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
-                hipMemcpyDeviceToHost
-            )
-        );
+        HIP_CHECK(hipMemcpy(keys_output.data(),
+                            device_keys_output,
+                            keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
+                            hipMemcpyDeviceToHost));
 
         // Verifying results
         for(size_t i = 0; i < size; i++)
         {
             assert_eq(test_utils::convert_to_native(keys_output[i]),
-                      test_utils::convert_to_native(expected[i]), i);
+                      test_utils::convert_to_native(expected[i]),
+                      i);
         }
 
         HIP_CHECK(hipFree(device_keys_output));
@@ -409,16 +507,17 @@ template<unsigned int BlockSize,
          bool         Descending,
          class key_type,
          class value_type>
-__global__ __launch_bounds__(BlockSize) void sort_key_value_kernel(key_type*   device_keys_output,
-                                                                   value_type* device_values_output,
-                                                                   unsigned int start_bit,
-                                                                   unsigned int end_bit)
+__global__ __launch_bounds__(BlockSize)
+void sort_key_value_kernel(key_type*    device_keys_output,
+                           value_type*  device_values_output,
+                           unsigned int start_bit,
+                           unsigned int end_bit)
 {
     constexpr unsigned int items_per_block = BlockSize * ItemsPerThread;
     const unsigned int     lid             = threadIdx.x;
     const unsigned int     block_offset    = blockIdx.x * items_per_block;
 
-    key_type keys[ItemsPerThread];
+    key_type   keys[ItemsPerThread];
     value_type values[ItemsPerThread];
     hipcub::LoadDirectBlocked(lid, device_keys_output + block_offset, keys);
     hipcub::LoadDirectBlocked(lid, device_values_output + block_offset, values);
@@ -436,27 +535,28 @@ TYPED_TEST(HipcubBlockRadixSort, SortKeysValues)
     SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
     HIP_CHECK(hipSetDevice(device_id));
 
-    using key_type = typename TestFixture::params::key_type;
-    using value_type = typename TestFixture::params::value_type;
-    constexpr size_t block_size = TestFixture::params::block_size;
-    constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
-    constexpr bool descending = TestFixture::params::descending;
-    constexpr bool to_striped = TestFixture::params::to_striped;
-    constexpr unsigned int start_bit = TestFixture::params::start_bit;
-    constexpr unsigned int end_bit = TestFixture::params::end_bit;
-    constexpr size_t items_per_block = block_size * items_per_thread;
+    using key_type                          = typename TestFixture::params::key_type;
+    using value_type                        = typename TestFixture::params::value_type;
+    constexpr size_t       block_size       = TestFixture::params::block_size;
+    constexpr size_t       items_per_thread = TestFixture::params::items_per_thread;
+    constexpr bool         descending       = TestFixture::params::descending;
+    constexpr bool         to_striped       = TestFixture::params::to_striped;
+    constexpr unsigned int start_bit        = TestFixture::params::start_bit;
+    constexpr unsigned int end_bit          = TestFixture::params::end_bit;
+    constexpr size_t       items_per_block  = block_size * items_per_thread;
     // Given block size not supported
     if(block_size > test_utils::get_max_block_size())
     {
         return;
     }
 
-    const size_t size = items_per_block * 1134;
+    const size_t size      = items_per_block * 1134;
     const size_t grid_size = size / items_per_block;
 
-    for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
-        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
         SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
 
         // Generate data
@@ -490,12 +590,11 @@ TYPED_TEST(HipcubBlockRadixSort, SortKeysValues)
         }
         else
         {
-            values_output = test_utils::get_random_data<value_type>(
-                size,
-                std::numeric_limits<value_type>::min(),
-                std::numeric_limits<value_type>::max(),
-                seed_value + seed_value_addition
-            );
+            values_output
+                = test_utils::get_random_data<value_type>(size,
+                                                          std::numeric_limits<value_type>::min(),
+                                                          std::numeric_limits<value_type>::max(),
+                                                          seed_value + seed_value_addition);
         }
 
         using key_value = std::pair<key_type, value_type>;
@@ -512,30 +611,27 @@ TYPED_TEST(HipcubBlockRadixSort, SortKeysValues)
             std::stable_sort(
                 expected.begin() + (i * items_per_block),
                 expected.begin() + ((i + 1) * items_per_block),
-                test_utils::key_value_comparator<key_type, value_type, descending, start_bit, end_bit>()
-            );
+                test_utils::
+                    key_value_comparator<key_type, value_type, descending, start_bit, end_bit>());
         }
 
         key_type* device_keys_output;
-        HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output, keys_output.size() * sizeof(key_type)));
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_keys_output,
+                                                     keys_output.size() * sizeof(key_type)));
         value_type* device_values_output;
-        HIP_CHECK(test_common_utils::hipMallocHelper(&device_values_output, values_output.size() * sizeof(value_type)));
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_values_output,
+                                                     values_output.size() * sizeof(value_type)));
 
-        HIP_CHECK(
-            hipMemcpy(
-                device_keys_output, keys_output.data(),
-                keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
-                hipMemcpyHostToDevice
-            )
-        );
+        HIP_CHECK(hipMemcpy(device_keys_output,
+                            keys_output.data(),
+                            keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
+                            hipMemcpyHostToDevice));
 
         HIP_CHECK(
-            hipMemcpy(
-                device_values_output, values_output.data(),
-                values_output.size() * sizeof(typename decltype(values_output)::value_type),
-                hipMemcpyHostToDevice
-            )
-        );
+            hipMemcpy(device_values_output,
+                      values_output.data(),
+                      values_output.size() * sizeof(typename decltype(values_output)::value_type),
+                      hipMemcpyHostToDevice));
 
         // Running kernel
         sort_key_value_kernel<block_size, items_per_thread, to_striped, descending>
@@ -545,31 +641,28 @@ TYPED_TEST(HipcubBlockRadixSort, SortKeysValues)
                                                           end_bit);
 
         // Getting results to host
-        HIP_CHECK(
-            hipMemcpy(
-                keys_output.data(), device_keys_output,
-                keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
-                hipMemcpyDeviceToHost
-            )
-        );
+        HIP_CHECK(hipMemcpy(keys_output.data(),
+                            device_keys_output,
+                            keys_output.size() * sizeof(typename decltype(keys_output)::value_type),
+                            hipMemcpyDeviceToHost));
 
         HIP_CHECK(
-            hipMemcpy(
-                values_output.data(), device_values_output,
-                values_output.size() * sizeof(typename decltype(values_output)::value_type),
-                hipMemcpyDeviceToHost
-            )
-        );
+            hipMemcpy(values_output.data(),
+                      device_values_output,
+                      values_output.size() * sizeof(typename decltype(values_output)::value_type),
+                      hipMemcpyDeviceToHost));
 
         for(size_t i = 0; i < size; i++)
         {
-            assert_eq(test_utils::convert_to_native(keys_output[i]), 
-                      test_utils::convert_to_native(expected[i].first), i);
+            assert_eq(test_utils::convert_to_native(keys_output[i]),
+                      test_utils::convert_to_native(expected[i].first),
+                      i);
             assert_eq(test_utils::convert_to_native(values_output[i]),
-                      test_utils::convert_to_native(expected[i].second), i);
+                      test_utils::convert_to_native(expected[i].second),
+                      i);
         }
 
         HIP_CHECK(hipFree(device_keys_output));
         HIP_CHECK(hipFree(device_values_output));
     }
-}
+}
\ No newline at end of file
diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp
index 44ddf181538..71378b559e3 100644
--- a/projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp
+++ b/projects/hipcub/test/hipcub/test_hipcub_block_reduce.cpp
@@ -27,18 +27,17 @@
 #include "hipcub/thread/thread_operators.hpp"
 
 // Params for tests
-template<
-    class T,
-    unsigned int BlockSize = 256U,
-    unsigned int ItemsPerThread = 1U,
-    hipcub::BlockReduceAlgorithm Algorithm = hipcub::BlockReduceAlgorithm::BLOCK_REDUCE_WARP_REDUCTIONS
->
+template<class T,
+         unsigned int                 BlockSize      = 256U,
+         unsigned int                 ItemsPerThread = 1U,
+         hipcub::BlockReduceAlgorithm Algorithm
+         = hipcub::BlockReduceAlgorithm::BLOCK_REDUCE_WARP_REDUCTIONS>
 struct params
 {
-    using type = T;
-    static constexpr hipcub::BlockReduceAlgorithm algorithm = Algorithm;
-    static constexpr unsigned int block_size = BlockSize;
-    static constexpr unsigned int items_per_thread = ItemsPerThread;
+    using type                                                     = T;
+    static constexpr hipcub::BlockReduceAlgorithm algorithm        = Algorithm;
+    static constexpr unsigned int                 block_size       = BlockSize;
+    static constexpr unsigned int                 items_per_thread = ItemsPerThread;
 };
 
 // ---------------------------------------------------------
@@ -49,9 +48,9 @@ template<class Params>
 class HipcubBlockReduceSingleValueTests : public ::testing::Test
 {
 public:
-    using type = typename Params::type;
-    static constexpr hipcub::BlockReduceAlgorithm algorithm = Params::algorithm;
-    static constexpr unsigned int block_size = Params::block_size;
+    using type                                               = typename Params::type;
+    static constexpr hipcub::BlockReduceAlgorithm algorithm  = Params::algorithm;
+    static constexpr unsigned int                 block_size = Params::block_size;
 };
 
 using SingleValueTestParams = ::testing::Types<
@@ -122,17 +121,13 @@ using SingleValueTestParams = ::testing::Types<
 
 TYPED_TEST_SUITE(HipcubBlockReduceSingleValueTests, SingleValueTestParams);
 
-template<
-    unsigned int BlockSize,
-    hipcub::BlockReduceAlgorithm Algorithm,
-    class T
->
+template<unsigned int BlockSize, hipcub::BlockReduceAlgorithm Algorithm, class T>
 __global__
 __launch_bounds__(BlockSize)
 void reduce_kernel(T* device_output, T* device_output_reductions)
 {
-    const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
-    T value = device_output[index];
+    const unsigned int                         index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    T                                          value = device_output[index];
     using breduce_t = hipcub::BlockReduce<T, BlockSize, Algorithm>;
     __shared__ typename breduce_t::TempStorage temp_storage;
     value = breduce_t(temp_storage).Reduce(value, hipcub::Sum());
@@ -154,7 +149,7 @@ TYPED_TEST(HipcubBlockReduceSingleValueTests, Reduce)
     binary_op_type_host binary_op_host;
     using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
 
-    constexpr auto algorithm = TestFixture::algorithm;
+    constexpr auto   algorithm  = TestFixture::algorithm;
     constexpr size_t block_size = TestFixture::block_size;
 
     // Given block size not supported
@@ -163,12 +158,13 @@ TYPED_TEST(HipcubBlockReduceSingleValueTests, Reduce)
         return;
     }
 
-    const size_t size = block_size * 113;
+    const size_t size      = block_size * 113;
     const size_t grid_size = size / block_size;
 
-    for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
-        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
         SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
 
         // Generate data
@@ -193,31 +189,129 @@ TYPED_TEST(HipcubBlockReduceSingleValueTests, Reduce)
         T* device_output;
         HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(T)));
         T* device_output_reductions;
-        HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions, output_reductions.size() * sizeof(T)));
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions,
+                                                     output_reductions.size() * sizeof(T)));
 
-        HIP_CHECK(
-            hipMemcpy(
-                device_output, output.data(),
-                output.size() * sizeof(T),
-                hipMemcpyHostToDevice
-            )
-        );
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
 
         // Running kernel
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(reduce_kernel<block_size, algorithm, T>),
-            dim3(grid_size), dim3(block_size), 0, 0,
-            device_output, device_output_reductions
-        );
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<block_size, algorithm, T>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_output,
+                           device_output_reductions);
 
         // Reading results back
-        HIP_CHECK(
-            hipMemcpy(
-                output_reductions.data(), device_output_reductions,
-                output_reductions.size() * sizeof(T),
-                hipMemcpyDeviceToHost
-            )
-        );
+        HIP_CHECK(hipMemcpy(output_reductions.data(),
+                            device_output_reductions,
+                            output_reductions.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        // Verifying results
+        test_utils::assert_near(output_reductions,
+                                expected_reductions,
+                                test_utils::precision<T>::value * block_size);
+
+        HIP_CHECK(hipFree(device_output));
+        HIP_CHECK(hipFree(device_output_reductions));
+    }
+}
+
+template<unsigned int BlockSize, hipcub::BlockReduceAlgorithm Algorithm, class T>
+__global__
+__launch_bounds__(BlockSize)
+void sum_kernel(T* device_output, T* device_output_reductions)
+{
+    const unsigned int                         index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    T                                          value = device_output[index];
+    using breduce_t = hipcub::BlockReduce<T, BlockSize, Algorithm>;
+    __shared__ typename breduce_t::TempStorage temp_storage;
+    value = breduce_t(temp_storage).Sum(value);
+    if(hipThreadIdx_x == 0)
+    {
+        device_output_reductions[hipBlockIdx_x] = value;
+    }
+}
+
+TYPED_TEST(HipcubBlockReduceSingleValueTests, Sum)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using T = typename TestFixture::type;
+    // for bfloat16 and half we use double for host-side accumulation
+    using binary_op_type_host = typename test_utils::select_plus_operator_host<T>::type;
+    binary_op_type_host binary_op_host;
+    using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
+
+    constexpr auto   algorithm  = TestFixture::algorithm;
+    constexpr size_t block_size = TestFixture::block_size;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t size      = block_size * 113;
+    const size_t grid_size = size / block_size;
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data<T>(size, 2, 200, seed_value);
+        std::vector<T> output_reductions(size / block_size);
+
+        // Calculate expected results on host
+        std::vector<T> expected_reductions(output_reductions.size(),
+                                           test_utils::convert_to_device<T>(0));
+        for(size_t i = 0; i < output.size() / block_size; i++)
+        {
+            acc_type value(0);
+            for(size_t j = 0; j < block_size; j++)
+            {
+                auto idx = i * block_size + j;
+                value    = binary_op_host(value, output[idx]);
+            }
+            expected_reductions[i] = static_cast<T>(value);
+        }
+
+        // Preparing device
+        T* device_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(T)));
+        T* device_output_reductions;
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions,
+                                                     output_reductions.size() * sizeof(T)));
+
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        // Running kernel
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(sum_kernel<block_size, algorithm, T>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_output,
+                           device_output_reductions);
+
+        // Reading results back
+        HIP_CHECK(hipMemcpy(output_reductions.data(),
+                            device_output_reductions,
+                            output_reductions.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
 
         // Verifying results
         test_utils::assert_near(output_reductions,
@@ -231,17 +325,15 @@ TYPED_TEST(HipcubBlockReduceSingleValueTests, Reduce)
 
 TYPED_TEST_SUITE(HipcubBlockReduceSingleValueTests, SingleValueTestParams);
 
-template<
-    unsigned int BlockSize,
-    hipcub::BlockReduceAlgorithm Algorithm,
-    class T
->
+template<unsigned int BlockSize, hipcub::BlockReduceAlgorithm Algorithm, class T>
 __global__
 __launch_bounds__(BlockSize)
-void reduce_valid_kernel(T* device_output, T* device_output_reductions, const unsigned int valid_items)
+void reduce_valid_kernel(T*                 device_output,
+                         T*                 device_output_reductions,
+                         const unsigned int valid_items)
 {
-    const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
-    T value = device_output[index];
+    const unsigned int                         index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    T                                          value = device_output[index];
     using breduce_t = hipcub::BlockReduce<T, BlockSize, Algorithm>;
     __shared__ typename breduce_t::TempStorage temp_storage;
     value = breduce_t(temp_storage).Reduce(value, hipcub::Sum(), valid_items);
@@ -266,8 +358,8 @@ TYPED_TEST(HipcubBlockReduceSingleValueTests, ReduceValid)
     constexpr auto algorithm = TestFixture::algorithm;
 
     constexpr size_t block_size = TestFixture::block_size;
-    const size_t size = block_size * 113;
-    const size_t grid_size = size / block_size;
+    const size_t     size       = block_size * 113;
+    const size_t     grid_size  = size / block_size;
 
     // Given block size not supported
     if(block_size > test_utils::get_max_block_size())
@@ -275,24 +367,18 @@ TYPED_TEST(HipcubBlockReduceSingleValueTests, ReduceValid)
         return;
     }
 
-    for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
-        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
         SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
 
-        const unsigned int valid_items = test_utils::get_random_value(
-            block_size - 10,
-            block_size,
-            seed_value
-        );
+        const unsigned int valid_items
+            = test_utils::get_random_value(block_size - 10, block_size, seed_value);
 
         // Generate data
-        std::vector<T> output = test_utils::get_random_data<T>(
-            size,
-            2,
-            200,
-            seed_value + seed_value_addition
-        );
+        std::vector<T> output
+            = test_utils::get_random_data<T>(size, 2, 200, seed_value + seed_value_addition);
         std::vector<T> output_reductions(size / block_size);
 
         // Calculate expected results on host
@@ -313,31 +399,29 @@ TYPED_TEST(HipcubBlockReduceSingleValueTests, ReduceValid)
         T* device_output;
         HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(T)));
         T* device_output_reductions;
-        HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions, output_reductions.size() * sizeof(T)));
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions,
+                                                     output_reductions.size() * sizeof(T)));
 
-        HIP_CHECK(
-            hipMemcpy(
-                device_output, output.data(),
-                output.size() * sizeof(T),
-                hipMemcpyHostToDevice
-            )
-        );
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
 
         // Running kernel
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(reduce_valid_kernel<block_size, algorithm, T>),
-            dim3(grid_size), dim3(block_size), 0, 0,
-            device_output, device_output_reductions, valid_items
-        );
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_valid_kernel<block_size, algorithm, T>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_output,
+                           device_output_reductions,
+                           valid_items);
 
         // Reading results back
-        HIP_CHECK(
-            hipMemcpy(
-                output_reductions.data(), device_output_reductions,
-                output_reductions.size() * sizeof(T),
-                hipMemcpyDeviceToHost
-            )
-        );
+        HIP_CHECK(hipMemcpy(output_reductions.data(),
+                            device_output_reductions,
+                            output_reductions.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
 
         // Verifying results
         test_utils::assert_near(output_reductions,
@@ -349,15 +433,120 @@ TYPED_TEST(HipcubBlockReduceSingleValueTests, ReduceValid)
     }
 }
 
+template<unsigned int BlockSize, hipcub::BlockReduceAlgorithm Algorithm, class T>
+__global__
+__launch_bounds__(BlockSize)
+void sum_valid_kernel(T* device_output, T* device_output_reductions, const unsigned int valid_items)
+{
+    const unsigned int                         index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    T                                          value = device_output[index];
+    using breduce_t = hipcub::BlockReduce<T, BlockSize, Algorithm>;
+    __shared__ typename breduce_t::TempStorage temp_storage;
+    value = breduce_t(temp_storage).Sum(value, valid_items);
+    if(hipThreadIdx_x == 0)
+    {
+        device_output_reductions[hipBlockIdx_x] = value;
+    }
+}
+
+TYPED_TEST(HipcubBlockReduceSingleValueTests, SumValid)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using T = typename TestFixture::type;
+    // for bfloat16 and half we use double for host-side accumulation
+    using binary_op_type_host = typename test_utils::select_plus_operator_host<T>::type;
+    binary_op_type_host binary_op_host;
+    using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
+
+    constexpr auto algorithm = TestFixture::algorithm;
+
+    constexpr size_t block_size = TestFixture::block_size;
+    const size_t     size       = block_size * 113;
+    const size_t     grid_size  = size / block_size;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+        const unsigned int valid_items
+            = test_utils::get_random_value(block_size - 10, block_size, seed_value);
+
+        // Generate data
+        std::vector<T> output
+            = test_utils::get_random_data<T>(size, 2, 200, seed_value + seed_value_addition);
+        std::vector<T> output_reductions(size / block_size);
+
+        // Calculate expected results on host
+        std::vector<T> expected_reductions(output_reductions.size(),
+                                           test_utils::convert_to_device<T>(0));
+        for(size_t i = 0; i < output.size() / block_size; i++)
+        {
+            acc_type value(0);
+            for(size_t j = 0; j < valid_items; j++)
+            {
+                auto idx = i * block_size + j;
+                value    = binary_op_host(output[idx], value);
+            }
+            expected_reductions[i] = static_cast<T>(value);
+        }
+
+        // Preparing device
+        T* device_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(T)));
+        T* device_output_reductions;
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions,
+                                                     output_reductions.size() * sizeof(T)));
+
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        // Running kernel
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(sum_valid_kernel<block_size, algorithm, T>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_output,
+                           device_output_reductions,
+                           valid_items);
+
+        // Reading results back
+        HIP_CHECK(hipMemcpy(output_reductions.data(),
+                            device_output_reductions,
+                            output_reductions.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        // Verifying results
+        test_utils::assert_near(output_reductions,
+                                expected_reductions,
+                                test_utils::precision<T>::value * block_size);
+
+        HIP_CHECK(hipFree(device_output));
+        HIP_CHECK(hipFree(device_output_reductions));
+    }
+}
 
 template<class Params>
 class HipcubBlockReduceInputArrayTests : public ::testing::Test
 {
 public:
-    using type = typename Params::type;
-    static constexpr unsigned int block_size = Params::block_size;
-    static constexpr hipcub::BlockReduceAlgorithm algorithm = Params::algorithm;
-    static constexpr unsigned int items_per_thread = Params::items_per_thread;
+    using type                                                     = typename Params::type;
+    static constexpr unsigned int                 block_size       = Params::block_size;
+    static constexpr hipcub::BlockReduceAlgorithm algorithm        = Params::algorithm;
+    static constexpr unsigned int                 items_per_thread = Params::items_per_thread;
 };
 
 using InputArrayTestParams = ::testing::Types<
@@ -394,12 +583,10 @@ using InputArrayTestParams = ::testing::Types<
 
 TYPED_TEST_SUITE(HipcubBlockReduceInputArrayTests, InputArrayTestParams);
 
-template<
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    hipcub::BlockReduceAlgorithm Algorithm,
-    class T
->
+template<unsigned int                 BlockSize,
+         unsigned int                 ItemsPerThread,
+         hipcub::BlockReduceAlgorithm Algorithm,
+         class T>
 __global__
 __launch_bounds__(BlockSize)
 void reduce_array_kernel(T* device_output, T* device_output_reductions)
@@ -412,7 +599,7 @@ void reduce_array_kernel(T* device_output, T* device_output_reductions)
         in_out[j] = device_output[index + j];
     }
 
-    T reduction;
+    T                                          reduction;
     using breduce_t = hipcub::BlockReduce<T, BlockSize, Algorithm>;
     __shared__ typename breduce_t::TempStorage temp_storage;
     reduction = breduce_t(temp_storage).Reduce(in_out, hipcub::Sum());
@@ -423,7 +610,6 @@ void reduce_array_kernel(T* device_output, T* device_output_reductions)
     }
 }
 
-
 TYPED_TEST(HipcubBlockReduceInputArrayTests, Reduce)
 {
     int device_id = test_common_utils::obtain_device_from_ctest();
@@ -436,8 +622,8 @@ TYPED_TEST(HipcubBlockReduceInputArrayTests, Reduce)
     binary_op_type_host binary_op_host;
     using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
 
-    constexpr auto algorithm = TestFixture::algorithm;
-    constexpr size_t block_size = TestFixture::block_size;
+    constexpr auto   algorithm        = TestFixture::algorithm;
+    constexpr size_t block_size       = TestFixture::block_size;
     constexpr size_t items_per_thread = TestFixture::items_per_thread;
 
     // Given block size not supported
@@ -447,12 +633,13 @@ TYPED_TEST(HipcubBlockReduceInputArrayTests, Reduce)
     }
 
     const size_t items_per_block = block_size * items_per_thread;
-    const size_t size = items_per_block * 37;
-    const size_t grid_size = size / items_per_block;
+    const size_t size            = items_per_block * 37;
+    const size_t grid_size       = size / items_per_block;
 
-    for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
-        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
         SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
 
         // Generate data
@@ -483,39 +670,34 @@ TYPED_TEST(HipcubBlockReduceInputArrayTests, Reduce)
         T* device_output;
         HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(T)));
         T* device_output_reductions;
-        HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions, output_reductions.size() * sizeof(T)));
-
-        HIP_CHECK(
-            hipMemcpy(
-                device_output, output.data(),
-                output.size() * sizeof(T),
-                hipMemcpyHostToDevice
-            )
-        );
-
-        HIP_CHECK(
-            hipMemcpy(
-                device_output_reductions, output_reductions.data(),
-                output_reductions.size() * sizeof(T),
-                hipMemcpyHostToDevice
-            )
-        );
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions,
+                                                     output_reductions.size() * sizeof(T)));
+
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        HIP_CHECK(hipMemcpy(device_output_reductions,
+                            output_reductions.data(),
+                            output_reductions.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
 
         // Running kernel
         hipLaunchKernelGGL(
             HIP_KERNEL_NAME(reduce_array_kernel<block_size, items_per_thread, algorithm, T>),
-            dim3(grid_size), dim3(block_size), 0, 0,
-            device_output, device_output_reductions
-        );
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_output,
+            device_output_reductions);
 
         // Reading results back
-        HIP_CHECK(
-            hipMemcpy(
-                output_reductions.data(), device_output_reductions,
-                output_reductions.size() * sizeof(T),
-                hipMemcpyDeviceToHost
-            )
-        );
+        HIP_CHECK(hipMemcpy(output_reductions.data(),
+                            device_output_reductions,
+                            output_reductions.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
 
         // Verifying results
         test_utils::assert_near(output_reductions,
@@ -526,3 +708,129 @@ TYPED_TEST(HipcubBlockReduceInputArrayTests, Reduce)
         HIP_CHECK(hipFree(device_output_reductions));
     }
 }
+
+template<unsigned int                 BlockSize,
+         unsigned int                 ItemsPerThread,
+         hipcub::BlockReduceAlgorithm Algorithm,
+         class T>
+__global__
+__launch_bounds__(BlockSize)
+void sum_array_kernel(T* device_output, T* device_output_reductions)
+{
+    const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
+    // load
+    T in_out[ItemsPerThread];
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        in_out[j] = device_output[index + j];
+    }
+
+    T                                          reduction;
+    using breduce_t = hipcub::BlockReduce<T, BlockSize, Algorithm>;
+    __shared__ typename breduce_t::TempStorage temp_storage;
+    reduction = breduce_t(temp_storage).Sum(in_out);
+
+    if(hipThreadIdx_x == 0)
+    {
+        device_output_reductions[hipBlockIdx_x] = reduction;
+    }
+}
+
+TYPED_TEST(HipcubBlockReduceInputArrayTests, Sum)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using T = typename TestFixture::type;
+    // for bfloat16 and half we use double for host-side accumulation
+    using binary_op_type_host = typename test_utils::select_plus_operator_host<T>::type;
+    binary_op_type_host binary_op_host;
+    using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
+
+    constexpr auto   algorithm        = TestFixture::algorithm;
+    constexpr size_t block_size       = TestFixture::block_size;
+    constexpr size_t items_per_thread = TestFixture::items_per_thread;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size            = items_per_block * 37;
+    const size_t grid_size       = size / items_per_block;
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+        // Generate data
+        std::vector<T> output
+            = test_utils::get_random_data<T>(size,
+                                             test_utils::convert_to_device<T>(2),
+                                             test_utils::convert_to_device<T>(200),
+                                             seed_value);
+
+        // Output reduce results
+        std::vector<T> output_reductions(size / block_size, test_utils::convert_to_device<T>(0));
+
+        // Calculate expected results on host
+        std::vector<T> expected_reductions(output_reductions.size(),
+                                           test_utils::convert_to_device<T>(0));
+        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        {
+            acc_type value(0);
+            for(size_t j = 0; j < items_per_block; j++)
+            {
+                auto idx = i * items_per_block + j;
+                value    = binary_op_host(static_cast<acc_type>(output[idx]), value);
+            }
+            expected_reductions[i] = static_cast<T>(value);
+        }
+
+        // Preparing device
+        T* device_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(T)));
+        T* device_output_reductions;
+        HIP_CHECK(test_common_utils::hipMallocHelper(&device_output_reductions,
+                                                     output_reductions.size() * sizeof(T)));
+
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        HIP_CHECK(hipMemcpy(device_output_reductions,
+                            output_reductions.data(),
+                            output_reductions.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        // Running kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(sum_array_kernel<block_size, items_per_thread, algorithm, T>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_output,
+            device_output_reductions);
+
+        // Reading results back
+        HIP_CHECK(hipMemcpy(output_reductions.data(),
+                            device_output_reductions,
+                            output_reductions.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        // Verifying results
+        test_utils::assert_near(output_reductions,
+                                expected_reductions,
+                                test_utils::precision<T>::value * items_per_block);
+
+        HIP_CHECK(hipFree(device_output));
+        HIP_CHECK(hipFree(device_output_reductions));
+    }
+}
\ No newline at end of file
diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp
index 58e264a8a00..e93ae5f7a20 100644
--- a/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp
+++ b/projects/hipcub/test/hipcub/test_hipcub_block_run_length_decode.cpp
@@ -25,19 +25,17 @@
 #include "hipcub/block/block_run_length_decode.hpp"
 #include "hipcub/block/block_store.hpp"
 
-template<
-    class ItemT,
-    class LengthT,
-    unsigned BlockSize,
-    unsigned RunsPerThread,
-    unsigned DecodedItemsPerThread
->
+template<class ItemT,
+         class LengthT,
+         unsigned BlockSize,
+         unsigned RunsPerThread,
+         unsigned DecodedItemsPerThread>
 struct Params
 {
-    using item_type = ItemT;
-    using length_type = LengthT;
-    static constexpr unsigned block_size = BlockSize;
-    static constexpr unsigned runs_per_thread = RunsPerThread;
+    using item_type                                    = ItemT;
+    using length_type                                  = LengthT;
+    static constexpr unsigned block_size               = BlockSize;
+    static constexpr unsigned runs_per_thread          = RunsPerThread;
     static constexpr unsigned decoded_items_per_thread = DecodedItemsPerThread;
 };
 
@@ -79,46 +77,37 @@ using HipcubBlockRunLengthDecodeTestParams
 
 TYPED_TEST_SUITE(HipcubBlockRunLengthDecodeTest, HipcubBlockRunLengthDecodeTestParams);
 
-template<
-    class ItemT,
-    class LengthT,
-    unsigned BlockSize,
-    unsigned RunsPerThread,
-    unsigned DecodedItemsPerThread
->
+template<class ItemT,
+         class LengthT,
+         unsigned BlockSize,
+         unsigned RunsPerThread,
+         unsigned DecodedItemsPerThread>
 __global__
 __launch_bounds__(BlockSize)
-void block_run_length_decode_kernel(
-    const ItemT * d_run_items,
-    const LengthT * d_run_lengths,
-    ItemT * d_decoded_items)
+void block_run_length_decode_kernel(const ItemT*   d_run_items,
+                                    const LengthT* d_run_lengths,
+                                    ItemT*         d_decoded_items)
 {
-    using BlockRunLengthDecodeT = hipcub::BlockRunLengthDecode<
-        ItemT,
-        BlockSize,
-        RunsPerThread,
-        DecodedItemsPerThread
-    >;
+    using BlockRunLengthDecodeT
+        = hipcub::BlockRunLengthDecode<ItemT, BlockSize, RunsPerThread, DecodedItemsPerThread>;
     static constexpr unsigned int decoded_items_per_block = BlockSize * DecodedItemsPerThread;
     __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage;
 
-    ItemT run_items[RunsPerThread];
+    ItemT   run_items[RunsPerThread];
     LengthT run_lengths[RunsPerThread];
 
     const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x;
     hipcub::LoadDirectBlocked(global_thread_idx, d_run_items, run_items);
     hipcub::LoadDirectBlocked(global_thread_idx, d_run_lengths, run_lengths);
 
-    unsigned total_decoded_size{};
-    BlockRunLengthDecodeT block_run_length_decode(
-        temp_storage,
-        run_items,
-        run_lengths,
-        total_decoded_size
-    );
+    unsigned              total_decoded_size{};
+    BlockRunLengthDecodeT block_run_length_decode(temp_storage,
+                                                  run_items,
+                                                  run_lengths,
+                                                  total_decoded_size);
 
     unsigned decoded_window_offset = 0;
-    while (decoded_window_offset < total_decoded_size)
+    while(decoded_window_offset < total_decoded_size)
     {
         ItemT decoded_items[DecodedItemsPerThread];
 
@@ -139,13 +128,13 @@ TYPED_TEST(HipcubBlockRunLengthDecodeTest, TestDecode)
     SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
     HIP_CHECK(hipSetDevice(device_id));
 
-    using ItemT = typename TestFixture::params::item_type;
-    using LengthT = typename TestFixture::params::length_type;
-    constexpr unsigned block_size = TestFixture::params::block_size;
-    constexpr unsigned runs_per_thread = TestFixture::params::runs_per_thread;
+    using ItemT                                 = typename TestFixture::params::item_type;
+    using LengthT                               = typename TestFixture::params::length_type;
+    constexpr unsigned block_size               = TestFixture::params::block_size;
+    constexpr unsigned runs_per_thread          = TestFixture::params::runs_per_thread;
     constexpr unsigned decoded_items_per_thread = TestFixture::params::decoded_items_per_thread;
 
-    for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
         const unsigned int seed_value
             = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
@@ -154,31 +143,26 @@ TYPED_TEST(HipcubBlockRunLengthDecodeTest, TestDecode)
         const LengthT max_run_length = static_cast<LengthT>(
             std::min(1000ll, static_cast<long long>(std::numeric_limits<LengthT>::max())));
 
-        size_t num_runs = runs_per_thread * block_size;
-        auto run_items = test_utils::get_random_data<ItemT>(
-            num_runs,
-            std::numeric_limits<ItemT>::min(),
-            std::numeric_limits<ItemT>::max(),
-            seed_value
-        );
-        auto run_lengths = test_utils::get_random_data<LengthT>(
-            num_runs,
-            static_cast<LengthT>(1),
-            max_run_length,
-            seed_value
-        );
-
-        std::default_random_engine prng(seed_value);
+        size_t num_runs    = runs_per_thread * block_size;
+        auto   run_items   = test_utils::get_random_data<ItemT>(num_runs,
+                                                            std::numeric_limits<ItemT>::min(),
+                                                            std::numeric_limits<ItemT>::max(),
+                                                            seed_value);
+        auto   run_lengths = test_utils::get_random_data<LengthT>(num_runs,
+                                                                static_cast<LengthT>(1),
+                                                                max_run_length,
+                                                                seed_value);
+
+        std::default_random_engine            prng(seed_value);
         std::uniform_int_distribution<size_t> num_empty_runs_dist(1, 4);
-        const size_t num_trailing_empty_runs = num_empty_runs_dist(prng);
+        const size_t                          num_trailing_empty_runs = num_empty_runs_dist(prng);
         num_runs += num_trailing_empty_runs;
 
-        const auto empty_run_items = test_utils::get_random_data<ItemT>(
-            num_trailing_empty_runs,
-            std::numeric_limits<ItemT>::min(),
-            std::numeric_limits<ItemT>::max(),
-            seed_value
-        );
+        const auto empty_run_items
+            = test_utils::get_random_data<ItemT>(num_trailing_empty_runs,
+                                                 std::numeric_limits<ItemT>::min(),
+                                                 std::numeric_limits<ItemT>::max(),
+                                                 seed_value);
         // Not strictly required, but fixes a spurious GCC warning and good practice anyways
         run_items.reserve(run_items.size() + empty_run_items.size());
         run_items.insert(run_items.end(), empty_run_items.begin(), empty_run_items.end());
@@ -186,64 +170,56 @@ TYPED_TEST(HipcubBlockRunLengthDecodeTest, TestDecode)
         run_lengths.insert(run_lengths.end(), num_trailing_empty_runs, static_cast<LengthT>(0));
 
         std::vector<ItemT> expected;
-        for (size_t i = 0; i < run_items.size(); ++i)
+        for(size_t i = 0; i < run_items.size(); ++i)
         {
-            for (size_t j = 0; j < static_cast<size_t>(run_lengths[i]); ++j)
+            for(size_t j = 0; j < static_cast<size_t>(run_lengths[i]); ++j)
             {
                 expected.push_back(run_items[i]);
             }
         }
 
-        ItemT * d_run_items{};
-        HIP_CHECK(test_common_utils::hipMallocHelper(&d_run_items, run_items.size() * sizeof(ItemT)));
+        ItemT* d_run_items{};
         HIP_CHECK(
-            hipMemcpy(
-                d_run_items,
-                run_items.data(),
-                run_items.size() * sizeof(ItemT),
-                hipMemcpyHostToDevice
-            )
-        );
-
-        LengthT * d_run_lengths{};
-        HIP_CHECK(test_common_utils::hipMallocHelper(&d_run_lengths, run_lengths.size() * sizeof(LengthT)));
+            test_common_utils::hipMallocHelper(&d_run_items, run_items.size() * sizeof(ItemT)));
+        HIP_CHECK(hipMemcpy(d_run_items,
+                            run_items.data(),
+                            run_items.size() * sizeof(ItemT),
+                            hipMemcpyHostToDevice));
+
+        LengthT* d_run_lengths{};
+        HIP_CHECK(test_common_utils::hipMallocHelper(&d_run_lengths,
+                                                     run_lengths.size() * sizeof(LengthT)));
+        HIP_CHECK(hipMemcpy(d_run_lengths,
+                            run_lengths.data(),
+                            run_lengths.size() * sizeof(LengthT),
+                            hipMemcpyHostToDevice));
+
+        ItemT* d_decoded_runs{};
         HIP_CHECK(
-            hipMemcpy(
-                d_run_lengths,
-                run_lengths.data(),
-                run_lengths.size() * sizeof(LengthT),
-                hipMemcpyHostToDevice
-            )
-        );
-
-        ItemT * d_decoded_runs{};
-        HIP_CHECK(test_common_utils::hipMallocHelper(&d_decoded_runs, expected.size() * sizeof(ItemT)));
+            test_common_utils::hipMallocHelper(&d_decoded_runs, expected.size() * sizeof(ItemT)));
 
+        HIP_CHECK(hipGetLastError());
         hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(
-                block_run_length_decode_kernel<
-                    ItemT,
-                    LengthT,
-                    block_size,
-                    runs_per_thread,
-                    decoded_items_per_thread
-                >
-            ),
-            dim3(1), dim3(block_size), 0, 0,
-            d_run_items, d_run_lengths, d_decoded_runs
-        );
-        HIP_CHECK(hipPeekAtLastError());
+            HIP_KERNEL_NAME(block_run_length_decode_kernel<ItemT,
+                                                           LengthT,
+                                                           block_size,
+                                                           runs_per_thread,
+                                                           decoded_items_per_thread>),
+            dim3(1),
+            dim3(block_size),
+            0,
+            0,
+            d_run_items,
+            d_run_lengths,
+            d_decoded_runs);
+        HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
 
         std::vector<ItemT> output(expected.size());
-        HIP_CHECK(
-            hipMemcpy(
-                output.data(),
-                d_decoded_runs,
-                output.size() * sizeof(ItemT),
-                hipMemcpyDeviceToHost
-            )
-        );
+        HIP_CHECK(hipMemcpy(output.data(),
+                            d_decoded_runs,
+                            output.size() * sizeof(ItemT),
+                            hipMemcpyDeviceToHost));
 
         HIP_CHECK(hipFree(d_run_items));
         HIP_CHECK(hipFree(d_run_lengths));
diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp
index a4b9f0a4d29..6892222fcca 100644
--- a/projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp
+++ b/projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp
@@ -89,7 +89,7 @@ TYPED_TEST_SUITE(HipcubBlockScanSingleValueTests, SingleValueTestParams);
 
 template<unsigned int BlockSize, hipcub::BlockScanAlgorithm Algorithm, class T>
 __global__ __launch_bounds__(BlockSize)
-void block_inclusive_scan_kernel(T* device_output)
+void inclusive_scan_kernel(T* device_output)
 {
     const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
     T                  value = device_output[index];
@@ -159,7 +159,7 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScan)
                             hipMemcpyHostToDevice));
 
         // Launching kernel
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(block_inclusive_scan_kernel<block_size, algorithm, T>),
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_kernel<block_size, algorithm, T>),
                            dim3(grid_size),
                            dim3(block_size),
                            0,
@@ -187,7 +187,7 @@ template<unsigned int               BlockSize,
          hipcub::BlockScanAlgorithm Algorithm,
          class T>
 __global__ __launch_bounds__(BlockSize)
-void block_inclusive_scan_initial_value_kernel(T* device_output, T initial_value)
+void inclusive_scan_initial_value_kernel(T* device_output, T initial_value)
 {
     const unsigned int index
         = (hipBlockIdx_x * BlockSize * ItemsPerThread) + hipThreadIdx_x * ItemsPerThread;
@@ -270,7 +270,7 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanInitialValue)
 
         // Launching kernel
         hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(block_inclusive_scan_initial_value_kernel<block_size, 1, algorithm, T>),
+            HIP_KERNEL_NAME(inclusive_scan_initial_value_kernel<block_size, 1, algorithm, T>),
             dim3(grid_size),
             dim3(block_size),
             0,
@@ -295,12 +295,13 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanInitialValue)
 }
 
 template<unsigned int BlockSize, hipcub::BlockScanAlgorithm Algorithm, class T>
-__global__ __launch_bounds__(BlockSize)
-void block_inclusive_scan_reduce_kernel(T* device_output, T* device_output_reductions)
+__global__
+    __launch_bounds__(BlockSize)
+void inclusive_scan_reduce_kernel(T* device_output, T* device_output_reductions)
 {
-    const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
-    T                  value = device_output[index];
-    T                  reduction;
+    const unsigned int                       index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    T                                        value = device_output[index];
+    T                                        reduction;
     using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
     __shared__ typename bscan_t::TempStorage temp_storage;
     bscan_t(temp_storage).InclusiveScan(value, value, hipcub::Sum(), reduction);
@@ -378,14 +379,13 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanReduce)
         HIP_CHECK(hipMemset(device_output_reductions, T(0), output_reductions.size() * sizeof(T)));
 
         // Launching kernel
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(block_inclusive_scan_reduce_kernel<block_size, algorithm, T>),
-            dim3(grid_size),
-            dim3(block_size),
-            0,
-            0,
-            device_output,
-            device_output_reductions);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_reduce_kernel<block_size, algorithm, T>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_output,
+                           device_output_reductions);
 
         HIP_CHECK(hipPeekAtLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -412,14 +412,18 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanReduce)
     }
 }
 
+// CUB fails to compute the block aggregate correctly when using the API for initial value support.
+// TODO fix this unit test
+#if 0
 template<unsigned int               BlockSize,
          unsigned int               ItemsPerThread,
          hipcub::BlockScanAlgorithm Algorithm,
          class T>
-__global__ __launch_bounds__(BlockSize)
-void block_inclusive_scan_reduce_initial_value_kernel(T* device_output,
-                                                      T* device_output_reductions,
-                                                      T  initial_value)
+__global__
+    __launch_bounds__(BlockSize)
+void inclusive_scan_reduce_initial_value_kernel(T* device_output,
+                                                T* device_output_reductions,
+                                                T  initial_value)
 {
     const unsigned int index
         = (hipBlockIdx_x * BlockSize * ItemsPerThread) + hipThreadIdx_x * ItemsPerThread;
@@ -446,6 +450,7 @@ void block_inclusive_scan_reduce_initial_value_kernel(T* device_output,
     }
 }
 
+// #ifndef __HIP_PLATFORM_NVIDIA__
 TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanReduceInitialValue)
 {
     int device_id = test_common_utils::obtain_device_from_ctest();
@@ -483,23 +488,20 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanReduceInitialValue)
         SCOPED_TRACE(testing::Message() << "with initial_value = " << initial_value);
 
         // Calculate expected results on host
-        std::vector<T> expected(output.size(), T(0));
-        std::vector<T> expected_reductions(output_reductions.size(), T(0));
+        std::vector<T> expected(output.size(), 0);
+        std::vector<T> expected_reductions(output_reductions.size(), 0);
         for(size_t i = 0; i < output.size() / block_size; i++)
         {
-            acc_type accumulator(initial_value);
-            acc_type reduction = output[i * block_size];
+
+            acc_type accumulator = static_cast<acc_type>(initial_value);
             for(size_t j = 0; j < block_size; j++)
+
             {
-                size_t idx    = i * block_size + j;
-                accumulator   = binary_op_host(output[idx], accumulator);
+                auto idx      = i * block_size + j;
+                accumulator   = binary_op_host(accumulator, static_cast<acc_type>(output[idx]));
                 expected[idx] = static_cast<T>(accumulator);
-                if(j > 0)
-                {
-                    reduction = binary_op_host(output[idx], reduction);
-                }
             }
-            expected_reductions[i] = reduction;
+            expected_reductions[i] = expected[(i + 1) * block_size - 1];
         }
 
         // Writing to device memory
@@ -522,7 +524,7 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanReduceInitialValue)
         // Launching kernel
         hipLaunchKernelGGL(
             HIP_KERNEL_NAME(
-                block_inclusive_scan_reduce_initial_value_kernel<block_size, 1, algorithm, T>),
+                inclusive_scan_reduce_initial_value_kernel<block_size, 1, algorithm, T>),
             dim3(grid_size),
             dim3(block_size),
             0,
@@ -556,11 +558,13 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanReduceInitialValue)
     }
 }
 
+// #endif //__HIP_PLATFORM_NVIDIA__
+
+#endif
+
 template<unsigned int BlockSize, hipcub::BlockScanAlgorithm Algorithm, class T>
 __global__ __launch_bounds__(BlockSize)
-void block_inclusive_scan_prefix_callback_kernel(T* device_output,
-                                                 T* device_output_bp,
-                                                 T  block_prefix)
+void inclusive_scan_prefix_callback_kernel(T* device_output, T* device_output_bp, T block_prefix)
 {
     const unsigned int index           = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
     T                  prefix_value    = block_prefix;
@@ -652,7 +656,7 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanPrefixCallback)
 
         // Launching kernel
         hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(block_inclusive_scan_prefix_callback_kernel<block_size, algorithm, T>),
+            HIP_KERNEL_NAME(inclusive_scan_prefix_callback_kernel<block_size, algorithm, T>),
             dim3(grid_size),
             dim3(block_size),
             0,
@@ -687,11 +691,12 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveScanPrefixCallback)
 }
 
 template<unsigned int BlockSize, hipcub::BlockScanAlgorithm Algorithm, class T>
-__global__ __launch_bounds__(BlockSize) void exclusive_scan_kernel(T* device_output, T init)
+__global__ __launch_bounds__(BlockSize)
+void exclusive_scan_kernel(T* device_output, T init)
 {
-    const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
-    T                  value = device_output[index];
-    using bscan_t            = hipcub::BlockScan<T, BlockSize, Algorithm>;
+    const unsigned int                       index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    T                                        value = device_output[index];
+    using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
     __shared__ typename bscan_t::TempStorage temp_storage;
     bscan_t(temp_storage).ExclusiveScan(value, value, init, hipcub::Sum());
     device_output[index] = value;
@@ -782,12 +787,12 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, ExclusiveScan)
 }
 
 template<unsigned int BlockSize, hipcub::BlockScanAlgorithm Algorithm, class T>
-__global__ __launch_bounds__(BlockSize) void exclusive_scan_reduce_kernel(
-    T* device_output, T* device_output_reductions, T init)
+__global__ __launch_bounds__(BlockSize)
+void exclusive_scan_reduce_kernel(T* device_output, T* device_output_reductions, T init)
 {
-    const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
-    T                  value = device_output[index];
-    T                  reduction;
+    const unsigned int                       index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    T                                        value = device_output[index];
+    T                                        reduction;
     using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
     __shared__ typename bscan_t::TempStorage temp_storage;
     bscan_t(temp_storage).ExclusiveScan(value, value, init, hipcub::Sum(), reduction);
@@ -911,8 +916,8 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, ExclusiveScanReduce)
 }
 
 template<unsigned int BlockSize, hipcub::BlockScanAlgorithm Algorithm, class T>
-__global__ __launch_bounds__(BlockSize) void exclusive_scan_prefix_callback_kernel(
-    T* device_output, T* device_output_bp, T block_prefix)
+__global__ __launch_bounds__(BlockSize)
+void exclusive_scan_prefix_callback_kernel(T* device_output, T* device_output_bp, T block_prefix)
 {
     const unsigned int index           = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
     T                  prefix_value    = block_prefix;
@@ -988,7 +993,7 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, ExclusiveScanPrefixCallback)
             acc_type accumulator_block_prefixes(block_prefix);
             for(size_t j = 0; j < block_size; j++)
             {
-                auto idx = i * block_size + j;
+                auto idx                   = i * block_size + j;
                 accumulator_block_prefixes = binary_op_host(static_cast<acc_type>(output[idx]),
                                                             accumulator_block_prefixes);
             }
@@ -1114,7 +1119,7 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, CustomStruct)
                             hipMemcpyHostToDevice));
 
         // Launching kernel
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(block_inclusive_scan_kernel<block_size, algorithm, T>),
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_kernel<block_size, algorithm, T>),
                            dim3(grid_size),
                            dim3(block_size),
                            0,
@@ -1137,80 +1142,21 @@ TYPED_TEST(HipcubBlockScanSingleValueTests, CustomStruct)
     }
 }
 
-// // ---------------------------------------------------------
-// // Test for scan ops taking array of values as input
-// // ---------------------------------------------------------
-
-template<class Params>
-class HipcubBlockScanInputArrayTests : public ::testing::Test
-{
-public:
-    using type                                                   = typename Params::type;
-    static constexpr unsigned int               block_size       = Params::block_size;
-    static constexpr hipcub::BlockScanAlgorithm algorithm        = Params::algorithm;
-    static constexpr unsigned int               items_per_thread = Params::items_per_thread;
-};
-
-using InputArrayTestParams = ::testing::Types<
-    // -----------------------------------------------------------------------
-    // hipcub::BlockScanAlgorithm::using_warp_scan
-    // -----------------------------------------------------------------------
-    params<float, 6U, 32>,
-    params<float, 32, 2>,
-    params<unsigned int, 256, 3>,
-    params<int, 512, 4>,
-    params<float, 37, 2>,
-    params<float, 65, 5>,
-    params<float, 162, 7>,
-    params<float, 255, 15>,
-    // half and bfloat require small block sizes due to the very limited accuracy
-    params<test_utils::half, 65, 5>,
-    params<test_utils::bfloat16, 16, 5>,
-    // -----------------------------------------------------------------------
-    // hipcub::BLOCK_SCAN_RAKING
-    // -----------------------------------------------------------------------
-    params<float, 6U, 32, hipcub::BLOCK_SCAN_RAKING>,
-    params<float, 32, 2, hipcub::BLOCK_SCAN_RAKING>,
-    params<int, 256, 3, hipcub::BLOCK_SCAN_RAKING>,
-    params<unsigned int, 512, 4, hipcub::BLOCK_SCAN_RAKING>,
-    params<float, 37, 2, hipcub::BLOCK_SCAN_RAKING>,
-    params<float, 65, 5, hipcub::BLOCK_SCAN_RAKING>,
-    params<float, 162, 7, hipcub::BLOCK_SCAN_RAKING>,
-    params<float, 255, 15, hipcub::BLOCK_SCAN_RAKING>,
-    // half and bfloat require small block sizes due to the very limited accuracy
-    params<test_utils::half, 65, 5, hipcub::BLOCK_SCAN_RAKING>,
-    params<test_utils::bfloat16, 16, 5, hipcub::BLOCK_SCAN_RAKING>>;
-
-TYPED_TEST_SUITE(HipcubBlockScanInputArrayTests, InputArrayTestParams);
-
-template<unsigned int               BlockSize,
-         unsigned int               ItemsPerThread,
-         hipcub::BlockScanAlgorithm Algorithm,
-         class T>
+template<unsigned int BlockSize, hipcub::BlockScanAlgorithm Algorithm, class T>
 __global__ __launch_bounds__(BlockSize)
-void block_inclusive_scan_array_kernel(T* device_output)
+void inclusive_sum_kernel(T* device_output)
 {
-    const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
-
-    // load
-    T in_out[ItemsPerThread];
-    for(unsigned int j = 0; j < ItemsPerThread; j++)
-    {
-        in_out[j] = device_output[index + j];
-    }
+    const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    T                  value = device_output[index];
 
     using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
     __shared__ typename bscan_t::TempStorage temp_storage;
-    bscan_t(temp_storage).InclusiveScan(in_out, in_out, hipcub::Sum());
+    bscan_t(temp_storage).InclusiveSum(value, value);
 
-    // store
-    for(unsigned int j = 0; j < ItemsPerThread; j++)
-    {
-        device_output[index + j] = in_out[j];
-    }
+    device_output[index] = value;
 }
 
-TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScan)
+TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveSum)
 {
     int device_id = test_common_utils::obtain_device_from_ctest();
     SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
@@ -1222,9 +1168,8 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScan)
     binary_op_type_host binary_op_host;
     using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
 
-    constexpr auto   algorithm        = TestFixture::algorithm;
-    constexpr size_t block_size       = TestFixture::block_size;
-    constexpr size_t items_per_thread = TestFixture::items_per_thread;
+    constexpr auto   algorithm  = TestFixture::algorithm;
+    constexpr size_t block_size = TestFixture::block_size;
 
     // Given block size not supported
     if(block_size > test_utils::get_max_block_size())
@@ -1232,9 +1177,8 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScan)
         return;
     }
 
-    const size_t items_per_block = block_size * items_per_thread;
-    const size_t size            = items_per_block * 37;
-    const size_t grid_size       = size / items_per_block;
+    const size_t size      = block_size * 113;
+    const size_t grid_size = size / block_size;
 
     for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
@@ -1246,13 +1190,13 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScan)
         std::vector<T> output = test_utils::get_random_data<T>(size, 2, 200, seed_value);
 
         // Calculate expected results on host
-        std::vector<T> expected(output.size(), test_utils::convert_to_device<T>(0));
-        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        std::vector<T> expected(output.size(), 0);
+        for(size_t i = 0; i < output.size() / block_size; i++)
         {
             acc_type accumulator(0);
-            for(size_t j = 0; j < items_per_block; j++)
+            for(size_t j = 0; j < block_size; j++)
             {
-                auto idx      = i * items_per_block + j;
+                auto idx      = i * block_size + j;
                 accumulator   = binary_op_host(static_cast<acc_type>(output[idx]), accumulator);
                 expected[idx] = static_cast<T>(accumulator);
             }
@@ -1270,14 +1214,12 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScan)
                             hipMemcpyHostToDevice));
 
         // Launching kernel
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(
-                block_inclusive_scan_array_kernel<block_size, items_per_thread, algorithm, T>),
-            dim3(grid_size),
-            dim3(block_size),
-            0,
-            0,
-            device_output);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_sum_kernel<block_size, algorithm, T>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_output);
 
         HIP_CHECK(hipPeekAtLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -1295,40 +1237,25 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScan)
     }
 }
 
-template<unsigned int               BlockSize,
-         unsigned int               ItemsPerThread,
-         hipcub::BlockScanAlgorithm Algorithm,
-         class T>
-__global__ __launch_bounds__(BlockSize)
-void block_inclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reductions)
+template<unsigned int BlockSize, hipcub::BlockScanAlgorithm Algorithm, class T>
+__global__
+    __launch_bounds__(BlockSize)
+void inclusive_sum_reduce_kernel(T* device_output, T* device_output_reductions)
 {
-    const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
-
-    // load
-    T in_out[ItemsPerThread];
-    for(unsigned int j = 0; j < ItemsPerThread; j++)
-    {
-        in_out[j] = device_output[index + j];
-    }
-
+    const unsigned int                       index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    T                                        value = device_output[index];
+    T                                        reduction;
     using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
     __shared__ typename bscan_t::TempStorage temp_storage;
-    T                                        reduction;
-    bscan_t(temp_storage).InclusiveScan(in_out, in_out, hipcub::Sum(), reduction);
-
-    // store
-    for(unsigned int j = 0; j < ItemsPerThread; j++)
-    {
-        device_output[index + j] = in_out[j];
-    }
-
+    bscan_t(temp_storage).InclusiveSum(value, value, reduction);
+    device_output[index] = value;
     if(hipThreadIdx_x == 0)
     {
         device_output_reductions[hipBlockIdx_x] = reduction;
     }
 }
 
-TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanReduce)
+TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveSumReduce)
 {
     int device_id = test_common_utils::obtain_device_from_ctest();
     SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
@@ -1340,9 +1267,8 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanReduce)
     binary_op_type_host binary_op_host;
     using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
 
-    constexpr auto   algorithm        = TestFixture::algorithm;
-    constexpr size_t block_size       = TestFixture::block_size;
-    constexpr size_t items_per_thread = TestFixture::items_per_thread;
+    constexpr auto   algorithm  = TestFixture::algorithm;
+    constexpr size_t block_size = TestFixture::block_size;
 
     // Given block size not supported
     if(block_size > test_utils::get_max_block_size())
@@ -1350,9 +1276,8 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanReduce)
         return;
     }
 
-    const size_t items_per_block = block_size * items_per_thread;
-    const size_t size            = items_per_block * 37;
-    const size_t grid_size       = size / items_per_block;
+    const size_t size      = block_size * 113;
+    const size_t grid_size = size / block_size;
 
     for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
@@ -1362,24 +1287,21 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanReduce)
 
         // Generate data
         std::vector<T> output = test_utils::get_random_data<T>(size, 2, 200, seed_value);
-
-        // Output reduce results
-        std::vector<T> output_reductions(size / block_size, test_utils::convert_to_device<T>(0));
+        std::vector<T> output_reductions(size / block_size, 0);
 
         // Calculate expected results on host
-        std::vector<T> expected(output.size(), test_utils::convert_to_device<T>(0));
-        std::vector<T> expected_reductions(output_reductions.size(),
-                                           test_utils::convert_to_device<T>(0));
-        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        std::vector<T> expected(output.size(), 0);
+        std::vector<T> expected_reductions(output_reductions.size(), 0);
+        for(size_t i = 0; i < output.size() / block_size; i++)
         {
             acc_type accumulator(0);
-            for(size_t j = 0; j < items_per_block; j++)
+            for(size_t j = 0; j < block_size; j++)
             {
-                auto idx      = i * items_per_block + j;
+                auto idx      = i * block_size + j;
                 accumulator   = binary_op_host(static_cast<acc_type>(output[idx]), accumulator);
                 expected[idx] = static_cast<T>(accumulator);
             }
-            expected_reductions[i] = expected[(i + 1) * items_per_block - 1];
+            expected_reductions[i] = expected[(i + 1) * block_size - 1];
         }
 
         // Writing to device memory
@@ -1397,22 +1319,16 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanReduce)
                             output.size() * sizeof(T),
                             hipMemcpyHostToDevice));
 
-        HIP_CHECK(hipMemset(device_output_reductions,
-                            test_utils::convert_to_device<T>(0),
-                            output_reductions.size() * sizeof(T)));
+        HIP_CHECK(hipMemset(device_output_reductions, T(0), output_reductions.size() * sizeof(T)));
 
         // Launching kernel
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(block_inclusive_scan_reduce_array_kernel<block_size,
-                                                                     items_per_thread,
-                                                                     algorithm,
-                                                                     T>),
-            dim3(grid_size),
-            dim3(block_size),
-            0,
-            0,
-            device_output,
-            device_output_reductions);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_sum_reduce_kernel<block_size, algorithm, T>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_output,
+                           device_output_reductions);
 
         HIP_CHECK(hipPeekAtLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -1430,7 +1346,6 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanReduce)
 
         // Validating results
         test_utils::assert_near(output, expected, test_utils::precision<T>::value * block_size);
-
         test_utils::assert_near(output_reductions,
                                 expected_reductions,
                                 test_utils::precision<T>::value * block_size);
@@ -1440,48 +1355,33 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanReduce)
     }
 }
 
-template<unsigned int               BlockSize,
-         unsigned int               ItemsPerThread,
-         hipcub::BlockScanAlgorithm Algorithm,
-         class T>
+template<unsigned int BlockSize, hipcub::BlockScanAlgorithm Algorithm, class T>
 __global__ __launch_bounds__(BlockSize)
-void block_inclusive_scan_array_prefix_callback_kernel(T* device_output,
-                                                       T* device_output_bp,
-                                                       T  block_prefix)
+void inclusive_sum_prefix_callback_kernel(T* device_output, T* device_output_bp, T block_prefix)
 {
-    const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
+    const unsigned int index           = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
     T                  prefix_value    = block_prefix;
     auto               prefix_callback = [&prefix_value](T reduction)
     {
-        T prefix     = prefix_value;
-        prefix_value = prefix_value + reduction;
+        T prefix = prefix_value;
+        prefix_value += reduction;
         return prefix;
     };
 
-    // load
-    T in_out[ItemsPerThread];
-    for(unsigned int j = 0; j < ItemsPerThread; j++)
-    {
-        in_out[j] = device_output[index + j];
-    }
+    T value = device_output[index];
 
     using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
     __shared__ typename bscan_t::TempStorage temp_storage;
-    bscan_t(temp_storage).InclusiveScan(in_out, in_out, hipcub::Sum(), prefix_callback);
-
-    // store
-    for(unsigned int j = 0; j < ItemsPerThread; j++)
-    {
-        device_output[index + j] = in_out[j];
-    }
+    bscan_t(temp_storage).InclusiveSum(value, value, prefix_callback);
 
+    device_output[index] = value;
     if(hipThreadIdx_x == 0)
     {
         device_output_bp[hipBlockIdx_x] = prefix_value;
     }
 }
 
-TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanPrefixCallback)
+TYPED_TEST(HipcubBlockScanSingleValueTests, InclusiveSumPrefixCallback)
 {
     int device_id = test_common_utils::obtain_device_from_ctest();
     SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
@@ -1493,9 +1393,8 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanPrefixCallback)
     binary_op_type_host binary_op_host;
     using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
 
-    constexpr auto   algorithm        = TestFixture::algorithm;
-    constexpr size_t block_size       = TestFixture::block_size;
-    constexpr size_t items_per_thread = TestFixture::items_per_thread;
+    constexpr auto   algorithm  = TestFixture::algorithm;
+    constexpr size_t block_size = TestFixture::block_size;
 
     // Given block size not supported
     if(block_size > test_utils::get_max_block_size())
@@ -1503,9 +1402,8 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanPrefixCallback)
         return;
     }
 
-    const size_t items_per_block = block_size * items_per_thread;
-    const size_t size            = items_per_block * 37;
-    const size_t grid_size       = size / items_per_block;
+    const size_t size      = block_size * 113;
+    const size_t grid_size = size / block_size;
 
     for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
@@ -1515,16 +1413,1670 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanPrefixCallback)
 
         // Generate data
         std::vector<T> output = test_utils::get_random_data<T>(size, 2, 200, seed_value);
-        std::vector<T> output_block_prefixes(size / items_per_block,
-                                             test_utils::convert_to_device<T>(0));
-        T block_prefix = test_utils::get_random_value<T>(test_utils::convert_to_device<T>(0),
-                                                         test_utils::convert_to_device<T>(100),
-                                                         seed_value + seed_value_addition);
+        std::vector<T> output_block_prefixes(size / block_size);
+        T block_prefix = test_utils::get_random_value<T>(0, 100, seed_value + seed_value_addition);
 
         // Calculate expected results on host
-        std::vector<T> expected(output.size(), test_utils::convert_to_device<T>(0));
-        std::vector<T> expected_block_prefixes(output_block_prefixes.size(),
-                                               test_utils::convert_to_device<T>(0));
+        std::vector<T> expected(output.size(), 0);
+        std::vector<T> expected_block_prefixes(output_block_prefixes.size(), 0);
+        for(size_t i = 0; i < output.size() / block_size; i++)
+        {
+            acc_type accumulator(block_prefix);
+            for(size_t j = 0; j < block_size; j++)
+            {
+                auto idx      = i * block_size + j;
+                accumulator   = binary_op_host(static_cast<acc_type>(output[idx]), accumulator);
+                expected[idx] = static_cast<T>(accumulator);
+            }
+            expected_block_prefixes[i] = expected[(i + 1) * block_size - 1];
+        }
+
+        // Writing to device memory
+        T* device_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output,
+            output.size() * sizeof(typename decltype(output)::value_type)));
+        T* device_output_bp;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output_bp,
+            output_block_prefixes.size()
+                * sizeof(typename decltype(output_block_prefixes)::value_type)));
+
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        // Launching kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(inclusive_sum_prefix_callback_kernel<block_size, algorithm, T>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_output,
+            device_output_bp,
+            block_prefix);
+
+        HIP_CHECK(hipPeekAtLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        HIP_CHECK(hipMemcpy(output.data(),
+                            device_output,
+                            output.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        HIP_CHECK(hipMemcpy(output_block_prefixes.data(),
+                            device_output_bp,
+                            output_block_prefixes.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        // Validating results
+        test_utils::assert_near(output, expected, test_utils::precision<T>::value * block_size);
+        test_utils::assert_near(output_block_prefixes,
+                                expected_block_prefixes,
+                                test_utils::precision<T>::value * block_size);
+
+        HIP_CHECK(hipFree(device_output));
+        HIP_CHECK(hipFree(device_output_bp));
+    }
+}
+
+template<unsigned int BlockSize, hipcub::BlockScanAlgorithm Algorithm, class T>
+__global__ __launch_bounds__(BlockSize)
+void exclusive_sum_kernel(T* device_output)
+{
+    const unsigned int                       index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    T                                        value = device_output[index];
+    using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::TempStorage temp_storage;
+    bscan_t(temp_storage).ExclusiveSum(value, value);
+    device_output[index] = value;
+}
+
+TYPED_TEST(HipcubBlockScanSingleValueTests, ExclusiveSum)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using T = typename TestFixture::type;
+    // for bfloat16 and half we use double for host-side accumulation
+    using binary_op_type_host = typename test_utils::select_plus_operator_host<T>::type;
+    binary_op_type_host binary_op_host;
+    using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
+
+    constexpr auto   algorithm  = TestFixture::algorithm;
+    constexpr size_t block_size = TestFixture::block_size;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t size      = block_size * 113;
+    const size_t grid_size = size / block_size;
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data<T>(size, 2, 241, seed_value);
+        const T        init   = 0;
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), 0);
+        for(size_t i = 0; i < output.size() / block_size; i++)
+        {
+            acc_type accumulator(init);
+            expected[i * block_size] = init;
+            for(size_t j = 1; j < block_size; j++)
+            {
+                auto idx      = i * block_size + j;
+                accumulator   = binary_op_host(static_cast<acc_type>(output[idx - 1]), accumulator);
+                expected[idx] = static_cast<T>(accumulator);
+            }
+        }
+
+        // Writing to device memory
+        T* device_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output,
+            output.size() * sizeof(typename decltype(output)::value_type)));
+
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        // Launching kernel
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_sum_kernel<block_size, algorithm, T>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_output);
+
+        HIP_CHECK(hipPeekAtLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        HIP_CHECK(hipMemcpy(output.data(),
+                            device_output,
+                            output.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        // Validating results
+        test_utils::assert_near(output, expected, test_utils::precision<T>::value * block_size);
+
+        HIP_CHECK(hipFree(device_output));
+    }
+}
+
+template<unsigned int BlockSize, hipcub::BlockScanAlgorithm Algorithm, class T>
+__global__ __launch_bounds__(BlockSize)
+void exclusive_sum_reduce_kernel(T* device_output, T* device_output_reductions)
+{
+    const unsigned int                       index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    T                                        value = device_output[index];
+    T                                        reduction;
+    using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::TempStorage temp_storage;
+    bscan_t(temp_storage).ExclusiveSum(value, value, reduction);
+    device_output[index] = value;
+    if(hipThreadIdx_x == 0)
+    {
+        device_output_reductions[hipBlockIdx_x] = reduction;
+    }
+}
+
+TYPED_TEST(HipcubBlockScanSingleValueTests, ExclusiveSumReduce)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using T = typename TestFixture::type;
+    // for bfloat16 and half we use double for host-side accumulation
+    using binary_op_type_host = typename test_utils::select_plus_operator_host<T>::type;
+    binary_op_type_host binary_op_host;
+    using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
+
+    constexpr auto   algorithm  = TestFixture::algorithm;
+    constexpr size_t block_size = TestFixture::block_size;
+
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t size      = block_size * 113;
+    const size_t grid_size = size / block_size;
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data<T>(size, 2, 200, seed_value);
+        const T        init   = 0;
+
+        // Output reduce results
+        std::vector<T> output_reductions(size / block_size, 0);
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), 0);
+        std::vector<T> expected_reductions(output_reductions.size(), 0);
+        for(size_t i = 0; i < output.size() / block_size; i++)
+        {
+            acc_type accumulator(init);
+            expected[i * block_size] = init;
+            for(size_t j = 1; j < block_size; j++)
+            {
+                auto idx      = i * block_size + j;
+                accumulator   = binary_op_host(static_cast<acc_type>(output[idx - 1]), accumulator);
+                expected[idx] = static_cast<T>(accumulator);
+            }
+
+            acc_type accumulator_reductions(0);
+            for(size_t j = 0; j < block_size; j++)
+            {
+                auto idx = i * block_size + j;
+                accumulator_reductions
+                    = binary_op_host(static_cast<acc_type>(output[idx]), accumulator_reductions);
+                expected_reductions[i] = static_cast<T>(accumulator_reductions);
+            }
+        }
+
+        // Writing to device memory
+        T* device_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output,
+            output.size() * sizeof(typename decltype(output)::value_type)));
+        T* device_output_reductions;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output_reductions,
+            output_reductions.size() * sizeof(typename decltype(output_reductions)::value_type)));
+
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        HIP_CHECK(hipMemset(device_output_reductions, T(0), output_reductions.size() * sizeof(T)));
+
+        // Launching kernel
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_sum_reduce_kernel<block_size, algorithm, T>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_output,
+                           device_output_reductions);
+
+        HIP_CHECK(hipPeekAtLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        HIP_CHECK(hipMemcpy(output.data(),
+                            device_output,
+                            output.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        HIP_CHECK(hipMemcpy(output_reductions.data(),
+                            device_output_reductions,
+                            output_reductions.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        // Validating results
+        test_utils::assert_near(output, expected, test_utils::precision<T>::value * block_size);
+        test_utils::assert_near(output_reductions,
+                                expected_reductions,
+                                test_utils::precision<T>::value * block_size);
+
+        HIP_CHECK(hipFree(device_output));
+        HIP_CHECK(hipFree(device_output_reductions));
+    }
+}
+
+template<unsigned int BlockSize, hipcub::BlockScanAlgorithm Algorithm, class T>
+__global__ __launch_bounds__(BlockSize)
+void exclusive_sum_prefix_callback_kernel(T* device_output, T* device_output_bp, T block_prefix)
+{
+    const unsigned int index           = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    T                  prefix_value    = block_prefix;
+    auto               prefix_callback = [&prefix_value](T reduction)
+    {
+        T prefix = prefix_value;
+        prefix_value += reduction;
+        return prefix;
+    };
+
+    T value = device_output[index];
+
+    using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::TempStorage temp_storage;
+    bscan_t(temp_storage).ExclusiveSum(value, value, prefix_callback);
+
+    device_output[index] = value;
+    if(hipThreadIdx_x == 0)
+    {
+        device_output_bp[hipBlockIdx_x] = prefix_value;
+    }
+}
+
+TYPED_TEST(HipcubBlockScanSingleValueTests, ExclusiveSumPrefixCallback)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using T = typename TestFixture::type;
+    // for bfloat16 and half we use double for host-side accumulation
+    using binary_op_type_host = typename test_utils::select_plus_operator_host<T>::type;
+    binary_op_type_host binary_op_host;
+    using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
+
+    constexpr auto   algorithm  = TestFixture::algorithm;
+    constexpr size_t block_size = TestFixture::block_size;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t size      = block_size * 113;
+    const size_t grid_size = size / block_size;
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data<T>(size, 2, 200, seed_value);
+        std::vector<T> output_block_prefixes(size / block_size);
+        T block_prefix = test_utils::get_random_value<T>(0, 100, seed_value + seed_value_addition);
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), 0);
+        std::vector<T> expected_block_prefixes(output_block_prefixes.size(), 0);
+        for(size_t i = 0; i < output.size() / block_size; i++)
+        {
+            acc_type accumulator(block_prefix);
+            expected[i * block_size] = block_prefix;
+            for(size_t j = 1; j < block_size; j++)
+            {
+                auto idx      = i * block_size + j;
+                accumulator   = binary_op_host(static_cast<acc_type>(output[idx - 1]), accumulator);
+                expected[idx] = static_cast<T>(accumulator);
+            }
+
+            acc_type accumulator_block_prefixes(block_prefix);
+            for(size_t j = 0; j < block_size; j++)
+            {
+                auto idx                   = i * block_size + j;
+                accumulator_block_prefixes = binary_op_host(static_cast<acc_type>(output[idx]),
+                                                            accumulator_block_prefixes);
+            }
+            expected_block_prefixes[i] = static_cast<T>(accumulator_block_prefixes);
+        }
+
+        // Writing to device memory
+        T* device_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output,
+            output.size() * sizeof(typename decltype(output)::value_type)));
+        T* device_output_bp;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output_bp,
+            output_block_prefixes.size()
+                * sizeof(typename decltype(output_block_prefixes)::value_type)));
+
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        // Launching kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(exclusive_sum_prefix_callback_kernel<block_size, algorithm, T>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_output,
+            device_output_bp,
+            block_prefix);
+
+        HIP_CHECK(hipPeekAtLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        HIP_CHECK(hipMemcpy(output.data(),
+                            device_output,
+                            output.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        HIP_CHECK(hipMemcpy(output_block_prefixes.data(),
+                            device_output_bp,
+                            output_block_prefixes.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        // Validating results
+        test_utils::assert_near(output, expected, test_utils::precision<T>::value * block_size);
+        test_utils::assert_near(output_block_prefixes,
+                                expected_block_prefixes,
+                                test_utils::precision<T>::value * block_size);
+
+        HIP_CHECK(hipFree(device_output));
+        HIP_CHECK(hipFree(device_output_bp));
+    }
+}
+
+// // ---------------------------------------------------------
+// // Test for scan ops taking array of values as input
+// // ---------------------------------------------------------
+
+template<class Params>
+class HipcubBlockScanInputArrayTests : public ::testing::Test
+{
+public:
+    using type                                                   = typename Params::type;
+    static constexpr unsigned int               block_size       = Params::block_size;
+    static constexpr hipcub::BlockScanAlgorithm algorithm        = Params::algorithm;
+    static constexpr unsigned int               items_per_thread = Params::items_per_thread;
+};
+
+using InputArrayTestParams = ::testing::Types<
+    // -----------------------------------------------------------------------
+    // hipcub::BlockScanAlgorithm::using_warp_scan
+    // -----------------------------------------------------------------------
+    params<float, 6U, 32>,
+    params<float, 32, 2>,
+    params<unsigned int, 256, 3>,
+    params<int, 512, 4>,
+    params<float, 37, 2>,
+    params<float, 65, 5>,
+    params<float, 162, 7>,
+    params<float, 255, 15>,
+    // half and bfloat require small block sizes due to the very limited accuracy
+    params<test_utils::half, 65, 5>,
+    params<test_utils::bfloat16, 16, 5>,
+    // -----------------------------------------------------------------------
+    // hipcub::BLOCK_SCAN_RAKING
+    // -----------------------------------------------------------------------
+    params<float, 6U, 32, hipcub::BLOCK_SCAN_RAKING>,
+    params<float, 32, 2, hipcub::BLOCK_SCAN_RAKING>,
+    params<int, 256, 3, hipcub::BLOCK_SCAN_RAKING>,
+    params<unsigned int, 512, 4, hipcub::BLOCK_SCAN_RAKING>,
+    params<float, 37, 2, hipcub::BLOCK_SCAN_RAKING>,
+    params<float, 65, 5, hipcub::BLOCK_SCAN_RAKING>,
+    params<float, 162, 7, hipcub::BLOCK_SCAN_RAKING>,
+    params<float, 255, 15, hipcub::BLOCK_SCAN_RAKING>,
+    // half and bfloat require small block sizes due to the very limited accuracy
+    params<test_utils::half, 65, 5, hipcub::BLOCK_SCAN_RAKING>,
+    params<test_utils::bfloat16, 16, 5, hipcub::BLOCK_SCAN_RAKING>>;
+
+TYPED_TEST_SUITE(HipcubBlockScanInputArrayTests, InputArrayTestParams);
+
+template<unsigned int               BlockSize,
+         unsigned int               ItemsPerThread,
+         hipcub::BlockScanAlgorithm Algorithm,
+         class T>
+__global__ __launch_bounds__(BlockSize)
+void inclusive_scan_array_kernel(T* device_output)
+{
+    const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
+
+    // load
+    T in_out[ItemsPerThread];
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        in_out[j] = device_output[index + j];
+    }
+
+    using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::TempStorage temp_storage;
+    bscan_t(temp_storage).InclusiveScan(in_out, in_out, hipcub::Sum());
+
+    // store
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        device_output[index + j] = in_out[j];
+    }
+}
+
+TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScan)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using T = typename TestFixture::type;
+    // for bfloat16 and half we use double for host-side accumulation
+    using binary_op_type_host = typename test_utils::select_plus_operator_host<T>::type;
+    binary_op_type_host binary_op_host;
+    using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
+
+    constexpr auto   algorithm        = TestFixture::algorithm;
+    constexpr size_t block_size       = TestFixture::block_size;
+    constexpr size_t items_per_thread = TestFixture::items_per_thread;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size            = items_per_block * 37;
+    const size_t grid_size       = size / items_per_block;
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data<T>(size, 2, 200, seed_value);
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), test_utils::convert_to_device<T>(0));
+        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        {
+            acc_type accumulator(0);
+            for(size_t j = 0; j < items_per_block; j++)
+            {
+                auto idx      = i * items_per_block + j;
+                accumulator   = binary_op_host(static_cast<acc_type>(output[idx]), accumulator);
+                expected[idx] = static_cast<T>(accumulator);
+            }
+        }
+
+        // Writing to device memory
+        T* device_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output,
+            output.size() * sizeof(typename decltype(output)::value_type)));
+
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        // Launching kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                inclusive_scan_array_kernel<block_size, items_per_thread, algorithm, T>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_output);
+
+        HIP_CHECK(hipPeekAtLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        HIP_CHECK(hipMemcpy(output.data(),
+                            device_output,
+                            output.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        // Validating results
+        test_utils::assert_near(output, expected, test_utils::precision<T>::value * block_size);
+
+        HIP_CHECK(hipFree(device_output));
+    }
+}
+
+template<unsigned int               BlockSize,
+         unsigned int               ItemsPerThread,
+         hipcub::BlockScanAlgorithm Algorithm,
+         class T>
+__global__ __launch_bounds__(BlockSize)
+void inclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reductions)
+{
+    const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
+
+    // load
+    T in_out[ItemsPerThread];
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        in_out[j] = device_output[index + j];
+    }
+
+    using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::TempStorage temp_storage;
+    T                                        reduction;
+    bscan_t(temp_storage).InclusiveScan(in_out, in_out, hipcub::Sum(), reduction);
+
+    // store
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        device_output[index + j] = in_out[j];
+    }
+
+    if(hipThreadIdx_x == 0)
+    {
+        device_output_reductions[hipBlockIdx_x] = reduction;
+    }
+}
+
+TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanReduce)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using T = typename TestFixture::type;
+    // for bfloat16 and half we use double for host-side accumulation
+    using binary_op_type_host = typename test_utils::select_plus_operator_host<T>::type;
+    binary_op_type_host binary_op_host;
+    using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
+
+    constexpr auto   algorithm        = TestFixture::algorithm;
+    constexpr size_t block_size       = TestFixture::block_size;
+    constexpr size_t items_per_thread = TestFixture::items_per_thread;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size            = items_per_block * 37;
+    const size_t grid_size       = size / items_per_block;
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data<T>(size, 2, 200, seed_value);
+
+        // Output reduce results
+        std::vector<T> output_reductions(size / block_size, test_utils::convert_to_device<T>(0));
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), test_utils::convert_to_device<T>(0));
+        std::vector<T> expected_reductions(output_reductions.size(),
+                                           test_utils::convert_to_device<T>(0));
+        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        {
+            acc_type accumulator(0);
+            for(size_t j = 0; j < items_per_block; j++)
+            {
+                auto idx      = i * items_per_block + j;
+                accumulator   = binary_op_host(static_cast<acc_type>(output[idx]), accumulator);
+                expected[idx] = static_cast<T>(accumulator);
+            }
+            expected_reductions[i] = expected[(i + 1) * items_per_block - 1];
+        }
+
+        // Writing to device memory
+        T* device_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output,
+            output.size() * sizeof(typename decltype(output)::value_type)));
+        T* device_output_reductions;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output_reductions,
+            output_reductions.size() * sizeof(typename decltype(output_reductions)::value_type)));
+
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        HIP_CHECK(hipMemset(device_output_reductions,
+                            test_utils::convert_to_device<T>(0),
+                            output_reductions.size() * sizeof(T)));
+
+        // Launching kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                inclusive_scan_reduce_array_kernel<block_size, items_per_thread, algorithm, T>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_output,
+            device_output_reductions);
+
+        HIP_CHECK(hipPeekAtLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        HIP_CHECK(hipMemcpy(output.data(),
+                            device_output,
+                            output.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        HIP_CHECK(hipMemcpy(output_reductions.data(),
+                            device_output_reductions,
+                            output_reductions.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        // Validating results
+        test_utils::assert_near(output, expected, test_utils::precision<T>::value * block_size);
+
+        test_utils::assert_near(output_reductions,
+                                expected_reductions,
+                                test_utils::precision<T>::value * block_size);
+
+        HIP_CHECK(hipFree(device_output));
+        HIP_CHECK(hipFree(device_output_reductions));
+    }
+}
+
+template<unsigned int               BlockSize,
+         unsigned int               ItemsPerThread,
+         hipcub::BlockScanAlgorithm Algorithm,
+         class T>
+__global__ __launch_bounds__(BlockSize)
+void inclusive_scan_array_prefix_callback_kernel(T* device_output,
+                                                 T* device_output_bp,
+                                                 T  block_prefix)
+{
+    const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
+    T                  prefix_value    = block_prefix;
+    auto               prefix_callback = [&prefix_value](T reduction)
+    {
+        T prefix     = prefix_value;
+        prefix_value = prefix_value + reduction;
+        return prefix;
+    };
+
+    // load
+    T in_out[ItemsPerThread];
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        in_out[j] = device_output[index + j];
+    }
+
+    using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::TempStorage temp_storage;
+    bscan_t(temp_storage).InclusiveScan(in_out, in_out, hipcub::Sum(), prefix_callback);
+
+    // store
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        device_output[index + j] = in_out[j];
+    }
+
+    if(hipThreadIdx_x == 0)
+    {
+        device_output_bp[hipBlockIdx_x] = prefix_value;
+    }
+}
+
+TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanPrefixCallback)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using T = typename TestFixture::type;
+    // for bfloat16 and half we use double for host-side accumulation
+    using binary_op_type_host = typename test_utils::select_plus_operator_host<T>::type;
+    binary_op_type_host binary_op_host;
+    using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
+
+    constexpr auto   algorithm        = TestFixture::algorithm;
+    constexpr size_t block_size       = TestFixture::block_size;
+    constexpr size_t items_per_thread = TestFixture::items_per_thread;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size            = items_per_block * 37;
+    const size_t grid_size       = size / items_per_block;
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data<T>(size, 2, 200, seed_value);
+        std::vector<T> output_block_prefixes(size / items_per_block,
+                                             test_utils::convert_to_device<T>(0));
+        T block_prefix = test_utils::get_random_value<T>(test_utils::convert_to_device<T>(0),
+                                                         test_utils::convert_to_device<T>(100),
+                                                         seed_value + seed_value_addition);
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), test_utils::convert_to_device<T>(0));
+        std::vector<T> expected_block_prefixes(output_block_prefixes.size(),
+                                               test_utils::convert_to_device<T>(0));
+        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        {
+            acc_type accumulator(block_prefix);
+            for(size_t j = 0; j < items_per_block; j++)
+            {
+                auto idx      = i * items_per_block + j;
+                accumulator   = binary_op_host(static_cast<acc_type>(output[idx]), accumulator);
+                expected[idx] = static_cast<T>(accumulator);
+            }
+            expected_block_prefixes[i] = expected[(i + 1) * items_per_block - 1];
+        }
+
+        // Writing to device memory
+        T* device_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output,
+            output.size() * sizeof(typename decltype(output)::value_type)));
+        T* device_output_bp;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output_bp,
+            output_block_prefixes.size()
+                * sizeof(typename decltype(output_block_prefixes)::value_type)));
+
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        HIP_CHECK(hipMemcpy(device_output_bp,
+                            output_block_prefixes.data(),
+                            output_block_prefixes.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        // Launching kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(inclusive_scan_array_prefix_callback_kernel<block_size,
+                                                                        items_per_thread,
+                                                                        algorithm,
+                                                                        T>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_output,
+            device_output_bp,
+            block_prefix);
+
+        HIP_CHECK(hipPeekAtLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        HIP_CHECK(hipMemcpy(output.data(),
+                            device_output,
+                            output.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        HIP_CHECK(hipMemcpy(output_block_prefixes.data(),
+                            device_output_bp,
+                            output_block_prefixes.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        // Validating results
+        test_utils::assert_near(output, expected, test_utils::precision<T>::value * block_size);
+
+        test_utils::assert_near(output_block_prefixes,
+                                expected_block_prefixes,
+                                test_utils::precision<T>::value * block_size);
+
+        HIP_CHECK(hipFree(device_output));
+        HIP_CHECK(hipFree(device_output_bp));
+    }
+}
+
+template<unsigned int               BlockSize,
+         unsigned int               ItemsPerThread,
+         hipcub::BlockScanAlgorithm Algorithm,
+         class T>
+__global__ __launch_bounds__(BlockSize)
+void exclusive_scan_array_kernel(T* device_output, T init)
+{
+    const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
+    // load
+    T in_out[ItemsPerThread];
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        in_out[j] = device_output[index + j];
+    }
+
+    using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::TempStorage temp_storage;
+    bscan_t(temp_storage).ExclusiveScan(in_out, in_out, init, hipcub::Sum());
+
+    // store
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        device_output[index + j] = in_out[j];
+    }
+}
+
+TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScan)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using T = typename TestFixture::type;
+    // for bfloat16 and half we use double for host-side accumulation
+    using binary_op_type_host = typename test_utils::select_plus_operator_host<T>::type;
+    binary_op_type_host binary_op_host;
+    using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
+
+    constexpr auto   algorithm        = TestFixture::algorithm;
+    constexpr size_t block_size       = TestFixture::block_size;
+    constexpr size_t items_per_thread = TestFixture::items_per_thread;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size            = items_per_block * 37;
+    const size_t grid_size       = size / items_per_block;
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+        // Generate data
+        std::vector<T> output
+            = test_utils::get_random_data<T>(size,
+                                             test_utils::convert_to_device<T>(2),
+                                             test_utils::convert_to_device<T>(200),
+                                             seed_value);
+        const T init = test_utils::get_random_value<T>(test_utils::convert_to_device<T>(0),
+                                                       test_utils::convert_to_device<T>(100),
+                                                       seed_value + seed_value_addition);
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), test_utils::convert_to_device<T>(0));
+        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        {
+            acc_type accumulator(init);
+            expected[i * items_per_block] = init;
+            for(size_t j = 1; j < items_per_block; j++)
+            {
+                auto idx      = i * items_per_block + j;
+                accumulator   = binary_op_host(static_cast<acc_type>(output[idx - 1]), accumulator);
+                expected[idx] = static_cast<T>(accumulator);
+            }
+        }
+
+        // Writing to device memory
+        T* device_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output,
+            output.size() * sizeof(typename decltype(output)::value_type)));
+
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        // Launching kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                exclusive_scan_array_kernel<block_size, items_per_thread, algorithm, T>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_output,
+            init);
+
+        HIP_CHECK(hipPeekAtLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        HIP_CHECK(hipMemcpy(output.data(),
+                            device_output,
+                            output.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        // Validating results
+        test_utils::assert_near(output, expected, test_utils::precision<T>::value * block_size);
+
+        HIP_CHECK(hipFree(device_output));
+    }
+}
+
+template<unsigned int               BlockSize,
+         unsigned int               ItemsPerThread,
+         hipcub::BlockScanAlgorithm Algorithm,
+         class T>
+__global__ __launch_bounds__(BlockSize)
+void exclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reductions, T init)
+{
+    const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
+    // load
+    T in_out[ItemsPerThread];
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        in_out[j] = device_output[index + j];
+    }
+
+    using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::TempStorage temp_storage;
+    T                                        reduction;
+    bscan_t(temp_storage).ExclusiveScan(in_out, in_out, init, hipcub::Sum(), reduction);
+
+    // store
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        device_output[index + j] = in_out[j];
+    }
+
+    if(hipThreadIdx_x == 0)
+    {
+        device_output_reductions[hipBlockIdx_x] = reduction;
+    }
+}
+
+TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanReduce)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using T = typename TestFixture::type;
+    // for bfloat16 and half we use double for host-side accumulation
+    using binary_op_type_host = typename test_utils::select_plus_operator_host<T>::type;
+    binary_op_type_host binary_op_host;
+    using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
+
+    constexpr auto   algorithm        = TestFixture::algorithm;
+    constexpr size_t block_size       = TestFixture::block_size;
+    constexpr size_t items_per_thread = TestFixture::items_per_thread;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size            = items_per_block * 37;
+    const size_t grid_size       = size / items_per_block;
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+        // Generate data
+        std::vector<T> output
+            = test_utils::get_random_data<T>(size,
+                                             test_utils::convert_to_device<T>(2),
+                                             test_utils::convert_to_device<T>(200),
+                                             seed_value);
+
+        // Output reduce results
+        std::vector<T> output_reductions(size / block_size, test_utils::convert_to_device<T>(0));
+        const T        init = test_utils::get_random_value<T>(test_utils::convert_to_device<T>(0),
+                                                       test_utils::convert_to_device<T>(100),
+                                                       seed_value + seed_value_addition);
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), test_utils::convert_to_device<T>(0));
+        std::vector<T> expected_reductions(output_reductions.size(),
+                                           test_utils::convert_to_device<T>(0));
+        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        {
+            acc_type accumulator(init);
+            expected[i * items_per_block] = init;
+            for(size_t j = 1; j < items_per_block; j++)
+            {
+                auto idx      = i * items_per_block + j;
+                accumulator   = binary_op_host(static_cast<acc_type>(output[idx - 1]), accumulator);
+                expected[idx] = static_cast<T>(accumulator);
+            }
+
+            acc_type accumulator_reductions(0);
+            for(size_t j = 0; j < items_per_block; j++)
+            {
+                auto idx = i * items_per_block + j;
+                accumulator_reductions
+                    = binary_op_host(static_cast<acc_type>(output[idx]), accumulator_reductions);
+                expected_reductions[i] = static_cast<T>(accumulator_reductions);
+            }
+        }
+
+        // Writing to device memory
+        T* device_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output,
+            output.size() * sizeof(typename decltype(output)::value_type)));
+        T* device_output_reductions;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output_reductions,
+            output_reductions.size() * sizeof(typename decltype(output_reductions)::value_type)));
+
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        HIP_CHECK(hipMemset(device_output_reductions,
+                            test_utils::convert_to_device<T>(0),
+                            output_reductions.size() * sizeof(T)));
+
+        // Launching kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                exclusive_scan_reduce_array_kernel<block_size, items_per_thread, algorithm, T>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_output,
+            device_output_reductions,
+            init);
+
+        HIP_CHECK(hipPeekAtLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        HIP_CHECK(hipMemcpy(output.data(),
+                            device_output,
+                            output.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        HIP_CHECK(hipMemcpy(output_reductions.data(),
+                            device_output_reductions,
+                            output_reductions.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        // Validating results
+        test_utils::assert_near(output, expected, test_utils::precision<T>::value * block_size);
+
+        test_utils::assert_near(output_reductions,
+                                expected_reductions,
+                                test_utils::precision<T>::value * block_size);
+    }
+}
+
+template<unsigned int               BlockSize,
+         unsigned int               ItemsPerThread,
+         hipcub::BlockScanAlgorithm Algorithm,
+         class T>
+__global__ __launch_bounds__(BlockSize)
+void exclusive_scan_prefix_callback_array_kernel(T* device_output,
+                                                 T* device_output_bp,
+                                                 T  block_prefix)
+{
+    const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
+    T                  prefix_value    = block_prefix;
+    auto               prefix_callback = [&prefix_value](T reduction)
+    {
+        T prefix     = prefix_value;
+        prefix_value = prefix_value + reduction;
+        return prefix;
+    };
+
+    // load
+    T in_out[ItemsPerThread];
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        in_out[j] = device_output[index + j];
+    }
+
+    using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::TempStorage temp_storage;
+    bscan_t(temp_storage).ExclusiveScan(in_out, in_out, hipcub::Sum(), prefix_callback);
+
+    // store
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        device_output[index + j] = in_out[j];
+    }
+
+    if(hipThreadIdx_x == 0)
+    {
+        device_output_bp[hipBlockIdx_x] = prefix_value;
+    }
+}
+
+TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanPrefixCallback)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using T = typename TestFixture::type;
+    // for bfloat16 and half we use double for host-side accumulation
+    using binary_op_type_host = typename test_utils::select_plus_operator_host<T>::type;
+    binary_op_type_host binary_op_host;
+    using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
+
+    constexpr auto   algorithm        = TestFixture::algorithm;
+    constexpr size_t block_size       = TestFixture::block_size;
+    constexpr size_t items_per_thread = TestFixture::items_per_thread;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size            = items_per_block * 37;
+    const size_t grid_size       = size / items_per_block;
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data<T>(size, 2, 200, seed_value);
+        std::vector<T> output_block_prefixes(size / items_per_block);
+        T block_prefix = test_utils::get_random_value<T>(test_utils::convert_to_device<T>(0),
+                                                         test_utils::convert_to_device<T>(100),
+                                                         seed_value + seed_value_addition);
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), test_utils::convert_to_device<T>(0));
+        std::vector<T> expected_block_prefixes(output_block_prefixes.size(),
+                                               test_utils::convert_to_device<T>(0));
+        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        {
+            acc_type accumulator(block_prefix);
+            expected[i * items_per_block] = block_prefix;
+            for(size_t j = 1; j < items_per_block; j++)
+            {
+                auto idx      = i * items_per_block + j;
+                accumulator   = binary_op_host(static_cast<acc_type>(output[idx - 1]), accumulator);
+                expected[idx] = static_cast<T>(accumulator);
+            }
+            acc_type accumulator_block_prefixes(block_prefix);
+            for(size_t j = 0; j < items_per_block; j++)
+            {
+                auto idx                   = i * items_per_block + j;
+                accumulator_block_prefixes = binary_op_host(static_cast<acc_type>(output[idx]),
+                                                            accumulator_block_prefixes);
+                expected_block_prefixes[i] = static_cast<T>(accumulator_block_prefixes);
+            }
+        }
+
+        // Writing to device memory
+        T* device_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output,
+            output.size() * sizeof(typename decltype(output)::value_type)));
+        T* device_output_bp;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output_bp,
+            output_block_prefixes.size()
+                * sizeof(typename decltype(output_block_prefixes)::value_type)));
+
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        // Launching kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(exclusive_scan_prefix_callback_array_kernel<block_size,
+                                                                        items_per_thread,
+                                                                        algorithm,
+                                                                        T>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_output,
+            device_output_bp,
+            block_prefix);
+
+        HIP_CHECK(hipPeekAtLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        HIP_CHECK(hipMemcpy(output.data(),
+                            device_output,
+                            output.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        HIP_CHECK(hipMemcpy(output_block_prefixes.data(),
+                            device_output_bp,
+                            output_block_prefixes.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        // Validating results
+        test_utils::assert_near(output, expected, test_utils::precision<T>::value * block_size);
+
+        test_utils::assert_near(output_block_prefixes,
+                                expected_block_prefixes,
+                                test_utils::precision<T>::value * block_size);
+
+        HIP_CHECK(hipFree(device_output));
+        HIP_CHECK(hipFree(device_output_bp));
+    }
+}
+
+template<unsigned int               BlockSize,
+         unsigned int               ItemsPerThread,
+         hipcub::BlockScanAlgorithm Algorithm,
+         class T>
+__global__ __launch_bounds__(BlockSize)
+void inclusive_sum_array_kernel(T* device_output)
+{
+    const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
+
+    // load
+    T in_out[ItemsPerThread];
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        in_out[j] = device_output[index + j];
+    }
+
+    using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::TempStorage temp_storage;
+    bscan_t(temp_storage).InclusiveSum(in_out, in_out);
+
+    // store
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        device_output[index + j] = in_out[j];
+    }
+}
+
+TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveSum)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using T = typename TestFixture::type;
+    // for bfloat16 and half we use double for host-side accumulation
+    using binary_op_type_host = typename test_utils::select_plus_operator_host<T>::type;
+    binary_op_type_host binary_op_host;
+    using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
+
+    constexpr auto   algorithm        = TestFixture::algorithm;
+    constexpr size_t block_size       = TestFixture::block_size;
+    constexpr size_t items_per_thread = TestFixture::items_per_thread;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size            = items_per_block * 37;
+    const size_t grid_size       = size / items_per_block;
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data<T>(size, 2, 200, seed_value);
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), test_utils::convert_to_device<T>(0));
+        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        {
+            acc_type accumulator(0);
+            for(size_t j = 0; j < items_per_block; j++)
+            {
+                auto idx      = i * items_per_block + j;
+                accumulator   = binary_op_host(static_cast<acc_type>(output[idx]), accumulator);
+                expected[idx] = static_cast<T>(accumulator);
+            }
+        }
+
+        // Writing to device memory
+        T* device_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output,
+            output.size() * sizeof(typename decltype(output)::value_type)));
+
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        // Launching kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(inclusive_sum_array_kernel<block_size, items_per_thread, algorithm, T>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_output);
+
+        HIP_CHECK(hipPeekAtLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        HIP_CHECK(hipMemcpy(output.data(),
+                            device_output,
+                            output.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        // Validating results
+        test_utils::assert_near(output, expected, test_utils::precision<T>::value * block_size);
+
+        HIP_CHECK(hipFree(device_output));
+    }
+}
+
+template<unsigned int               BlockSize,
+         unsigned int               ItemsPerThread,
+         hipcub::BlockScanAlgorithm Algorithm,
+         class T>
+__global__ __launch_bounds__(BlockSize)
+void inclusive_sum_reduce_array_kernel(T* device_output, T* device_output_reductions)
+{
+    const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
+
+    // load
+    T in_out[ItemsPerThread];
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        in_out[j] = device_output[index + j];
+    }
+
+    using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::TempStorage temp_storage;
+    T                                        reduction;
+    bscan_t(temp_storage).InclusiveSum(in_out, in_out, reduction);
+
+    // store
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        device_output[index + j] = in_out[j];
+    }
+
+    if(hipThreadIdx_x == 0)
+    {
+        device_output_reductions[hipBlockIdx_x] = reduction;
+    }
+}
+
+TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveSumReduce)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using T = typename TestFixture::type;
+    // for bfloat16 and half we use double for host-side accumulation
+    using binary_op_type_host = typename test_utils::select_plus_operator_host<T>::type;
+    binary_op_type_host binary_op_host;
+    using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
+
+    constexpr auto   algorithm        = TestFixture::algorithm;
+    constexpr size_t block_size       = TestFixture::block_size;
+    constexpr size_t items_per_thread = TestFixture::items_per_thread;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size            = items_per_block * 37;
+    const size_t grid_size       = size / items_per_block;
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data<T>(size, 2, 200, seed_value);
+
+        // Output reduce results
+        std::vector<T> output_reductions(size / block_size, test_utils::convert_to_device<T>(0));
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), test_utils::convert_to_device<T>(0));
+        std::vector<T> expected_reductions(output_reductions.size(),
+                                           test_utils::convert_to_device<T>(0));
+        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        {
+            acc_type accumulator(0);
+            for(size_t j = 0; j < items_per_block; j++)
+            {
+                auto idx      = i * items_per_block + j;
+                accumulator   = binary_op_host(static_cast<acc_type>(output[idx]), accumulator);
+                expected[idx] = static_cast<T>(accumulator);
+            }
+            expected_reductions[i] = expected[(i + 1) * items_per_block - 1];
+        }
+
+        // Writing to device memory
+        T* device_output;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output,
+            output.size() * sizeof(typename decltype(output)::value_type)));
+        T* device_output_reductions;
+        HIP_CHECK(test_common_utils::hipMallocHelper(
+            &device_output_reductions,
+            output_reductions.size() * sizeof(typename decltype(output_reductions)::value_type)));
+
+        HIP_CHECK(hipMemcpy(device_output,
+                            output.data(),
+                            output.size() * sizeof(T),
+                            hipMemcpyHostToDevice));
+
+        HIP_CHECK(hipMemset(device_output_reductions,
+                            test_utils::convert_to_device<T>(0),
+                            output_reductions.size() * sizeof(T)));
+
+        // Launching kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                inclusive_sum_reduce_array_kernel<block_size, items_per_thread, algorithm, T>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_output,
+            device_output_reductions);
+
+        HIP_CHECK(hipPeekAtLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        HIP_CHECK(hipMemcpy(output.data(),
+                            device_output,
+                            output.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        HIP_CHECK(hipMemcpy(output_reductions.data(),
+                            device_output_reductions,
+                            output_reductions.size() * sizeof(T),
+                            hipMemcpyDeviceToHost));
+
+        // Validating results
+        test_utils::assert_near(output, expected, test_utils::precision<T>::value * block_size);
+
+        test_utils::assert_near(output_reductions,
+                                expected_reductions,
+                                test_utils::precision<T>::value * block_size);
+
+        HIP_CHECK(hipFree(device_output));
+        HIP_CHECK(hipFree(device_output_reductions));
+    }
+}
+
+template<unsigned int               BlockSize,
+         unsigned int               ItemsPerThread,
+         hipcub::BlockScanAlgorithm Algorithm,
+         class T>
+__global__ __launch_bounds__(BlockSize)
+void inclusive_sum_array_prefix_callback_kernel(T* device_output,
+                                                T* device_output_bp,
+                                                T  block_prefix)
+{
+    const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
+    T                  prefix_value    = block_prefix;
+    auto               prefix_callback = [&prefix_value](T reduction)
+    {
+        T prefix     = prefix_value;
+        prefix_value = prefix_value + reduction;
+        return prefix;
+    };
+
+    // load
+    T in_out[ItemsPerThread];
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        in_out[j] = device_output[index + j];
+    }
+
+    using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::TempStorage temp_storage;
+    bscan_t(temp_storage).InclusiveSum(in_out, in_out, prefix_callback);
+
+    // store
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        device_output[index + j] = in_out[j];
+    }
+
+    if(hipThreadIdx_x == 0)
+    {
+        device_output_bp[hipBlockIdx_x] = prefix_value;
+    }
+}
+
+TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveSumPrefixCallback)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using T = typename TestFixture::type;
+    // for bfloat16 and half we use double for host-side accumulation
+    using binary_op_type_host = typename test_utils::select_plus_operator_host<T>::type;
+    binary_op_type_host binary_op_host;
+    using acc_type = typename test_utils::select_plus_operator_host<T>::acc_type;
+
+    constexpr auto   algorithm        = TestFixture::algorithm;
+    constexpr size_t block_size       = TestFixture::block_size;
+    constexpr size_t items_per_thread = TestFixture::items_per_thread;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size            = items_per_block * 37;
+    const size_t grid_size       = size / items_per_block;
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data<T>(size, 2, 200, seed_value);
+        std::vector<T> output_block_prefixes(size / items_per_block,
+                                             test_utils::convert_to_device<T>(0));
+        T block_prefix = test_utils::get_random_value<T>(test_utils::convert_to_device<T>(0),
+                                                         test_utils::convert_to_device<T>(100),
+                                                         seed_value + seed_value_addition);
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), test_utils::convert_to_device<T>(0));
+        std::vector<T> expected_block_prefixes(output_block_prefixes.size(),
+                                               test_utils::convert_to_device<T>(0));
         for(size_t i = 0; i < output.size() / items_per_block; i++)
         {
             acc_type accumulator(block_prefix);
@@ -1560,10 +3112,10 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, InclusiveScanPrefixCallback)
 
         // Launching kernel
         hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(block_inclusive_scan_array_prefix_callback_kernel<block_size,
-                                                                              items_per_thread,
-                                                                              algorithm,
-                                                                              T>),
+            HIP_KERNEL_NAME(inclusive_sum_array_prefix_callback_kernel<block_size,
+                                                                       items_per_thread,
+                                                                       algorithm,
+                                                                       T>),
             dim3(grid_size),
             dim3(block_size),
             0,
@@ -1602,7 +3154,8 @@ template<unsigned int               BlockSize,
          unsigned int               ItemsPerThread,
          hipcub::BlockScanAlgorithm Algorithm,
          class T>
-__global__ __launch_bounds__(BlockSize) void exclusive_scan_array_kernel(T* device_output, T init)
+__global__ __launch_bounds__(BlockSize)
+void exclusive_sum_array_kernel(T* device_output)
 {
     const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
     // load
@@ -1614,7 +3167,7 @@ __global__ __launch_bounds__(BlockSize) void exclusive_scan_array_kernel(T* devi
 
     using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
     __shared__ typename bscan_t::TempStorage temp_storage;
-    bscan_t(temp_storage).ExclusiveScan(in_out, in_out, init, hipcub::Sum());
+    bscan_t(temp_storage).ExclusiveSum(in_out, in_out);
 
     // store
     for(unsigned int j = 0; j < ItemsPerThread; j++)
@@ -1623,7 +3176,7 @@ __global__ __launch_bounds__(BlockSize) void exclusive_scan_array_kernel(T* devi
     }
 }
 
-TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScan)
+TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveSum)
 {
     int device_id = test_common_utils::obtain_device_from_ctest();
     SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
@@ -1661,9 +3214,7 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScan)
                                              test_utils::convert_to_device<T>(2),
                                              test_utils::convert_to_device<T>(200),
                                              seed_value);
-        const T init = test_utils::get_random_value<T>(test_utils::convert_to_device<T>(0),
-                                                       test_utils::convert_to_device<T>(100),
-                                                       seed_value + seed_value_addition);
+        const T init = static_cast<T>(0);
 
         // Calculate expected results on host
         std::vector<T> expected(output.size(), test_utils::convert_to_device<T>(0));
@@ -1692,14 +3243,12 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScan)
 
         // Launching kernel
         hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(
-                exclusive_scan_array_kernel<block_size, items_per_thread, algorithm, T>),
+            HIP_KERNEL_NAME(exclusive_sum_array_kernel<block_size, items_per_thread, algorithm, T>),
             dim3(grid_size),
             dim3(block_size),
             0,
             0,
-            device_output,
-            init);
+            device_output);
 
         HIP_CHECK(hipPeekAtLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -1721,8 +3270,8 @@ template<unsigned int               BlockSize,
          unsigned int               ItemsPerThread,
          hipcub::BlockScanAlgorithm Algorithm,
          class T>
-__global__ __launch_bounds__(BlockSize) void exclusive_scan_reduce_array_kernel(
-    T* device_output, T* device_output_reductions, T init)
+__global__ __launch_bounds__(BlockSize)
+void exclusive_sum_reduce_array_kernel(T* device_output, T* device_output_reductions)
 {
     const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
     // load
@@ -1735,7 +3284,7 @@ __global__ __launch_bounds__(BlockSize) void exclusive_scan_reduce_array_kernel(
     using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
     __shared__ typename bscan_t::TempStorage temp_storage;
     T                                        reduction;
-    bscan_t(temp_storage).ExclusiveScan(in_out, in_out, init, hipcub::Sum(), reduction);
+    bscan_t(temp_storage).ExclusiveSum(in_out, in_out, reduction);
 
     // store
     for(unsigned int j = 0; j < ItemsPerThread; j++)
@@ -1749,7 +3298,7 @@ __global__ __launch_bounds__(BlockSize) void exclusive_scan_reduce_array_kernel(
     }
 }
 
-TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanReduce)
+TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveSumReduce)
 {
     int device_id = test_common_utils::obtain_device_from_ctest();
     SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
@@ -1790,9 +3339,7 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanReduce)
 
         // Output reduce results
         std::vector<T> output_reductions(size / block_size, test_utils::convert_to_device<T>(0));
-        const T        init = test_utils::get_random_value<T>(test_utils::convert_to_device<T>(0),
-                                                       test_utils::convert_to_device<T>(100),
-                                                       seed_value + seed_value_addition);
+        const T        init = static_cast<T>(0);
 
         // Calculate expected results on host
         std::vector<T> expected(output.size(), test_utils::convert_to_device<T>(0));
@@ -1841,14 +3388,13 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanReduce)
         // Launching kernel
         hipLaunchKernelGGL(
             HIP_KERNEL_NAME(
-                exclusive_scan_reduce_array_kernel<block_size, items_per_thread, algorithm, T>),
+                exclusive_sum_reduce_array_kernel<block_size, items_per_thread, algorithm, T>),
             dim3(grid_size),
             dim3(block_size),
             0,
             0,
             device_output,
-            device_output_reductions,
-            init);
+            device_output_reductions);
 
         HIP_CHECK(hipPeekAtLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -1877,8 +3423,10 @@ template<unsigned int               BlockSize,
          unsigned int               ItemsPerThread,
          hipcub::BlockScanAlgorithm Algorithm,
          class T>
-__global__ __launch_bounds__(BlockSize) void exclusive_scan_prefix_callback_array_kernel(
-    T* device_output, T* device_output_bp, T block_prefix)
+__global__ __launch_bounds__(BlockSize)
+void exclusive_sum_prefix_callback_array_kernel(T* device_output,
+                                                T* device_output_bp,
+                                                T  block_prefix)
 {
     const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
     T                  prefix_value    = block_prefix;
@@ -1898,7 +3446,7 @@ __global__ __launch_bounds__(BlockSize) void exclusive_scan_prefix_callback_arra
 
     using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
     __shared__ typename bscan_t::TempStorage temp_storage;
-    bscan_t(temp_storage).ExclusiveScan(in_out, in_out, hipcub::Sum(), prefix_callback);
+    bscan_t(temp_storage).ExclusiveSum(in_out, in_out, prefix_callback);
 
     // store
     for(unsigned int j = 0; j < ItemsPerThread; j++)
@@ -1912,7 +3460,7 @@ __global__ __launch_bounds__(BlockSize) void exclusive_scan_prefix_callback_arra
     }
 }
 
-TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanPrefixCallback)
+TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveSumPrefixCallback)
 {
     int device_id = test_common_utils::obtain_device_from_ctest();
     SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
@@ -1968,7 +3516,7 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanPrefixCallback)
             acc_type accumulator_block_prefixes(block_prefix);
             for(size_t j = 0; j < items_per_block; j++)
             {
-                auto idx = i * items_per_block + j;
+                auto idx                   = i * items_per_block + j;
                 accumulator_block_prefixes = binary_op_host(static_cast<acc_type>(output[idx]),
                                                             accumulator_block_prefixes);
                 expected_block_prefixes[i] = static_cast<T>(accumulator_block_prefixes);
@@ -1993,10 +3541,10 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanPrefixCallback)
 
         // Launching kernel
         hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(exclusive_scan_prefix_callback_array_kernel<block_size,
-                                                                        items_per_thread,
-                                                                        algorithm,
-                                                                        T>),
+            HIP_KERNEL_NAME(exclusive_sum_prefix_callback_array_kernel<block_size,
+                                                                       items_per_thread,
+                                                                       algorithm,
+                                                                       T>),
             dim3(grid_size),
             dim3(block_size),
             0,
@@ -2029,4 +3577,4 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanPrefixCallback)
         HIP_CHECK(hipFree(device_output));
         HIP_CHECK(hipFree(device_output_bp));
     }
-}
+}
\ No newline at end of file
diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_shuffle.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_shuffle.cpp
index 500ffd69b2a..f0eb9d54953 100644
--- a/projects/hipcub/test/hipcub/test_hipcub_block_shuffle.cpp
+++ b/projects/hipcub/test/hipcub/test_hipcub_block_shuffle.cpp
@@ -24,8 +24,8 @@
 
 // required hipcub headers
 #include <hipcub/block/block_load.hpp>
-#include <hipcub/block/block_store.hpp>
 #include <hipcub/block/block_shuffle.hpp>
+#include <hipcub/block/block_store.hpp>
 // #include <hipcub/block/block_sort.hpp>
 
 // required test headers
@@ -34,21 +34,18 @@
 #include <type_traits>
 
 // Params for tests
-template<
-    class T,
-    unsigned int BlockSize = 256U
->
+template<class T, unsigned int BlockSize = 256U>
 struct params
 {
-    using type = T;
+    using type                               = T;
     static constexpr unsigned int block_size = BlockSize;
 };
 
-
 template<typename Params>
-class HipcubBlockShuffleTests : public ::testing::Test {
+class HipcubBlockShuffleTests : public ::testing::Test
+{
 public:
-    using type = typename Params::type;
+    using type                               = typename Params::type;
     static constexpr unsigned int block_size = Params::block_size;
 };
 
@@ -78,21 +75,14 @@ using SingleValueTestParams = ::testing::Types<
 
 TYPED_TEST_SUITE(HipcubBlockShuffleTests, SingleValueTestParams);
 
-template<
-    unsigned int BlockSize,
-    class T
->
+template<unsigned int BlockSize, class T>
 __global__
 __launch_bounds__(BlockSize)
 void shuffle_offset_kernel(T* device_input, T* device_output, int distance)
 {
-    const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
-    hipcub::BlockShuffle<T,BlockSize> b_shuffle;
-    b_shuffle.Offset(
-        device_input[index],
-        device_output[index],
-        distance
-    );
+    const unsigned int                 index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    hipcub::BlockShuffle<T, BlockSize> b_shuffle;
+    b_shuffle.Offset(device_input[index], device_output[index], distance);
 }
 
 TYPED_TEST(HipcubBlockShuffleTests, BlockOffset)
@@ -101,15 +91,18 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockOffset)
     SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
     HIP_CHECK(hipSetDevice(device_id));
 
-    using type = typename TestFixture::type;
+    using type              = typename TestFixture::type;
     const size_t block_size = TestFixture::block_size;
-    const size_t size = block_size * 1134;
-    const size_t grid_size = size / block_size;
-    for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    const size_t size       = block_size * 1134;
+    const size_t grid_size  = size / block_size;
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
-        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
-        int distance = rand() % std::min(size_t(10), block_size/2) - std::min(size_t(10), block_size/2);
-        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value <<" & distance = "<<distance);
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        int distance
+            = rand() % std::min(size_t(10), block_size / 2) - std::min(size_t(10), block_size / 2);
+        SCOPED_TRACE(testing::Message()
+                     << "with seed= " << seed_value << " & distance = " << distance);
         // Generate data
         const int         min_value = std::is_unsigned<type>::value ? 0 : -100;
         std::vector<type> input_data
@@ -117,35 +110,32 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockOffset)
         std::vector<type> output_data(input_data);
 
         // Preparing device
-        type * device_input;
-        type * device_output;
+        type* device_input;
+        type* device_output;
 
         HIP_CHECK(hipMalloc(&device_input, input_data.size() * sizeof(type)));
         HIP_CHECK(hipMalloc(&device_output, input_data.size() * sizeof(type)));
 
-        HIP_CHECK(
-            hipMemcpy(
-                device_input, input_data.data(),
-                input_data.size() * sizeof(type),
-                hipMemcpyHostToDevice
-            )
-        );
+        HIP_CHECK(hipMemcpy(device_input,
+                            input_data.data(),
+                            input_data.size() * sizeof(type),
+                            hipMemcpyHostToDevice));
 
         // Running kernel
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(shuffle_offset_kernel<block_size, type>),
-            dim3(grid_size), dim3(block_size), 0, 0,
-            device_input, device_output, distance
-        );
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(shuffle_offset_kernel<block_size, type>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_input,
+                           device_output,
+                           distance);
 
         // Reading results back
-        HIP_CHECK(
-            hipMemcpy(
-                output_data.data(), device_output,
-                output_data.size() * sizeof(type),
-                hipMemcpyDeviceToHost
-            )
-        );
+        HIP_CHECK(hipMemcpy(output_data.data(),
+                            device_output,
+                            output_data.size() * sizeof(type),
+                            hipMemcpyDeviceToHost));
 
         // Calculate expected results on host
         for(size_t block_index = 0; block_index < grid_size; block_index++)
@@ -168,21 +158,14 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockOffset)
     }
 }
 
-template<
-    unsigned int BlockSize,
-    class T
->
+template<unsigned int BlockSize, class T>
 __global__
 __launch_bounds__(BlockSize)
 void shuffle_rotate_kernel(T* device_input, T* device_output, int distance)
 {
-    const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
-    hipcub::BlockShuffle<T,BlockSize> b_shuffle;
-    b_shuffle.Rotate(
-        device_input[index],
-        device_output[index],
-        distance
-    );
+    const unsigned int                 index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    hipcub::BlockShuffle<T, BlockSize> b_shuffle;
+    b_shuffle.Rotate(device_input[index], device_output[index], distance);
 }
 
 TYPED_TEST(HipcubBlockShuffleTests, BlockRotate)
@@ -191,15 +174,17 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockRotate)
     SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
     HIP_CHECK(hipSetDevice(device_id));
 
-    using type = typename TestFixture::type;
+    using type              = typename TestFixture::type;
     const size_t block_size = TestFixture::block_size;
-    const size_t size = block_size * 1134;
-    const size_t grid_size = size / block_size;
-    for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    const size_t size       = block_size * 1134;
+    const size_t grid_size  = size / block_size;
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
-        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
-        int distance = rand() % std::min(size_t(5), block_size/2);
-        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value <<" & distance = "<<distance);
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        int distance = rand() % std::min(size_t(5), block_size / 2);
+        SCOPED_TRACE(testing::Message()
+                     << "with seed= " << seed_value << " & distance = " << distance);
         // Generate data
         const int         min_value = std::is_unsigned<type>::value ? 0 : -100;
         std::vector<type> input_data
@@ -207,35 +192,32 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockRotate)
         std::vector<type> output_data(input_data);
 
         // Preparing device
-        type * device_input;
-        type * device_output;
+        type* device_input;
+        type* device_output;
 
         HIP_CHECK(hipMalloc(&device_input, input_data.size() * sizeof(type)));
         HIP_CHECK(hipMalloc(&device_output, input_data.size() * sizeof(type)));
 
-        HIP_CHECK(
-            hipMemcpy(
-                device_input, input_data.data(),
-                input_data.size() * sizeof(type),
-                hipMemcpyHostToDevice
-            )
-        );
+        HIP_CHECK(hipMemcpy(device_input,
+                            input_data.data(),
+                            input_data.size() * sizeof(type),
+                            hipMemcpyHostToDevice));
 
         // Running kernel
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(shuffle_rotate_kernel<block_size, type>),
-            dim3(grid_size), dim3(block_size), 0, 0,
-            device_input, device_output, distance
-        );
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(shuffle_rotate_kernel<block_size, type>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_input,
+                           device_output,
+                           distance);
 
         // Reading results back
-        HIP_CHECK(
-            hipMemcpy(
-                output_data.data(), device_output,
-                output_data.size() * sizeof(type),
-                hipMemcpyDeviceToHost
-            )
-        );
+        HIP_CHECK(hipMemcpy(output_data.data(),
+                            device_output,
+                            output_data.size() * sizeof(type),
+                            hipMemcpyDeviceToHost));
 
         // Calculate expected results on host
         for(size_t block_index = 0; block_index < grid_size; block_index++)
@@ -253,23 +235,19 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockRotate)
 
         HIP_CHECK(hipFree(device_input));
         HIP_CHECK(hipFree(device_output));
-
     }
-
 }
 
-template<
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    class T
->
+template<unsigned int BlockSize, unsigned int ItemsPerThread, class T>
 __global__
 __launch_bounds__(BlockSize)
-void shuffle_up_kernel(T (*device_input), T (*device_output))
+void shuffle_up_kernel(T(*device_input), T(*device_output))
 {
-    const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
-    hipcub::BlockShuffle<T,BlockSize> b_shuffle;
-    b_shuffle.template Up<ItemsPerThread>(reinterpret_cast<T(&)[ItemsPerThread]>(device_input[index*ItemsPerThread]),reinterpret_cast<T(&)[ItemsPerThread]>(device_output[index*ItemsPerThread]));
+    const unsigned int                 index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    hipcub::BlockShuffle<T, BlockSize> b_shuffle;
+    b_shuffle.template Up<ItemsPerThread>(
+        reinterpret_cast<T(&)[ItemsPerThread]>(device_input[index * ItemsPerThread]),
+        reinterpret_cast<T(&)[ItemsPerThread]>(device_output[index * ItemsPerThread]));
 }
 
 TYPED_TEST(HipcubBlockShuffleTests, BlockUp)
@@ -278,14 +256,15 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockUp)
     SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
     HIP_CHECK(hipSetDevice(device_id));
 
-    using type = typename TestFixture::type;
-    const size_t block_size = TestFixture::block_size;
-    const size_t size = block_size * 1134;
-    const size_t grid_size = size / block_size;
+    using type                            = typename TestFixture::type;
+    const size_t           block_size     = TestFixture::block_size;
+    const size_t           size           = block_size * 1134;
+    const size_t           grid_size      = size / block_size;
     constexpr unsigned int ItemsPerThread = 128;
-    for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
-        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
         SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
         // Generate data
         const int         min_value = std::is_unsigned<type>::value ? 0 : -100;
@@ -293,43 +272,35 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockUp)
             = test_utils::get_random_data<type>(ItemsPerThread * size, min_value, 100, seed_value);
         std::vector<type> output_data(input_data);
 
-        std::vector<type*>  arr_input(size);
+        std::vector<type*> arr_input(size);
         std::vector<type*> arr_output(size);
 
         // Preparing device
-        type * device_input;
-        type * device_output;
-
+        type* device_input;
+        type* device_output;
 
         HIP_CHECK(hipMalloc(&device_input, input_data.size() * sizeof(type)));
         HIP_CHECK(hipMalloc(&device_output, input_data.size() * sizeof(type)));
 
-
-
-        HIP_CHECK(
-            hipMemcpy(
-                device_input, input_data.data(),
-                input_data.size() * sizeof(type),
-                hipMemcpyHostToDevice
-            )
-        );
-
+        HIP_CHECK(hipMemcpy(device_input,
+                            input_data.data(),
+                            input_data.size() * sizeof(type),
+                            hipMemcpyHostToDevice));
 
         // Running kernel
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(shuffle_up_kernel<block_size, ItemsPerThread, type>),
-            dim3(grid_size), dim3(block_size), 0, 0,
-            device_input, device_output
-        );
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(shuffle_up_kernel<block_size, ItemsPerThread, type>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_input,
+                           device_output);
 
         // Reading results back
-        HIP_CHECK(
-            hipMemcpy(
-                output_data.data(), device_output,
-                output_data.size() * sizeof(type),
-                hipMemcpyDeviceToHost
-            )
-        );
+        HIP_CHECK(hipMemcpy(output_data.data(),
+                            device_output,
+                            output_data.size() * sizeof(type),
+                            hipMemcpyDeviceToHost));
 
         // Calculate expected results on host
         for(size_t block_index = 0; block_index < grid_size; block_index++)
@@ -352,23 +323,123 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockUp)
 
         HIP_CHECK(hipFree(device_input));
         HIP_CHECK(hipFree(device_output));
-
     }
+}
+
+template<unsigned int BlockSize, unsigned int ItemsPerThread, class T>
+__global__
+__launch_bounds__(BlockSize)
+void shuffle_up_with_suffix_kernel(T* device_input, T* device_output, T* device_suffix)
+{
+    const unsigned int                 index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    hipcub::BlockShuffle<T, BlockSize> b_shuffle;
+    b_shuffle.template Up<ItemsPerThread>(
+        reinterpret_cast<T(&)[ItemsPerThread]>(device_input[index * ItemsPerThread]),
+        reinterpret_cast<T(&)[ItemsPerThread]>(device_output[index * ItemsPerThread]),
+        device_suffix[blockIdx.x]);
+}
+
+TYPED_TEST(HipcubBlockShuffleTests, BlockUpWithSuffix)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using type                        = typename TestFixture::type;
+    constexpr size_t block_size       = TestFixture::block_size;
+    constexpr size_t items_per_thread = 128;
+    constexpr size_t items_per_block  = block_size * items_per_thread;
+    constexpr size_t grid_size        = 114;
+
+    const size_t size = items_per_block * grid_size;
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+        // Generate data
+        const double min_value = static_cast<double>(std::is_unsigned<type>::value ? 0 : -100);
+        const double max_value = static_cast<double>(std::is_unsigned<type>::value ? 200 : 100);
+
+        std::mt19937                           gen(seed_value);
+        std::uniform_real_distribution<double> dis(min_value, max_value);
+
+        type* host_input  = new type[size];
+        type* host_output = new type[size];
+
+        for(size_t i = 0; i < size; i++)
+            host_input[i] = static_cast<type>(dis(gen));
+
+        std::iota(host_input, host_input + size, static_cast<type>(0));
+
+        // Preparing device
+        type* device_input;
+        type* device_output;
+        type* device_suffix;
+
+        HIP_CHECK(hipMalloc(&device_input, size * sizeof(type)));
+        HIP_CHECK(hipMalloc(&device_output, size * sizeof(type)));
+        HIP_CHECK(hipMalloc(&device_suffix, grid_size * sizeof(type)));
+
+        HIP_CHECK(hipMemcpy(device_input, host_input, size * sizeof(type), hipMemcpyHostToDevice));
+
+        // Running kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(shuffle_up_with_suffix_kernel<block_size, items_per_thread, type>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_input,
+            device_output,
+            device_suffix);
+
+        // Reading results back
+        HIP_CHECK(
+            hipMemcpy(host_output, device_output, size * sizeof(type), hipMemcpyDeviceToHost));
+
+        type* host_block_suffix = new type[grid_size];
+
+        HIP_CHECK(hipMemcpy(host_block_suffix,
+                            device_suffix,
+                            sizeof(type) * grid_size,
+                            hipMemcpyDeviceToHost));
+
+        // Calculate expected results on host
+        for(size_t block_index = 0; block_index < grid_size; block_index++)
+        {
+            size_t suffix_index = (block_index * items_per_block) + (items_per_block - 1);
+            ASSERT_EQ(host_block_suffix[block_index], host_input[suffix_index]);
+            for(size_t thread_index = 0; thread_index < block_size; thread_index++)
+            {
+                size_t start_offset = (block_index * block_size + thread_index) * items_per_thread;
+                for(size_t item_index = 0; item_index < items_per_thread; item_index++)
+                {
+                    if(thread_index + item_index > 0)
+                        ASSERT_EQ(host_input[start_offset + item_index - 1],
+                                  host_output[start_offset + item_index]);
+                }
+            }
+        }
 
+        delete[] host_input;
+        delete[] host_output;
+        delete[] host_block_suffix;
+        HIP_CHECK(hipFree(device_input));
+        HIP_CHECK(hipFree(device_output));
+    }
 }
 
-template<
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    class T
->
+template<unsigned int BlockSize, unsigned int ItemsPerThread, class T>
 __global__
 __launch_bounds__(BlockSize)
-void shuffle_down_kernel(T (*device_input), T (*device_output))
+void shuffle_down_kernel(T(*device_input), T(*device_output))
 {
-    const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
-    hipcub::BlockShuffle<T,BlockSize> b_shuffle;
-    b_shuffle.template Down<ItemsPerThread>(reinterpret_cast<T(&)[ItemsPerThread]>(device_input[index*ItemsPerThread]),reinterpret_cast<T(&)[ItemsPerThread]>(device_output[index*ItemsPerThread]));
+    const unsigned int                 index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    hipcub::BlockShuffle<T, BlockSize> b_shuffle;
+    b_shuffle.template Down<ItemsPerThread>(
+        reinterpret_cast<T(&)[ItemsPerThread]>(device_input[index * ItemsPerThread]),
+        reinterpret_cast<T(&)[ItemsPerThread]>(device_output[index * ItemsPerThread]));
 }
 
 TYPED_TEST(HipcubBlockShuffleTests, BlockDown)
@@ -377,14 +448,15 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockDown)
     SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
     HIP_CHECK(hipSetDevice(device_id));
 
-    using type = typename TestFixture::type;
-    const size_t block_size = TestFixture::block_size;
-    const size_t size = block_size * 1134;
-    const size_t grid_size = size / block_size;
+    using type                            = typename TestFixture::type;
+    const size_t           block_size     = TestFixture::block_size;
+    const size_t           size           = block_size * 1134;
+    const size_t           grid_size      = size / block_size;
     constexpr unsigned int ItemsPerThread = 128;
-    for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
     {
-        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
         SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
 
         // Generate data
@@ -393,43 +465,35 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockDown)
             = test_utils::get_random_data<type>(ItemsPerThread * size, min_value, 100, seed_value);
         std::vector<type> output_data(input_data);
 
-        std::vector<type*>  arr_input(size);
+        std::vector<type*> arr_input(size);
         std::vector<type*> arr_output(size);
 
         // Preparing device
-        type * device_input;
-        type * device_output;
-
+        type* device_input;
+        type* device_output;
 
         HIP_CHECK(hipMalloc(&device_input, input_data.size() * sizeof(type)));
         HIP_CHECK(hipMalloc(&device_output, input_data.size() * sizeof(type)));
 
-
-
-        HIP_CHECK(
-            hipMemcpy(
-                device_input, input_data.data(),
-                input_data.size() * sizeof(type),
-                hipMemcpyHostToDevice
-            )
-        );
-
+        HIP_CHECK(hipMemcpy(device_input,
+                            input_data.data(),
+                            input_data.size() * sizeof(type),
+                            hipMemcpyHostToDevice));
 
         // Running kernel
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(shuffle_down_kernel<block_size, ItemsPerThread, type>),
-            dim3(grid_size), dim3(block_size), 0, 0,
-            device_input, device_output
-        );
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(shuffle_down_kernel<block_size, ItemsPerThread, type>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_input,
+                           device_output);
 
         // Reading results back
-        HIP_CHECK(
-            hipMemcpy(
-                output_data.data(), device_output,
-                output_data.size() * sizeof(type),
-                hipMemcpyDeviceToHost
-            )
-        );
+        HIP_CHECK(hipMemcpy(output_data.data(),
+                            device_output,
+                            output_data.size() * sizeof(type),
+                            hipMemcpyDeviceToHost));
 
         // Calculate expected results on host
         for(size_t block_index = 0; block_index < grid_size; block_index++)
@@ -452,7 +516,109 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockDown)
 
         HIP_CHECK(hipFree(device_input));
         HIP_CHECK(hipFree(device_output));
-
     }
+}
 
+template<unsigned int BlockSize, unsigned int ItemsPerThread, class T>
+__global__
+__launch_bounds__(BlockSize)
+void shuffle_down_with_prefix_kernel(T* device_input, T* device_output, T* device_prefix)
+{
+    const unsigned int                 index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
+    hipcub::BlockShuffle<T, BlockSize> b_shuffle;
+    b_shuffle.template Down<ItemsPerThread>(
+        reinterpret_cast<T(&)[ItemsPerThread]>(device_input[index * ItemsPerThread]),
+        reinterpret_cast<T(&)[ItemsPerThread]>(device_output[index * ItemsPerThread]),
+        device_prefix[blockIdx.x]);
 }
+
+TYPED_TEST(HipcubBlockShuffleTests, BlockDownWithSuffix)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using type                        = typename TestFixture::type;
+    constexpr size_t block_size       = TestFixture::block_size;
+    constexpr size_t items_per_thread = 128;
+    constexpr size_t items_per_block  = block_size * items_per_thread;
+    constexpr size_t grid_size        = 114;
+
+    const size_t size = items_per_block * grid_size;
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+        // Generate data
+        const double min_value = static_cast<double>(std::is_unsigned<type>::value ? 0 : -100);
+        const double max_value = static_cast<double>(std::is_unsigned<type>::value ? 200 : 100);
+
+        std::mt19937                           gen(seed_value);
+        std::uniform_real_distribution<double> dis(min_value, max_value);
+
+        type* host_input  = new type[size];
+        type* host_output = new type[size];
+
+        for(size_t i = 0; i < size; i++)
+            host_input[i] = static_cast<type>(dis(gen));
+
+        std::iota(host_input, host_input + size, static_cast<type>(0));
+
+        // Preparing device
+        type* device_input;
+        type* device_output;
+        type* device_prefix;
+
+        HIP_CHECK(hipMalloc(&device_input, size * sizeof(type)));
+        HIP_CHECK(hipMalloc(&device_output, size * sizeof(type)));
+        HIP_CHECK(hipMalloc(&device_prefix, grid_size * sizeof(type)));
+
+        HIP_CHECK(hipMemcpy(device_input, host_input, size * sizeof(type), hipMemcpyHostToDevice));
+
+        // Running kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(shuffle_down_with_prefix_kernel<block_size, items_per_thread, type>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_input,
+            device_output,
+            device_prefix);
+
+        // Reading results back
+        HIP_CHECK(
+            hipMemcpy(host_output, device_output, size * sizeof(type), hipMemcpyDeviceToHost));
+
+        type* host_block_prefix = new type[grid_size];
+
+        HIP_CHECK(hipMemcpy(host_block_prefix,
+                            device_prefix,
+                            sizeof(type) * grid_size,
+                            hipMemcpyDeviceToHost));
+
+        // Calculate expected results on host
+        for(size_t block_index = 0; block_index < grid_size; block_index++)
+        {
+            size_t prefix_index = (block_index * items_per_block);
+            ASSERT_EQ(host_block_prefix[block_index], host_input[prefix_index]);
+            for(size_t thread_index = 0; thread_index < block_size; thread_index++)
+            {
+                size_t start_offset = (block_index * block_size + thread_index) * items_per_thread;
+                for(size_t item_index = 0; item_index < items_per_thread; item_index++)
+                {
+                    if((thread_index != block_size - 1) && (item_index != items_per_thread - 1))
+                        ASSERT_EQ(host_input[start_offset + item_index + 1],
+                                  host_output[start_offset + item_index]);
+                }
+            }
+        }
+
+        delete[] host_input;
+        delete[] host_output;
+        delete[] host_block_prefix;
+        HIP_CHECK(hipFree(device_input));
+        HIP_CHECK(hipFree(device_output));
+    }
+}
\ No newline at end of file

From 58417d6d03f9540e06f6aa99202f0dbd4b69e566 Mon Sep 17 00:00:00 2001
From: Wayne Franz <wayfranz@amd.com>
Date: Thu, 31 Jul 2025 18:31:06 -0400
Subject: [PATCH 04/10] [hipCUB] Add missing include for test_block_radix_rank
 (#988)

This change adds a missing header include (`<numeric>`) in
`projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp` for the
call it makes to `std::exclusive_scan`.

Co-authored-by: Stanley Tsang <stanley.tsang@amd.com>
---
 projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp
index ad47a54ce3e..cca727699d2 100644
--- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp
+++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp
@@ -38,6 +38,7 @@
 #include "hipcub/util_type.hpp"
 
 #include <bitset>
+#include <numeric>
 
 template<class Key,
          unsigned int BlockSize,
@@ -779,4 +780,4 @@ TYPED_TEST(HipcubBlockRadixRank, BlockRadixRankMemoizeWithPrefixSumOutput)
 TYPED_TEST(HipcubBlockRadixRank, BlockRadixRankMatchWithPrefixSumOutput)
 {
     test_radix_rank_with_prefix_sum_output<TestFixture, RadixRankAlgorithm::RADIX_RANK_MATCH>();
-}
\ No newline at end of file
+}

From 19f99d45afebc47c4db8d41819e38641fd170708 Mon Sep 17 00:00:00 2001
From: NguyenNhuDi <zee.nguyen@amd.com>
Date: Tue, 5 Aug 2025 08:34:05 -0700
Subject: [PATCH 05/10] nuiked .jenkins to fix mci

---
 projects/hipcub/.jenkins/common.groovy        | 79 -----------------
 projects/hipcub/.jenkins/precheckin.groovy    | 84 -------------------
 .../hipcub/.jenkins/staticanalysis.groovy     | 64 --------------
 projects/hipcub/.jenkins/staticlibrary.groovy | 82 ------------------
 4 files changed, 309 deletions(-)
 delete mode 100644 projects/hipcub/.jenkins/common.groovy
 delete mode 100644 projects/hipcub/.jenkins/precheckin.groovy
 delete mode 100644 projects/hipcub/.jenkins/staticanalysis.groovy
 delete mode 100644 projects/hipcub/.jenkins/staticlibrary.groovy

diff --git a/projects/hipcub/.jenkins/common.groovy b/projects/hipcub/.jenkins/common.groovy
deleted file mode 100644
index ae8f7978256..00000000000
--- a/projects/hipcub/.jenkins/common.groovy
+++ /dev/null
@@ -1,79 +0,0 @@
-// This file is for internal AMD use.
-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
-
-def runCompileCommand(platform, project, jobName, boolean debug=false, boolean sameOrg=true)
-{
-    project.paths.construct_build_prefix()
-
-    String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release'
-    String buildTypeDir = debug ? 'debug' : 'release'
-    String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'
-    //Set CI node's gfx arch as target if PR, otherwise use default targets of the library
-    String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
-
-    def getRocPRIM = auxiliary.getLibrary('rocPRIM', platform.jenkinsLabel, null, sameOrg)
-
-    def command = """#!/usr/bin/env bash
-                set -x
-                ${getRocPRIM}
-                cd ${project.paths.project_build_prefix}
-                mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir}
-                ${auxiliary.gfxTargetParser()}
-                ${cmake} --toolchain=toolchain-linux.cmake ${buildTypeArg} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../..
-                make -j\$(nproc)
-                """
-
-    platform.runCommand(this, command)
-}
-
-
-def runTestCommand (platform, project, boolean rocmExamples=false)
-{
-    String sudo = auxiliary.sudo(platform.jenkinsLabel)
-
-    def testCommand = "ctest --output-on-failure --verbose --timeout 900"
-    def command = """#!/usr/bin/env bash
-                set -x
-                cd ${project.paths.project_build_prefix}
-                cd ${project.testDirectory}
-                ${sudo} LD_LIBRARY_PATH=/opt/rocm/lib ${testCommand}
-            """
-
-    platform.runCommand(this, command)
-    if (rocmExamples){
-        String buildString = ""
-        if (platform.os.contains("ubuntu")){
-            buildString += "sudo dpkg -i *.deb"
-        }
-        else {
-            buildString += "sudo rpm -i *.rpm"
-        }
-        testCommand = """#!/usr/bin/env bash
-                    set -ex
-                    cd ${project.paths.project_build_prefix}/build/release/package
-                    ${buildString}
-                    cd ../../..
-                    testDirs=("Libraries/hipCUB")
-                    git clone https://github.com/ROCm/rocm-examples.git
-                    rocm_examples_dir=\$(readlink -f rocm-examples)
-                    for testDir in \${testDirs[@]}; do
-                        cd \${rocm_examples_dir}/\${testDir}
-                        cmake -S . -B build
-                        cmake --build build
-                        cd ./build
-                        ctest --output-on-failure
-                    done
-                """
-        platform.runCommand(this, testCommand, "ROCM Examples")  
-    }
-}
-
-def runPackageCommand(platform, project)
-{
-    def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release")
-
-    platform.runCommand(this, packageHelper[0])
-        platform.archiveArtifacts(this, packageHelper[1])
-}
-
-return this
diff --git a/projects/hipcub/.jenkins/precheckin.groovy b/projects/hipcub/.jenkins/precheckin.groovy
deleted file mode 100644
index 70f2bb54a76..00000000000
--- a/projects/hipcub/.jenkins/precheckin.groovy
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/usr/bin/env groovy
-// This shared library is available at https://github.com/ROCm/rocJENKINS/
-@Library('rocJenkins@pong') _
-
-// This file is for internal AMD use.
-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
-
-import com.amd.project.*
-import com.amd.docker.*
-import java.nio.file.Path;
-
-def runCI = 
-{
-    nodeDetails, jobName->
-
-    def prj = new rocProject('hipCUB', 'PreCheckin')
-    prj.timeout.compile = 400
-    // Define test architectures, optional rocm version argument is available
-    def nodes = new dockerNodes(nodeDetails, jobName, prj)
-
-    boolean formatCheck = false
-
-    def commonGroovy
-
-    def compileCommand =
-    {
-        platform, project->
-
-        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
-        commonGroovy.runCompileCommand(platform, project, jobName)
-    }
-
-    def testCommand =
-    {
-        platform, project->
-
-        commonGroovy.runTestCommand(platform, project, true)
-    }
-
-    def packageCommand =
-    {
-        platform, project->
-        
-        commonGroovy.runPackageCommand(platform, project)
-    }
-
-    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
-}
-
-ci: { 
-    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
-
-    def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
-    propertyList = auxiliary.appendPropertyList(propertyList)
-
-    def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])]
-    jobNameList = auxiliary.appendJobNameList(jobNameList)
-
-    auxiliary.registerDependencyBranchParameter(["rocPRIM"])
-    
-    propertyList.each 
-    {
-        jobName, property->
-        if (urlJobName == jobName)
-            properties(auxiliary.addCommonProperties(property))
-    }
-
-    Set seenJobNames = []
-    jobNameList.each 
-    {
-        jobName, nodeDetails->
-        seenJobNames.add(jobName)
-        if (urlJobName == jobName)
-            runCI(nodeDetails, jobName)
-    }
-
-    // For url job names that are outside of the standardJobNameSet i.e. compute-rocm-dkms-no-npi-1901
-    if(!seenJobNames.contains(urlJobName))
-    {
-        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
-        runCI([ubuntu16:['gfx906']], urlJobName)       
-    }
-}
-
diff --git a/projects/hipcub/.jenkins/staticanalysis.groovy b/projects/hipcub/.jenkins/staticanalysis.groovy
deleted file mode 100644
index 4e2237ab1a8..00000000000
--- a/projects/hipcub/.jenkins/staticanalysis.groovy
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/env groovy
-// This shared library is available at https://github.com/ROCm/rocJENKINS/
-@Library('rocJenkins@pong') _
-
-// This is file for internal AMD use.
-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
-
-import com.amd.project.*
-import com.amd.docker.*
-import java.nio.file.Path
-
-def runCompileCommand(platform, project, jobName, boolean debug=false)
-{
-    project.paths.construct_build_prefix()
-}
-
-def runCI = 
-{
-    nodeDetails, jobName->
-
-    def prj  = new rocProject('hipCUB', 'StaticAnalysis')
-
-    // Define test architectures, optional rocm version argument is available
-    def nodes = new dockerNodes(nodeDetails, jobName, prj)
-
-    boolean formatCheck = false
-    boolean staticAnalysis = true
-
-    def compileCommand =
-    {
-        platform, project->
-
-        runCompileCommand(platform, project, jobName, false)
-    }
-
-    buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis)
-}
-
-ci: {
-    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
-
-    def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
-                        "rocm-docker":[]]
-    propertyList = auxiliary.appendPropertyList(propertyList)
-
-    def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":[]]
-    jobNameList = auxiliary.appendJobNameList(jobNameList)
-
-    propertyList.each
-    {
-        jobName, property->
-        if (urlJobName == jobName)
-            properties(auxiliary.addCommonProperties(property))
-    }
-
-    jobNameList.each
-    {
-        jobName, nodeDetails->
-        if (urlJobName == jobName)
-            stage(jobName) {
-                runCI(nodeDetails, jobName)
-            }
-    }
-}
diff --git a/projects/hipcub/.jenkins/staticlibrary.groovy b/projects/hipcub/.jenkins/staticlibrary.groovy
deleted file mode 100644
index 549913d8cbb..00000000000
--- a/projects/hipcub/.jenkins/staticlibrary.groovy
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/usr/bin/env groovy
-@Library('rocJenkins@pong') _
-import com.amd.project.*
-import com.amd.docker.*
-import java.nio.file.Path;
-
-def runCI =
-{
-    nodeDetails, jobName->
-    
-    def prj = new rocProject('hipCUB', 'Static Library PreCheckin')
-
-    def nodes = new dockerNodes(nodeDetails, jobName, prj)
-
-    def commonGroovy
-
-    boolean formatCheck = false
-     
-    def compileCommand =
-    {
-        platform, project->
-
-        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
-        commonGroovy.runCompileCommand(platform, project, jobName, false, true)
-    }
-
-    
-    def testCommand =
-    {
-        platform, project->
-
-        commonGroovy.runTestCommand(platform, project)
-    }
-
-    def packageCommand =
-    {
-        platform, project->
-
-        commonGroovy.runPackageCommand(platform, project)
-    }
-
-    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
-}
-
-ci: { 
-    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
-
-    def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], 
-                        "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
-                        "rocm-docker":[]]
-    propertyList = auxiliary.appendPropertyList(propertyList)
-
-    def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]), 
-                       "compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]), 
-                       "rocm-docker":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']])]
-    jobNameList = auxiliary.appendJobNameList(jobNameList)
-
-    propertyList.each 
-    {
-        jobName, property->
-        if (urlJobName == jobName)
-            properties(auxiliary.addCommonProperties(property))
-    }
-
-    jobNameList.each
-    {
-        jobName, nodeDetails->
-        if (urlJobName == jobName)
-            stage(jobName) {
-                runCI(nodeDetails, jobName)
-            }
-    }
-
-    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
-    if(!jobNameList.keySet().contains(urlJobName))
-    {
-        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
-        stage(urlJobName) {
-            runCI([ubuntu16:['gfx906']], urlJobName)
-        }
-    }
-}

From 32ae8082fecdf682e6d0c073fba5bd185f6042b3 Mon Sep 17 00:00:00 2001
From: NguyenNhuDi <zee.nguyen@amd.com>
Date: Wed, 6 Aug 2025 12:07:11 -0700
Subject: [PATCH 06/10]  implemented fall back implementation for
 std::exclusive scane  for gcc < 9 (debian10)

---
 .../hipcub/test_hipcub_block_radix_rank.cpp   | 32 ++++++++++++++++---
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp
index cca727699d2..14748a17ff2 100644
--- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp
+++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp
@@ -605,6 +605,21 @@ void rank_with_prefix_sum_kernel(const KeyType* keys_input,
     }
 }
 
+#if defined(_GLIBCXX_RELEASE) && (GLIBCXX_RELEASE < 9)
+
+template <typename It, typename OutIt, typename T>
+void exclusive_scan(It first, It last, OutIt out, T init)
+{
+    // Fallback implementation for exclusive scan if gcc version is < 9
+    for (; first != last; ++first)
+    {
+        *out++ = init;
+        init += *first;
+    }
+}
+
+#endif
+
 template<typename TestFixture, RadixRankAlgorithm Algorithm>
 void test_radix_rank_with_prefix_sum_output()
 {
@@ -703,10 +718,19 @@ void test_radix_rank_with_prefix_sum_output()
 
                     ++histogram[bit_rep];
                 }
-                std::exclusive_scan(histogram.begin(),
-                                    histogram.end(),
-                                    pfs_expected.begin() + pfs_offset,
-                                    0);
+
+                #if defined(_GLIBCXX_RELEASE) && (GLIBCXX_RELEASE >= 9)
+                    std::exclusive_scan(histogram.begin(),
+                                        histogram.end(),
+                                        pfs_expected.begin() + pfs_offset,
+                                        0);
+                #else
+                    exclusive_scan(histogram.begin(),
+                                        histogram.end(),
+                                        pfs_expected.begin() + pfs_offset,
+                                        0);
+                #endif
+
             }
 
             // Preparing device

From eb079869ae1692fafbbc11808a53f90139249f51 Mon Sep 17 00:00:00 2001
From: NguyenNhuDi <zee.nguyen@amd.com>
Date: Wed, 6 Aug 2025 12:14:15 -0700
Subject: [PATCH 07/10] brought back rocprim .jenkins (these should get deleted
 in rocprim PR not hipcub)

---
 projects/rocprim/.jenkins/common.groovy     | 105 ++++++++++++++++++++
 projects/rocprim/.jenkins/precheckin.groovy |  81 +++++++++++++++
 projects/rocprim/.jenkins/static.groovy     |  82 +++++++++++++++
 3 files changed, 268 insertions(+)
 create mode 100644 projects/rocprim/.jenkins/common.groovy
 create mode 100644 projects/rocprim/.jenkins/precheckin.groovy
 create mode 100644 projects/rocprim/.jenkins/static.groovy

diff --git a/projects/rocprim/.jenkins/common.groovy b/projects/rocprim/.jenkins/common.groovy
new file mode 100644
index 00000000000..0ffd1dee600
--- /dev/null
+++ b/projects/rocprim/.jenkins/common.groovy
@@ -0,0 +1,105 @@
+// This file is for internal AMD use.
+// If you are interested in running your own Jenkins, please raise a github issue for assistance.
+
+def runCompileCommand(platform, project, jobName, boolean debug=false, boolean staticLibrary=false)
+{
+    project.paths.construct_build_prefix()
+
+    String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release'
+    String buildStatic = staticLibrary ? '-DBUILD_SHARED_LIBS=OFF' : '-DBUILD_SHARED_LIBS=ON'
+    String buildTypeDir = debug ? 'debug' : 'release'
+    String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'
+    //Set CI node's gfx arch as target if PR, otherwise use default targets of the library
+    String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
+
+    def command = """#!/usr/bin/env bash
+                set -x
+                cd ${project.paths.project_build_prefix}
+                mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir}
+                ${auxiliary.gfxTargetParser()}
+                ${cmake} --toolchain=toolchain-linux.cmake ${buildTypeArg} ${buildStatic} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../..
+                make -j\$(nproc)
+                """
+
+    platform.runCommand(this, command)
+}
+
+
+def runTestCommand (platform, project, boolean rocmExamples=false)
+{
+    String sudo = auxiliary.sudo(platform.jenkinsLabel)
+
+    def testCommand = "ctest --output-on-failure "
+    def testCommandExcludeRegex = /(rocprim.block_histogram)/
+    def testCommandExclude = "--exclude-regex \"${testCommandExcludeRegex}\""
+    def hmmExcludeRegex = ''
+    def hmmTestCommandExclude = "--exclude-regex \"${hmmExcludeRegex}\""
+    def hmmTestCommand = ''
+    if (platform.jenkinsLabel.contains('gfx90a'))
+    {
+        echo("HMM TESTS DISABLED")
+        /*hmmTestCommand = """
+                            export HSA_XNACK=1
+                            export ROCPRIM_USE_HMM=1
+                            ${testCommand} ${hmmTestCommandExclude}
+                         """*/
+    }
+    echo(env.JOB_NAME)
+    if (env.JOB_NAME.contains('bleeding-edge'))
+    {
+        testCommand = ''
+        testCommandExclude = ''
+        hmmTestCommand = ''
+        echo("TESTS DISABLED")
+    }
+    def command = """#!/usr/bin/env bash
+                set -x
+                cd ${project.paths.project_build_prefix}
+                cd ${project.testDirectory}
+                ${testCommand} ${testCommandExclude}
+                if (( \$? != 0 )); then
+                    exit 1
+                fi
+                ${hmmTestCommand}
+            """
+    platform.runCommand(this, command)
+    //ROCM Examples
+    if (rocmExamples){
+        String buildString = ""
+        if (platform.os.contains("ubuntu")){
+            buildString += "sudo dpkg -i *.deb"
+        }
+        else {
+            buildString += "sudo rpm -i *.rpm"
+        }
+        testCommand = """#!/usr/bin/env bash
+                    set -ex
+                    cd ${project.paths.project_build_prefix}/build/release/package
+                    ls
+                    ${buildString}
+                    cd ../../..
+                    testDirs=("Libraries/rocPRIM")
+                    git clone https://github.com/ROCm/rocm-examples.git
+                    rocm_examples_dir=\$(readlink -f rocm-examples)
+                    for testDir in \${testDirs[@]}; do
+                        cd \${rocm_examples_dir}/\${testDir}
+                        cmake -S . -B build
+                        cmake --build build
+                        cd ./build
+                        ctest --output-on-failure
+                    done
+                """
+        platform.runCommand(this, testCommand, "ROCM Examples")  
+
+    }
+}
+
+def runPackageCommand(platform, project)
+{
+    def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release")
+
+    platform.runCommand(this, packageHelper[0])
+        platform.archiveArtifacts(this, packageHelper[1])
+}
+
+return this
diff --git a/projects/rocprim/.jenkins/precheckin.groovy b/projects/rocprim/.jenkins/precheckin.groovy
new file mode 100644
index 00000000000..bbb8274743c
--- /dev/null
+++ b/projects/rocprim/.jenkins/precheckin.groovy
@@ -0,0 +1,81 @@
+#!/usr/bin/env groovy
+@Library('rocJenkins@pong') _
+import com.amd.project.*
+import com.amd.docker.*
+import java.nio.file.Path;
+
+def runCI = 
+{
+    nodeDetails, jobName->
+
+    def prj = new rocProject('rocPRIM', 'PreCheckin')
+    prj.paths.build_command = './install -c'
+    prj.timeout.compile = 600
+
+    def nodes = new dockerNodes(nodeDetails, jobName, prj)
+
+    def commonGroovy
+
+    boolean formatCheck = false
+     
+    def compileCommand =
+    {
+        platform, project->
+
+        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
+        commonGroovy.runCompileCommand(platform, project, jobName)
+    }
+
+    def testCommand =
+    {
+        platform, project->
+
+        commonGroovy.runTestCommand(platform, project, true)
+    }
+
+    def packageCommand =
+    {
+        platform, project->
+
+        commonGroovy.runPackageCommand(platform, project)
+    }
+
+    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
+}
+
+ci: { 
+    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
+
+    def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], 
+                        "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
+                        "rocm-docker":[]]
+    propertyList = auxiliary.appendPropertyList(propertyList)
+
+    def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])]
+    jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM')
+
+    propertyList.each 
+    {
+        jobName, property->
+        if (urlJobName == jobName)
+            properties(auxiliary.addCommonProperties(property))
+    }
+    
+    jobNameList.each 
+    {
+        jobName, nodeDetails->
+        if (urlJobName == jobName)
+            stage(jobName) {
+                runCI(nodeDetails, jobName)
+            }
+    }
+
+    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
+    if(!jobNameList.keySet().contains(urlJobName))
+    {
+        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
+        stage(urlJobName) {
+            runCI([ubuntu16:['gfx906']], urlJobName)
+        }
+    }
+}
diff --git a/projects/rocprim/.jenkins/static.groovy b/projects/rocprim/.jenkins/static.groovy
new file mode 100644
index 00000000000..75606419fdf
--- /dev/null
+++ b/projects/rocprim/.jenkins/static.groovy
@@ -0,0 +1,82 @@
+#!/usr/bin/env groovy
+@Library('rocJenkins@pong') _
+import com.amd.project.*
+import com.amd.docker.*
+import java.nio.file.Path;
+
+def runCI = 
+{
+    nodeDetails, jobName->
+
+    def prj = new rocProject('rocPRIM', 'static')
+    prj.paths.build_command = './install -c -s'
+    prj.timeout.compile = 600
+    prj.timeout.packaging = 120
+
+    def nodes = new dockerNodes(nodeDetails, jobName, prj)
+
+    def commonGroovy
+
+    boolean formatCheck = false
+     
+    def compileCommand =
+    {
+        platform, project->
+
+        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
+        commonGroovy.runCompileCommand(platform, project, jobName, debug=false, staticLibrary=true)
+    }
+
+    def testCommand =
+    {
+        platform, project->
+
+        commonGroovy.runTestCommand(platform, project)
+    }
+
+    def packageCommand =
+    {
+        platform, project->
+
+        commonGroovy.runPackageCommand(platform, project)
+    }
+
+    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
+}
+
+ci: { 
+    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
+
+    def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], 
+                        "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
+                        "rocm-docker":[]]
+    propertyList = auxiliary.appendPropertyList(propertyList)
+
+    def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])]
+    jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM')
+
+    propertyList.each 
+    {
+        jobName, property->
+        if (urlJobName == jobName)
+            properties(auxiliary.addCommonProperties(property))
+    }
+    
+    jobNameList.each 
+    {
+        jobName, nodeDetails->
+        if (urlJobName == jobName)
+            stage(jobName) {
+                runCI(nodeDetails, jobName)
+            }
+    }
+
+    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
+    if(!jobNameList.keySet().contains(urlJobName))
+    {
+        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
+        stage(urlJobName) {
+            runCI([ubuntu16:['gfx906']], urlJobName)
+        }
+    }
+}

From 4128a344e5098b5a498cea47491b155fb8ff44d6 Mon Sep 17 00:00:00 2001
From: NguyenNhuDi <zee.nguyen@amd.com>
Date: Wed, 6 Aug 2025 12:14:56 -0700
Subject: [PATCH 08/10] removed .jenkins again (to accomodate the rocprim fix)

---
 projects/rocprim/.jenkins/common.groovy     | 105 --------------------
 projects/rocprim/.jenkins/precheckin.groovy |  81 ---------------
 projects/rocprim/.jenkins/static.groovy     |  82 ---------------
 3 files changed, 268 deletions(-)
 delete mode 100644 projects/rocprim/.jenkins/common.groovy
 delete mode 100644 projects/rocprim/.jenkins/precheckin.groovy
 delete mode 100644 projects/rocprim/.jenkins/static.groovy

diff --git a/projects/rocprim/.jenkins/common.groovy b/projects/rocprim/.jenkins/common.groovy
deleted file mode 100644
index 0ffd1dee600..00000000000
--- a/projects/rocprim/.jenkins/common.groovy
+++ /dev/null
@@ -1,105 +0,0 @@
-// This file is for internal AMD use.
-// If you are interested in running your own Jenkins, please raise a github issue for assistance.
-
-def runCompileCommand(platform, project, jobName, boolean debug=false, boolean staticLibrary=false)
-{
-    project.paths.construct_build_prefix()
-
-    String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release'
-    String buildStatic = staticLibrary ? '-DBUILD_SHARED_LIBS=OFF' : '-DBUILD_SHARED_LIBS=ON'
-    String buildTypeDir = debug ? 'debug' : 'release'
-    String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'
-    //Set CI node's gfx arch as target if PR, otherwise use default targets of the library
-    String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
-
-    def command = """#!/usr/bin/env bash
-                set -x
-                cd ${project.paths.project_build_prefix}
-                mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir}
-                ${auxiliary.gfxTargetParser()}
-                ${cmake} --toolchain=toolchain-linux.cmake ${buildTypeArg} ${buildStatic} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../..
-                make -j\$(nproc)
-                """
-
-    platform.runCommand(this, command)
-}
-
-
-def runTestCommand (platform, project, boolean rocmExamples=false)
-{
-    String sudo = auxiliary.sudo(platform.jenkinsLabel)
-
-    def testCommand = "ctest --output-on-failure "
-    def testCommandExcludeRegex = /(rocprim.block_histogram)/
-    def testCommandExclude = "--exclude-regex \"${testCommandExcludeRegex}\""
-    def hmmExcludeRegex = ''
-    def hmmTestCommandExclude = "--exclude-regex \"${hmmExcludeRegex}\""
-    def hmmTestCommand = ''
-    if (platform.jenkinsLabel.contains('gfx90a'))
-    {
-        echo("HMM TESTS DISABLED")
-        /*hmmTestCommand = """
-                            export HSA_XNACK=1
-                            export ROCPRIM_USE_HMM=1
-                            ${testCommand} ${hmmTestCommandExclude}
-                         """*/
-    }
-    echo(env.JOB_NAME)
-    if (env.JOB_NAME.contains('bleeding-edge'))
-    {
-        testCommand = ''
-        testCommandExclude = ''
-        hmmTestCommand = ''
-        echo("TESTS DISABLED")
-    }
-    def command = """#!/usr/bin/env bash
-                set -x
-                cd ${project.paths.project_build_prefix}
-                cd ${project.testDirectory}
-                ${testCommand} ${testCommandExclude}
-                if (( \$? != 0 )); then
-                    exit 1
-                fi
-                ${hmmTestCommand}
-            """
-    platform.runCommand(this, command)
-    //ROCM Examples
-    if (rocmExamples){
-        String buildString = ""
-        if (platform.os.contains("ubuntu")){
-            buildString += "sudo dpkg -i *.deb"
-        }
-        else {
-            buildString += "sudo rpm -i *.rpm"
-        }
-        testCommand = """#!/usr/bin/env bash
-                    set -ex
-                    cd ${project.paths.project_build_prefix}/build/release/package
-                    ls
-                    ${buildString}
-                    cd ../../..
-                    testDirs=("Libraries/rocPRIM")
-                    git clone https://github.com/ROCm/rocm-examples.git
-                    rocm_examples_dir=\$(readlink -f rocm-examples)
-                    for testDir in \${testDirs[@]}; do
-                        cd \${rocm_examples_dir}/\${testDir}
-                        cmake -S . -B build
-                        cmake --build build
-                        cd ./build
-                        ctest --output-on-failure
-                    done
-                """
-        platform.runCommand(this, testCommand, "ROCM Examples")  
-
-    }
-}
-
-def runPackageCommand(platform, project)
-{
-    def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release")
-
-    platform.runCommand(this, packageHelper[0])
-        platform.archiveArtifacts(this, packageHelper[1])
-}
-
-return this
diff --git a/projects/rocprim/.jenkins/precheckin.groovy b/projects/rocprim/.jenkins/precheckin.groovy
deleted file mode 100644
index bbb8274743c..00000000000
--- a/projects/rocprim/.jenkins/precheckin.groovy
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/usr/bin/env groovy
-@Library('rocJenkins@pong') _
-import com.amd.project.*
-import com.amd.docker.*
-import java.nio.file.Path;
-
-def runCI = 
-{
-    nodeDetails, jobName->
-
-    def prj = new rocProject('rocPRIM', 'PreCheckin')
-    prj.paths.build_command = './install -c'
-    prj.timeout.compile = 600
-
-    def nodes = new dockerNodes(nodeDetails, jobName, prj)
-
-    def commonGroovy
-
-    boolean formatCheck = false
-     
-    def compileCommand =
-    {
-        platform, project->
-
-        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
-        commonGroovy.runCompileCommand(platform, project, jobName)
-    }
-
-    def testCommand =
-    {
-        platform, project->
-
-        commonGroovy.runTestCommand(platform, project, true)
-    }
-
-    def packageCommand =
-    {
-        platform, project->
-
-        commonGroovy.runPackageCommand(platform, project)
-    }
-
-    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
-}
-
-ci: { 
-    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
-
-    def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], 
-                        "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
-                        "rocm-docker":[]]
-    propertyList = auxiliary.appendPropertyList(propertyList)
-
-    def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])]
-    jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM')
-
-    propertyList.each 
-    {
-        jobName, property->
-        if (urlJobName == jobName)
-            properties(auxiliary.addCommonProperties(property))
-    }
-    
-    jobNameList.each 
-    {
-        jobName, nodeDetails->
-        if (urlJobName == jobName)
-            stage(jobName) {
-                runCI(nodeDetails, jobName)
-            }
-    }
-
-    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
-    if(!jobNameList.keySet().contains(urlJobName))
-    {
-        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
-        stage(urlJobName) {
-            runCI([ubuntu16:['gfx906']], urlJobName)
-        }
-    }
-}
diff --git a/projects/rocprim/.jenkins/static.groovy b/projects/rocprim/.jenkins/static.groovy
deleted file mode 100644
index 75606419fdf..00000000000
--- a/projects/rocprim/.jenkins/static.groovy
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/usr/bin/env groovy
-@Library('rocJenkins@pong') _
-import com.amd.project.*
-import com.amd.docker.*
-import java.nio.file.Path;
-
-def runCI = 
-{
-    nodeDetails, jobName->
-
-    def prj = new rocProject('rocPRIM', 'static')
-    prj.paths.build_command = './install -c -s'
-    prj.timeout.compile = 600
-    prj.timeout.packaging = 120
-
-    def nodes = new dockerNodes(nodeDetails, jobName, prj)
-
-    def commonGroovy
-
-    boolean formatCheck = false
-     
-    def compileCommand =
-    {
-        platform, project->
-
-        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
-        commonGroovy.runCompileCommand(platform, project, jobName, debug=false, staticLibrary=true)
-    }
-
-    def testCommand =
-    {
-        platform, project->
-
-        commonGroovy.runTestCommand(platform, project)
-    }
-
-    def packageCommand =
-    {
-        platform, project->
-
-        commonGroovy.runPackageCommand(platform, project)
-    }
-
-    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
-}
-
-ci: { 
-    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
-
-    def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], 
-                        "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
-                        "rocm-docker":[]]
-    propertyList = auxiliary.appendPropertyList(propertyList)
-
-    def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])]
-    jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM')
-
-    propertyList.each 
-    {
-        jobName, property->
-        if (urlJobName == jobName)
-            properties(auxiliary.addCommonProperties(property))
-    }
-    
-    jobNameList.each 
-    {
-        jobName, nodeDetails->
-        if (urlJobName == jobName)
-            stage(jobName) {
-                runCI(nodeDetails, jobName)
-            }
-    }
-
-    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
-    if(!jobNameList.keySet().contains(urlJobName))
-    {
-        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
-        stage(urlJobName) {
-            runCI([ubuntu16:['gfx906']], urlJobName)
-        }
-    }
-}

From 53124d95a28390be4ced80e8cfbe56d106d7fbec Mon Sep 17 00:00:00 2001
From: NguyenNhuDi <zee.nguyen@amd.com>
Date: Wed, 6 Aug 2025 22:04:56 +0000
Subject: [PATCH 09/10] changed fall back exclusive_scan name to be less
 ambiguous

---
 projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp
index 14748a17ff2..bb018014a88 100644
--- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp
+++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp
@@ -608,7 +608,7 @@ void rank_with_prefix_sum_kernel(const KeyType* keys_input,
 #if defined(_GLIBCXX_RELEASE) && (GLIBCXX_RELEASE < 9)
 
 template <typename It, typename OutIt, typename T>
-void exclusive_scan(It first, It last, OutIt out, T init)
+void fall_back_exclusive_scan(It first, It last, OutIt out, T init)
 {
     // Fallback implementation for exclusive scan if gcc version is < 9
     for (; first != last; ++first)
@@ -725,7 +725,7 @@ void test_radix_rank_with_prefix_sum_output()
                                         pfs_expected.begin() + pfs_offset,
                                         0);
                 #else
-                    exclusive_scan(histogram.begin(),
+                    fall_back_exclusive_scan(histogram.begin(),
                                         histogram.end(),
                                         pfs_expected.begin() + pfs_offset,
                                         0);

From b6184c0fff30d18df5707f0eda46c9a6ccedd0a2 Mon Sep 17 00:00:00 2001
From: NguyenNhuDi <zee.nguyen@amd.com>
Date: Wed, 6 Aug 2025 22:21:25 +0000
Subject: [PATCH 10/10] commented on name change

---
 projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp
index bb018014a88..67575363526 100644
--- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp
+++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp
@@ -607,6 +607,10 @@ void rank_with_prefix_sum_kernel(const KeyType* keys_input,
 
 #if defined(_GLIBCXX_RELEASE) && (GLIBCXX_RELEASE < 9)
 
+/**
+ * name this function fall_back_exclusive_scan to prevent
+ * ambiguous name error 
+ */
 template <typename It, typename OutIt, typename T>
 void fall_back_exclusive_scan(It first, It last, OutIt out, T init)
 {