diff --git a/CHANGELOG.md b/CHANGELOG.md
index f405848e9..bfa0aa7c3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -40,6 +40,12 @@ This is a complete list of affected functions and how their default accumulator
 * Added the `rocprim::merge_inplace` function for merging in-place.
 * Added initial value support for warp- and block-level inclusive scan.
 * Added support for building tests with device-side random data generation, making them finish faster. This requires rocRAND, and is enabled with the `WITH_ROCRAND=ON` build flag.
+* Added additional unit tests for `test_block_load.hpp`
+* Added additional unit tests for `test_block_rank.hpp`
+* Added additional unit tests for `test_block_scan.hpp`
+* Added additional unit tests for `test_block_sort.hpp`
+* Added additional unit tests for `test_block_store.hpp`
+* Added missing `rank_keys_desc` with `digit_extractor` parameter for `block_radix_rank_match.hpp`
 
 ### Changed
 
diff --git a/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp b/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp
index 7f3b9da6c..cf621d00c 100644
--- a/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp
+++ b/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp
@@ -271,6 +271,19 @@ class block_radix_rank_match
         rank_keys_impl<true>(keys, ranks, storage.get(), begin_bit, pass_bits);
     }
 
+    template<typename Key, unsigned ItemsPerThread, typename DigitExtractor>
+    ROCPRIM_DEVICE void rank_keys_desc(const Key (&keys)[ItemsPerThread],
+                                  unsigned int (&ranks)[ItemsPerThread],
+                                  storage_type&  storage,
+                                  DigitExtractor digit_extractor)
+    {
+        rank_keys_impl(keys, ranks, storage.get(),
+            [&digit_extractor](const Key & key){
+                const unsigned int digit = digit_extractor(key);
+                return radix_digits - 1 - digit;
+            });
+    }
+
     template<typename Key, unsigned ItemsPerThread, typename DigitExtractor>
     ROCPRIM_DEVICE void rank_keys(const Key (&keys)[ItemsPerThread],
                                   unsigned int (&ranks)[ItemsPerThread],
diff --git a/test/rocprim/test_block_load_store.hpp b/test/rocprim/test_block_load_store.hpp
index 382a8061f..bafb9b95b 100644
--- a/test/rocprim/test_block_load_store.hpp
+++ b/test/rocprim/test_block_load_store.hpp
@@ -192,6 +192,179 @@ typed_test_def(suite_name, name_suffix, LoadStoreClassValid)
 
 }
 
+typed_test_def(suite_name, name_suffix, LoadStoreClassWithStorage)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id = " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using Type = typename TestFixture::params::type;
+    static constexpr size_t block_size = TestFixture::params::block_size;
+    static constexpr rocprim::block_load_method load_method = TestFixture::params::load_method;
+    static constexpr rocprim::block_store_method store_method = TestFixture::params::store_method;
+    static constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    static constexpr auto items_per_block = block_size * items_per_thread;
+    const size_t size = items_per_block * 113;
+    const auto grid_size = size / items_per_block;
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size() || (block_size & (block_size - 1)) != 0)
+    {
+        return;
+    }
+
+    if(load_method == rocprim::block_load_method::block_load_warp_transpose
+       || store_method == rocprim::block_store_method::block_store_warp_transpose)
+    {
+        unsigned int host_warp_size;
+        HIP_CHECK(::rocprim::host_warp_size(device_id, host_warp_size));
+        if(block_size % host_warp_size != 0)
+        {
+            GTEST_SKIP() << "Cannot run test of block size " << block_size
+                         << " on a device with warp size " << host_warp_size;
+        }
+    }
+
+    const size_t valid = items_per_block;
+
+    for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++)
+    {
+        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+
+        // Generate data
+        std::vector<Type> input
+            = test_utils::get_random_data_wrapped<Type>(size, -100, 100, seed_value);
+        std::vector<Type> output(input.size(), (Type)0);
+
+        // Calculate expected results on host
+        std::vector<Type> expected(input.size(), (Type)0);
+        for (size_t i = 0; i < 113; i++)
+        {
+            size_t block_offset = i * items_per_block;
+            for (size_t j = 0; j < items_per_block; j++)
+            {
+                if (j < valid)
+                {
+                    expected[j + block_offset] = input[j + block_offset];
+                }
+            }
+        }
+
+        // Preparing device
+        common::device_ptr<Type> device_input(input);
+        // Have to initialize output for unvalid data to make sure they are not changed
+        common::device_ptr<Type> device_output(output);
+
+        // Running kernel
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(load_store_storage_kernel<Type,
+                                                                   load_method,
+                                                                   store_method,
+                                                                   block_size,
+                                                                   items_per_thread>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_input.get(),
+                           device_output.get());
+        HIP_CHECK(hipGetLastError());
+
+        // Reading results from device
+        output = device_output.load();
+
+        // Validating results
+        ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected));
+    }
+
+}
+
+typed_test_def(suite_name, name_suffix, LoadStoreClassValidWithStorage)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id = " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using Type = typename TestFixture::params::type;
+    static constexpr size_t block_size = TestFixture::params::block_size;
+    static constexpr rocprim::block_load_method load_method = TestFixture::params::load_method;
+    static constexpr rocprim::block_store_method store_method = TestFixture::params::store_method;
+    static constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    static constexpr auto items_per_block = block_size * items_per_thread;
+    const size_t size = items_per_block * 113;
+    const auto grid_size = size / items_per_block;
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size() || (block_size & (block_size - 1)) != 0)
+    {
+        return;
+    }
+
+    if(load_method == rocprim::block_load_method::block_load_warp_transpose
+       || store_method == rocprim::block_store_method::block_store_warp_transpose)
+    {
+        unsigned int host_warp_size;
+        HIP_CHECK(::rocprim::host_warp_size(device_id, host_warp_size));
+        if(block_size % host_warp_size != 0)
+        {
+            GTEST_SKIP() << "Cannot run test of block size " << block_size
+                         << " on a device with warp size " << host_warp_size;
+        }
+    }
+
+    const size_t valid = items_per_block - 32;
+
+    for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++)
+    {
+        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+
+        // Generate data
+        std::vector<Type> input
+            = test_utils::get_random_data_wrapped<Type>(size, -100, 100, seed_value);
+        std::vector<Type> output(input.size(), (Type)0);
+
+        // Calculate expected results on host
+        std::vector<Type> expected(input.size(), (Type)0);
+        for (size_t i = 0; i < 113; i++)
+        {
+            size_t block_offset = i * items_per_block;
+            for (size_t j = 0; j < items_per_block; j++)
+            {
+                if (j < valid)
+                {
+                    expected[j + block_offset] = input[j + block_offset];
+                }
+            }
+        }
+
+        // Preparing device
+        common::device_ptr<Type> device_input(input);
+        // Have to initialize output for unvalid data to make sure they are not changed
+        common::device_ptr<Type> device_output(output);
+
+        // Running kernel
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(load_store_storage_valid_kernel<Type,
+                                                                   load_method,
+                                                                   store_method,
+                                                                   block_size,
+                                                                   items_per_thread>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_input.get(),
+                           device_output.get(),
+                           valid);
+        HIP_CHECK(hipGetLastError());
+
+        // Reading results from device
+        output = device_output.load();
+
+        // Validating results
+        ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected));
+    }
+
+}
+
 typed_test_def(suite_name, name_suffix, LoadStoreClassDefault)
 {
     int device_id = test_common_utils::obtain_device_from_ctest();
@@ -277,3 +450,89 @@ typed_test_def(suite_name, name_suffix, LoadStoreClassDefault)
         ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected));
     }
 }
+
+typed_test_def(suite_name, name_suffix, LoadStoreClassDefaultWithStorage)
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id = " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using Type = typename TestFixture::params::type;
+    static constexpr size_t block_size = TestFixture::params::block_size;
+    static constexpr rocprim::block_load_method load_method = TestFixture::params::load_method;
+    static constexpr rocprim::block_store_method store_method = TestFixture::params::store_method;
+    static constexpr size_t items_per_thread = TestFixture::params::items_per_thread;
+    static constexpr auto items_per_block = block_size * items_per_thread;
+    const size_t size = items_per_block * 113;
+    const auto grid_size = size / items_per_block;
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size() || (block_size & (block_size - 1)) != 0)
+    {
+        return;
+    }
+
+    if(load_method == rocprim::block_load_method::block_load_warp_transpose
+       || store_method == rocprim::block_store_method::block_store_warp_transpose)
+    {
+        unsigned int host_warp_size;
+        HIP_CHECK(::rocprim::host_warp_size(device_id, host_warp_size));
+        if(block_size % host_warp_size != 0)
+        {
+            GTEST_SKIP() << "Cannot run test of block size " << block_size
+                         << " on a device with warp size " << host_warp_size;
+        }
+    }
+
+    const size_t valid = items_per_thread + 1;
+    Type _default = (Type)-1;
+
+    for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++)
+    {
+        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+
+        // Generate data
+        std::vector<Type> input
+            = test_utils::get_random_data_wrapped<Type>(size, -100, 100, seed_value);
+
+        // Calculate expected results on host
+        std::vector<Type> expected(input.size(), _default);
+        for (size_t i = 0; i < 113; i++)
+        {
+            size_t block_offset = i * items_per_block;
+            for (size_t j = 0; j < items_per_block; j++)
+            {
+                if (j < valid)
+                {
+                    expected[j + block_offset] = input[j + block_offset];
+                }
+            }
+        }
+
+        // Preparing device
+        common::device_ptr<Type> device_input(input);
+        common::device_ptr<Type> device_output(size);
+
+        // Running kernel
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(load_store_valid_default_storage_kernel<Type,
+                                                                           load_method,
+                                                                           store_method,
+                                                                           block_size,
+                                                                           items_per_thread>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_input.get(),
+                           device_output.get(),
+                           valid,
+                           _default);
+        HIP_CHECK(hipGetLastError());
+
+        // Reading results from device
+        const auto output = device_output.load();
+
+        // Validating results
+        ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected));
+    }
+}
diff --git a/test/rocprim/test_block_load_store.kernels.hpp b/test/rocprim/test_block_load_store.kernels.hpp
index cf8bb090a..4f1b12f83 100644
--- a/test/rocprim/test_block_load_store.kernels.hpp
+++ b/test/rocprim/test_block_load_store.kernels.hpp
@@ -553,6 +553,100 @@ void load_store_valid_kernel(Type* device_input, Type* device_output, size_t val
     store.store(device_output + offset, _items, (unsigned int)valid);
 }
 
+template<
+    class Type,
+    rocprim::block_load_method LoadMethod,
+    rocprim::block_store_method StoreMethod,
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    std::enable_if_t<(enable_block_load_store_test<LoadMethod, StoreMethod, BlockSize>::value), int> = 0
+>
+__global__
+__launch_bounds__(BlockSize)
+void load_store_storage_kernel(Type* device_input, Type* device_output)
+{
+    Type _items[ItemsPerThread];
+    auto offset = blockIdx.x * BlockSize * ItemsPerThread;
+    using impl  = get_block_load_store<Type, LoadMethod, StoreMethod, BlockSize, ItemsPerThread>;
+    typename impl::block_load  load;
+    typename impl::block_store store;
+
+    ROCPRIM_SHARED_MEMORY typename impl::block_load::storage_type load_storage;
+    ROCPRIM_SHARED_MEMORY typename impl::block_store::storage_type store_storage;
+
+    load.load(device_input + offset, _items, load_storage);
+    store.store(device_output + offset, _items, store_storage);
+}
+
+// The default kernel in case load store storage kernel fails to build
+template<
+    class Type,
+    rocprim::block_load_method LoadMethod,
+    rocprim::block_store_method StoreMethod,
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    std::enable_if_t<!(enable_block_load_store_test<LoadMethod, StoreMethod, BlockSize>::value), int> = 0
+>
+__global__
+__launch_bounds__(BlockSize)
+void load_store_storage_kernel(Type* device_input, Type* device_output)
+{
+    Type _items[ItemsPerThread];
+    auto offset = blockIdx.x * BlockSize * ItemsPerThread;
+    using impl  = get_block_load_store<Type, LoadMethod, StoreMethod, BlockSize, ItemsPerThread>;
+    typename impl::block_load  load;
+    typename impl::block_store store;
+    load.load(device_input + offset, _items);
+    store.store(device_output + offset, _items);
+}
+
+template<
+    class Type,
+    rocprim::block_load_method LoadMethod,
+    rocprim::block_store_method StoreMethod,
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    std::enable_if_t<(enable_block_load_store_test<LoadMethod, StoreMethod, BlockSize>::value), int> = 0
+>
+__global__
+__launch_bounds__(BlockSize)
+void load_store_storage_valid_kernel(Type* device_input, Type* device_output, size_t valid)
+{
+    Type _items[ItemsPerThread];
+    auto offset = blockIdx.x * BlockSize * ItemsPerThread;
+    using impl  = get_block_load_store<Type, LoadMethod, StoreMethod, BlockSize, ItemsPerThread>;
+    typename impl::block_load  load;
+    typename impl::block_store store;
+
+    ROCPRIM_SHARED_MEMORY typename impl::block_load::storage_type load_storage;
+    ROCPRIM_SHARED_MEMORY typename impl::block_store::storage_type store_storage;
+
+    load.load(device_input + offset, _items, (unsigned int)valid, load_storage);
+    store.store(device_output + offset, _items, (unsigned int)valid, store_storage);
+}
+
+// The default kernel in case load store storage valid kernel fails to build
+template<
+    class Type,
+    rocprim::block_load_method LoadMethod,
+    rocprim::block_store_method StoreMethod,
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    std::enable_if_t<!(enable_block_load_store_test<LoadMethod, StoreMethod, BlockSize>::value), int> = 0
+>
+__global__
+__launch_bounds__(BlockSize)
+void load_store_storage_valid_kernel(Type* device_input, Type* device_output, size_t valid)
+{
+    Type _items[ItemsPerThread];
+    auto offset = blockIdx.x * BlockSize * ItemsPerThread;
+    using impl  = get_block_load_store<Type, LoadMethod, StoreMethod, BlockSize, ItemsPerThread>;
+    typename impl::block_load  load;
+    typename impl::block_store store;
+    load.load(device_input + offset, _items, (unsigned int)valid);
+    store.store(device_output + offset, _items, (unsigned int)valid);
+}
+
 template<
     class Type,
     rocprim::block_load_method LoadMethod,
@@ -574,4 +668,53 @@ void load_store_valid_default_kernel(Type* device_input, Type* device_output, si
     store.store(device_output + offset, _items);
 }
 
+template<
+    class Type,
+    rocprim::block_load_method LoadMethod,
+    rocprim::block_store_method StoreMethod,
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    class Def,
+    std::enable_if_t<(enable_block_load_store_test<LoadMethod, StoreMethod, BlockSize>::value), int> = 0
+>
+__global__
+__launch_bounds__(BlockSize)
+void load_store_valid_default_storage_kernel(Type* device_input, Type* device_output, size_t valid, Def _default)
+{
+    Type _items[ItemsPerThread];
+    auto offset = blockIdx.x * BlockSize * ItemsPerThread;
+    using impl  = get_block_load_store<Type, LoadMethod, StoreMethod, BlockSize, ItemsPerThread>;
+    typename impl::block_load  load;
+    typename impl::block_store store;
+
+    ROCPRIM_SHARED_MEMORY typename impl::block_load::storage_type load_storage;
+    ROCPRIM_SHARED_MEMORY typename impl::block_store::storage_type store_storage;
+
+    load.load(device_input + offset, _items, (unsigned int)valid, _default, load_storage);
+    store.store(device_output + offset, _items, store_storage);
+}
+
+// The default kernel in case load_store_valid_default_storage_kernel fails to build
+template<
+    class Type,
+    rocprim::block_load_method LoadMethod,
+    rocprim::block_store_method StoreMethod,
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    class Def,
+    std::enable_if_t<!(enable_block_load_store_test<LoadMethod, StoreMethod, BlockSize>::value), int> = 0
+>
+__global__
+__launch_bounds__(BlockSize)
+void load_store_valid_default_storage_kernel(Type* device_input, Type* device_output, size_t valid, Def _default)
+{
+    Type _items[ItemsPerThread];
+    auto offset = blockIdx.x * BlockSize * ItemsPerThread;
+    using impl  = get_block_load_store<Type, LoadMethod, StoreMethod, BlockSize, ItemsPerThread>;
+    typename impl::block_load  load;
+    typename impl::block_store store;
+    load.load(device_input + offset, _items, (unsigned int)valid, _default);
+    store.store(device_output + offset, _items);
+}
+
 #endif // TEST_BLOCK_LOAD_STORE_KERNELS_HPP_
diff --git a/test/rocprim/test_block_radix_rank.cpp.in b/test/rocprim/test_block_radix_rank.cpp.in
index 8e793b5f0..656035aef 100644
--- a/test/rocprim/test_block_radix_rank.cpp.in
+++ b/test/rocprim/test_block_radix_rank.cpp.in
@@ -47,13 +47,20 @@
 
 #if   ROCPRIM_TEST_SUITE_SLICE == 0
     TYPED_TEST_P(SUITE, RankBasic  ) { test_block_radix_rank_algorithm<rocprim::block_radix_rank_algorithm::basic, TestFixture>(); }
-    REGISTER_TYPED_TEST_SUITE_P(SUITE, RankBasic);
+    TYPED_TEST_P(SUITE, RankBasicExtractor  ) { test_block_radix_rank_extractor_algorithm<rocprim::block_radix_rank_algorithm::basic, TestFixture>(); }
+    TYPED_TEST_P(SUITE, RankBasicExtractorWithPrefixCount  ) { test_block_radix_rank_extractor_with_prefix_count_algorithm<rocprim::block_radix_rank_algorithm::basic, TestFixture>(); }
+    REGISTER_TYPED_TEST_SUITE_P(SUITE, RankBasic, RankBasicExtractor, RankBasicExtractorWithPrefixCount);
+    
 #elif ROCPRIM_TEST_SUITE_SLICE == 1
     TYPED_TEST_P(SUITE, RankMemoize) { test_block_radix_rank_algorithm<rocprim::block_radix_rank_algorithm::basic_memoize, TestFixture>(); }
-    REGISTER_TYPED_TEST_SUITE_P(SUITE, RankMemoize);
+    TYPED_TEST_P(SUITE, RankMemoizeExtractor) { test_block_radix_rank_extractor_algorithm<rocprim::block_radix_rank_algorithm::basic_memoize, TestFixture>(); }
+    TYPED_TEST_P(SUITE, RankMemoizeExtractorWithPrefixCount) { test_block_radix_rank_extractor_with_prefix_count_algorithm<rocprim::block_radix_rank_algorithm::basic_memoize, TestFixture>(); }
+    REGISTER_TYPED_TEST_SUITE_P(SUITE, RankMemoize, RankMemoizeExtractor, RankMemoizeExtractorWithPrefixCount);
 #elif ROCPRIM_TEST_SUITE_SLICE == 2
     TYPED_TEST_P(SUITE, RankMatch  ) { test_block_radix_rank_algorithm<rocprim::block_radix_rank_algorithm::match, TestFixture>(); }
-    REGISTER_TYPED_TEST_SUITE_P(SUITE, RankMatch);
+    TYPED_TEST_P(SUITE, RankMatchExtractor  ) { test_block_radix_rank_extractor_algorithm<rocprim::block_radix_rank_algorithm::match, TestFixture>(); }
+    TYPED_TEST_P(SUITE, RankMatchExtractorWithPrefixCount  ) { test_block_radix_rank_extractor_with_prefix_count_algorithm<rocprim::block_radix_rank_algorithm::match, TestFixture>(); }
+    REGISTER_TYPED_TEST_SUITE_P(SUITE, RankMatch, RankMatchExtractor, RankMatchExtractorWithPrefixCount);
 #endif
 
 #if   ROCPRIM_TEST_TYPE_SLICE == 0
diff --git a/test/rocprim/test_block_radix_rank.hpp b/test/rocprim/test_block_radix_rank.hpp
index e078fdcfa..e3b2b7b4f 100644
--- a/test/rocprim/test_block_radix_rank.hpp
+++ b/test/rocprim/test_block_radix_rank.hpp
@@ -59,8 +59,13 @@ static constexpr size_t       n_sizes                   = 12;
 static constexpr unsigned int items_per_thread[n_sizes] = {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3};
 static constexpr unsigned int rank_desc[n_sizes]
     = {false, false, false, false, false, false, true, true, true, true, true, true};
+static constexpr unsigned int use_storage[n_sizes]
+    = {false, true, false, true, false, true, false, true, false, true, false, true};
+static constexpr unsigned int end_bits[n_sizes]
+        = {0x1, 0x3, 0x7, 0xf, 0x1, 0x3, 0x7, 0xf, 0x1, 0x3, 0x7, 0xf};
 static constexpr unsigned int pass_start_bit[n_sizes]  = {0, 0, 0, 6, 2, 1, 0, 0, 0, 1, 4, 7};
 static constexpr unsigned int max_radix_bits[n_sizes]  = {4, 3, 5, 3, 1, 5, 4, 2, 4, 3, 1, 2};
+static constexpr unsigned int max_radix_bits_extractor[n_sizes]  = {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4};
 static constexpr unsigned int pass_radix_bits[n_sizes] = {0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 1};
 
 template<typename T,
@@ -71,6 +76,7 @@ template<typename T,
 __global__ __launch_bounds__(BlockSize) void rank_kernel(const T* const      items_input,
                                                          unsigned int* const ranks_output,
                                                          const bool          descending,
+                                                         const bool          use_storage,
                                                          const unsigned int  start_bit,
                                                          const unsigned int  radix_bits)
 {
@@ -107,11 +113,17 @@ __global__ __launch_bounds__(BlockSize) void rank_kernel(const T* const      ite
 
     if(descending)
     {
-        block_rank_type().rank_keys_desc(keys, ranks, storage.rank, start_bit, radix_bits);
+        if (use_storage)
+            block_rank_type().rank_keys_desc(keys, ranks, storage.rank, start_bit, radix_bits);
+        else
+            block_rank_type().rank_keys_desc(keys, ranks, start_bit, radix_bits);
     }
     else
     {
-        block_rank_type().rank_keys(keys, ranks, storage.rank, start_bit, radix_bits);
+        if (use_storage)
+            block_rank_type().rank_keys(keys, ranks, storage.rank, start_bit, radix_bits);
+        else
+            block_rank_type().rank_keys(keys, ranks, start_bit, radix_bits);
     }
 
     if ROCPRIM_IF_CONSTEXPR(warp_striped)
@@ -123,6 +135,167 @@ __global__ __launch_bounds__(BlockSize) void rank_kernel(const T* const      ite
     rocprim::block_store_direct_blocked(lid, ranks_output + block_offset, ranks);
 }
 
+template<typename T,
+         unsigned int                        BlockSize,
+         unsigned int                        ItemsPerThread,
+         unsigned int                        MaxRadixBits,
+         rocprim::block_radix_rank_algorithm Algorithm>
+__global__ __launch_bounds__(BlockSize) void rank_kernel(const T* const      items_input,
+                                                         unsigned int* const ranks_output,
+                                                         const bool          descending,
+                                                         const bool          use_storage,
+                                                         const unsigned int  last_bits)
+{
+    using block_rank_type     = rocprim::block_radix_rank<BlockSize, MaxRadixBits, Algorithm>;
+    using keys_exchange_type  = rocprim::block_exchange<T, BlockSize, ItemsPerThread>;
+    using ranks_exchange_type = rocprim::block_exchange<unsigned int, BlockSize, ItemsPerThread>;
+
+    constexpr bool warp_striped = Algorithm == rocprim::block_radix_rank_algorithm::match;
+
+    constexpr unsigned int items_per_block = BlockSize * ItemsPerThread;
+    const unsigned int     lid             = threadIdx.x;
+    const unsigned int     block_offset    = blockIdx.x * items_per_block;
+
+    ROCPRIM_SHARED_MEMORY union
+    {
+        typename keys_exchange_type::storage_type  keys_exchange;
+        typename block_rank_type::storage_type     rank;
+        typename ranks_exchange_type::storage_type ranks_exchange;
+    } storage;
+
+    T            keys[ItemsPerThread];
+    unsigned int ranks[ItemsPerThread];
+
+    rocprim::block_load_direct_blocked(lid, items_input + block_offset, keys);
+    if ROCPRIM_IF_CONSTEXPR(warp_striped)
+    {
+        // block_radix_rank_match requires warp striped input and output. Instead of using
+        // rocprim::block_load_direct_warp_striped though, we load directly and exchange the
+        // values manually, as we can also test with block sizes that do not divide the hardware
+        // warp size that way.
+        keys_exchange_type().blocked_to_warp_striped(keys, keys, storage.keys_exchange);
+        rocprim::syncthreads();
+    }
+
+    union converter{
+        T in;
+        uint64_t out;
+    };
+
+    if(descending)
+    {
+        if (use_storage)
+            block_rank_type().rank_keys_desc(keys, ranks, storage.rank, [=](const T & key){
+                converter c;
+                c.in = key;
+                uint64_t out = c.out & last_bits;
+                return out;
+            });
+        else
+            block_rank_type().rank_keys_desc(keys, ranks, [=](const T & key){
+                converter c;
+                c.in = key;
+                uint64_t out = c.out & last_bits;
+                return out;
+            });
+    }
+    else
+    {
+        if (use_storage)
+            block_rank_type().rank_keys(keys, ranks, storage.rank, [=](const T & key){
+                converter c;
+                c.in = key;
+                uint64_t out = c.out & last_bits;
+                return out;
+            });
+        else
+            block_rank_type().rank_keys(keys, ranks, [=](const T & key){
+                converter c;
+                c.in = key;
+                uint64_t out = c.out & last_bits;
+                return out;
+            });
+    }
+
+    if ROCPRIM_IF_CONSTEXPR(warp_striped)
+    {
+        // See the comment above.
+        rocprim::syncthreads();
+        ranks_exchange_type().warp_striped_to_blocked(ranks, ranks, storage.ranks_exchange);
+    }
+    rocprim::block_store_direct_blocked(lid, ranks_output + block_offset, ranks);
+}
+
+template<typename T,
+         unsigned int                        BlockSize,
+         unsigned int                        ItemsPerThread,
+         unsigned int                        RadixBits,
+         unsigned int                        MaxRadixBits,
+         rocprim::block_radix_rank_algorithm Algorithm,
+         class Extractor>
+__global__ __launch_bounds__(BlockSize) void rank_kernel(const T* const      items_input,
+                                                         unsigned int* const ranks_output,
+                                                         unsigned int*       prefix_output,
+                                                         unsigned int*       counts_output,
+                                                         Extractor digit_extractor
+                                                        )
+{
+    using block_rank_type     = rocprim::block_radix_rank<BlockSize, MaxRadixBits, Algorithm>;
+    using keys_exchange_type  = rocprim::block_exchange<T, BlockSize, ItemsPerThread>;
+    using ranks_exchange_type = rocprim::block_exchange<unsigned int, BlockSize, ItemsPerThread>;
+
+    constexpr bool warp_striped = Algorithm == rocprim::block_radix_rank_algorithm::match;
+
+    constexpr unsigned int items_per_block = BlockSize * ItemsPerThread;
+    const unsigned int     lid             = threadIdx.x;
+    const unsigned int     block_offset    = blockIdx.x * items_per_block;
+
+    ROCPRIM_SHARED_MEMORY union
+    {
+        typename keys_exchange_type::storage_type  keys_exchange;
+        typename block_rank_type::storage_type     rank;
+        typename ranks_exchange_type::storage_type ranks_exchange;
+    } storage;
+    
+    T            keys[ItemsPerThread];
+    unsigned int ranks[ItemsPerThread];
+
+    const unsigned int digits_per_thread = block_rank_type().digits_per_thread;
+
+    unsigned int prefix[digits_per_thread];
+    unsigned int counts[digits_per_thread];
+    rocprim::block_load_direct_blocked(lid, items_input + block_offset, keys);
+    if ROCPRIM_IF_CONSTEXPR(warp_striped)
+    {
+        // block_radix_rank_match requires warp striped input and output. Instead of using
+        // rocprim::block_load_direct_warp_striped though, we load directly and exchange the
+        // values manually, as we can also test with block sizes that do not divide the hardware
+        // warp size that way.
+        keys_exchange_type().blocked_to_warp_striped(keys, keys, storage.keys_exchange);
+        rocprim::syncthreads();
+    }
+
+    block_rank_type().rank_keys(keys, ranks, storage.rank, digit_extractor, prefix, counts);
+
+    if ROCPRIM_IF_CONSTEXPR(warp_striped)
+    {
+        // See the comment above.
+        rocprim::syncthreads();
+        ranks_exchange_type().warp_striped_to_blocked(ranks, ranks, storage.ranks_exchange);
+    }
+    rocprim::block_store_direct_blocked(lid, ranks_output + block_offset, ranks);
+
+    // storing count and prefix output
+    const size_t pc_offset = (threadIdx.x * digits_per_thread) + (blockIdx.x * (1 << RadixBits));
+
+    for(size_t i = 0; i < digits_per_thread; i++){
+        if((threadIdx.x * digits_per_thread) + i < (1 << RadixBits)){
+            prefix_output[pc_offset + i] = prefix[i];
+            counts_output[pc_offset + i] = counts[i];
+        }
+    }
+}
+
 template<typename T,
          unsigned int                        BlockSize,
          unsigned int                        ItemsPerThread,
@@ -130,6 +303,7 @@ template<typename T,
          unsigned int                        MaxRadixBits,
          unsigned int                        RadixBits,
          bool                                Descending,
+         bool                                UseStorage,
          rocprim::block_radix_rank_algorithm Algorithm>
 void test_block_radix_rank()
 {
@@ -141,6 +315,7 @@ void test_block_radix_rank()
     constexpr size_t                              radix_bits       = RadixBits;
     constexpr size_t                              end_bit          = start_bit + radix_bits;
     constexpr bool                                descending       = Descending;
+    constexpr bool                                use_storage      = UseStorage;
     constexpr rocprim::block_radix_rank_algorithm algorithm        = Algorithm;
 
     const size_t grid_size = 23;
@@ -205,6 +380,7 @@ void test_block_radix_rank()
             d_keys_input.get(),
             d_ranks_output.get(),
             descending,
+            use_storage,
             start_bit,
             radix_bits);
         HIP_CHECK(hipGetLastError());
@@ -216,6 +392,259 @@ void test_block_radix_rank()
     }
 }
 
+template<typename T,
+         unsigned int                        BlockSize,
+         unsigned int                        ItemsPerThread,
+         unsigned int                        MaxRadixBits,
+         unsigned int                        EndBits,
+         bool                                Descending,
+         bool                                UseStorage,
+         rocprim::block_radix_rank_algorithm Algorithm>
+void test_block_radix_extractor_rank()
+{
+    constexpr size_t                              block_size       = BlockSize;
+    constexpr size_t                              items_per_thread = ItemsPerThread;
+    constexpr size_t                              items_per_block  = block_size * items_per_thread;
+    constexpr size_t                              max_radix_bits   = MaxRadixBits;
+    constexpr size_t                              end_bits         = EndBits;
+    constexpr bool                                descending       = Descending;
+    constexpr bool                                use_storage      = UseStorage;
+    constexpr rocprim::block_radix_rank_algorithm algorithm        = Algorithm;
+
+    const size_t grid_size = 2;
+    const size_t size      = items_per_block * grid_size;
+
+    SCOPED_TRACE(testing::Message() << "with block_size = " << block_size);
+    SCOPED_TRACE(testing::Message() << "with items_per_thread = " << items_per_thread);
+    SCOPED_TRACE(testing::Message() << "with descending = " << (descending ? "true" : "false"));
+    SCOPED_TRACE(testing::Message() << "with max_radix_bits = " << MaxRadixBits);
+    SCOPED_TRACE(testing::Message() << "with grid_size = " << size);
+    SCOPED_TRACE(testing::Message() << "with size = " << size);
+
+    for(size_t seed_index = 0; seed_index < number_of_runs; ++seed_index)
+    {
+        seed_type seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+
+        // Generate data
+        std::vector<T> keys_input
+            = test_utils::get_random_data_wrapped<T>(size,
+                                                     common::generate_limits<T>::min(),
+                                                     common::generate_limits<T>::max(),
+                                                     seed_value);
+
+
+        union converter{
+            T in;
+            uint64_t out;
+        };
+        // Calculated expected results on host
+        std::vector<unsigned int> expected(size);
+        for(size_t i = 0; i < grid_size; ++i)
+        {
+            size_t     block_offset = i * items_per_block;
+
+            // Perform an 'argsort', which gives a sorted sequence of indices into `keys_input`.
+            std::vector<int> indices(items_per_block);
+            std::iota(indices.begin(), indices.end(), 0);
+
+            std::stable_sort(
+                indices.begin(),
+                indices.end(),
+                [&](const int& i, const int& j)
+                { 
+                    converter c;
+                    c.in = keys_input[block_offset + i];
+                    uint64_t left = c.out & end_bits;
+
+                    c.in = keys_input[block_offset + j];
+
+                    uint64_t right = c.out & end_bits;
+
+                    return descending ? right < left : left < right; 
+                });
+
+            // Invert the sorted indices sequence to obtain the ranks.
+            for(size_t j = 0; j < items_per_block; ++j)
+            {
+                expected[block_offset + indices[j]] = static_cast<int>(j);
+            }
+        }
+
+        common::device_ptr<T>            d_keys_input(keys_input);
+        common::device_ptr<unsigned int> d_ranks_output(size);
+
+        // Running kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                rank_kernel<T, block_size, items_per_thread, max_radix_bits, algorithm>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            d_keys_input.get(),
+            d_ranks_output.get(),
+            descending,
+            use_storage,
+            end_bits);
+        HIP_CHECK(hipGetLastError());
+
+        // Getting results to host
+        auto ranks_output = d_ranks_output.load();
+
+        ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(ranks_output, expected));
+    }
+}
+
+template<typename T,
+         unsigned int                        BlockSize,
+         unsigned int                        ItemsPerThread,
+         unsigned int                        MaxRadixBits,
+         unsigned int                        RadixBits,
+         bool                                Descending,
+         rocprim::block_radix_rank_algorithm Algorithm>
+void test_block_radix_rank_with_prefix_and_count()
+{
+    constexpr size_t                              block_size       = BlockSize;
+    constexpr size_t                              items_per_thread = ItemsPerThread;
+    constexpr size_t                              items_per_block  = block_size * items_per_thread;
+    constexpr size_t                              max_radix_bits   = MaxRadixBits;
+    constexpr size_t                              radix_bits       = RadixBits;
+    constexpr rocprim::block_radix_rank_algorithm algorithm        = Algorithm;
+    
+    const size_t grid_size = 2;
+    const size_t size      = items_per_block * grid_size;
+    const size_t pc_items_per_block = (1 << radix_bits);
+    const uint64_t end_bits = ((pc_items_per_block) - 1);
+    const size_t pc_size = (pc_items_per_block) * grid_size;
+
+    SCOPED_TRACE(testing::Message() << "with block_size = " << block_size);
+    SCOPED_TRACE(testing::Message() << "with items_per_thread = " << items_per_thread);
+    SCOPED_TRACE(testing::Message() << "with max_radix_bits = " << MaxRadixBits);
+    SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size);
+    SCOPED_TRACE(testing::Message() << "with size = " << size);
+
+    for(size_t seed_index = 0; seed_index < number_of_runs; ++seed_index)
+    {
+        seed_type seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+
+        // Generate data
+        std::vector<T> keys_input
+            = test_utils::get_random_data_wrapped<T>(size,
+                                                     common::generate_limits<T>::min(),
+                                                     common::generate_limits<T>::max(),
+                                                     seed_value);
+
+
+        union converter{
+            T in;
+            uint64_t out;
+        } c;
+        // Calculated expected results on host
+        std::vector<unsigned int> expected(size);
+        std::vector<unsigned int> expected_histogram(pc_size, 0);
+        std::vector<unsigned int> expected_prefix(pc_size, 0);
+        for(size_t i = 0; i < grid_size; ++i)
+        {
+            size_t     block_offset = i * items_per_block;
+
+            // Perform an 'argsort', which gives a sorted sequence of indices into `keys_input`.
+            std::vector<int> indices(items_per_block);
+            std::iota(indices.begin(), indices.end(), 0);
+
+            std::stable_sort(
+                indices.begin(),
+                indices.end(),
+                [&](const int& i, const int& j)
+                { 
+                    c.in = keys_input[block_offset + i];
+                    uint64_t left = c.out & end_bits;
+
+                    c.in = keys_input[block_offset + j];
+
+                    uint64_t right = c.out & end_bits;
+
+                    return Descending ? right < left : left < right; 
+                });
+
+            // Invert the sorted indices sequence to obtain the ranks.
+            for(size_t j = 0; j < items_per_block; ++j)
+            {
+                expected[block_offset + indices[j]] = static_cast<int>(j);
+            }
+
+            size_t pc_block_offset = i * (pc_items_per_block);
+            for(size_t j = 0; j < items_per_block; j++){
+                c.in = keys_input[block_offset + j];
+                uint64_t bit_rep = c.out;
+                bit_rep &= end_bits;
+
+                if(Descending)
+                    bit_rep = pc_items_per_block - (1 + bit_rep);
+                
+                ++expected_histogram[bit_rep + pc_block_offset];
+            }
+            std::exclusive_scan(
+                expected_histogram.begin() + pc_block_offset, 
+                expected_histogram.begin() + pc_block_offset + pc_items_per_block, 
+                expected_prefix.begin() + pc_block_offset, 
+                0
+            );
+        }
+        common::device_ptr<T>            d_keys_input(keys_input);
+        common::device_ptr<unsigned int> d_ranks_output(size);
+        common::device_ptr<unsigned int> d_prefix_output(pc_size);
+        common::device_ptr<unsigned int> d_counts_output(pc_size);
+
+        // Running kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(
+                rank_kernel<T, block_size, items_per_thread, radix_bits, max_radix_bits, algorithm>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            d_keys_input.get(),
+            d_ranks_output.get(),
+            d_prefix_output.get(),
+            d_counts_output.get(),
+            [] (const T & key){
+                const uint64_t end_bits = ((pc_items_per_block) - 1);
+                union converter{
+                    T in;
+                    uint64_t out;
+                } c;
+                c.in = key;
+
+                uint64_t out = c.out & end_bits;
+
+                if(Descending)
+                    out = pc_items_per_block - (1 + out);
+
+                return out;
+            }
+        );
+        HIP_CHECK(hipGetLastError());
+
+        // Getting results to host
+        auto ranks_output = d_ranks_output.load();
+        auto prefix_output = d_prefix_output.load();
+        auto counts_output = d_counts_output.load();
+
+        for(size_t i = 0; i < ranks_output.size(); i++){
+            ASSERT_EQ(ranks_output[i], expected[i]) << "Index: " << i << std::endl;
+
+            if(i < pc_size){
+                ASSERT_EQ(prefix_output[i], expected_prefix[i]) << "Index: " << i << std::endl;
+                ASSERT_EQ(counts_output[i], expected_histogram[i]) << "Index: " << i << std::endl;
+            }
+        }
+    }
+}
+
 template<unsigned int First,
          unsigned int Last,
          typename T,
@@ -237,10 +666,42 @@ struct static_for
                                   max_radix_bits[First],
                                   radix_bits,
                                   rank_desc[First],
+                                  use_storage[First],
                                   Algorithm>();
         }
         static_for<First + 1, Last, T, BlockSize, Algorithm>::run();
     }
+
+    static void run_extractor()
+    {
+        {
+            SCOPED_TRACE(testing::Message() << "TestID = " << First);
+            test_block_radix_extractor_rank<T,
+                                  BlockSize,
+                                  items_per_thread[First],
+                                  max_radix_bits_extractor[First],
+                                  end_bits[First],
+                                  rank_desc[First],
+                                  use_storage[First],
+                                  Algorithm>();
+        }
+        static_for<First + 1, Last, T, BlockSize, Algorithm>::run_extractor();
+    }
+
+    static void run_extractor_with_prefix_count()
+    {
+        {
+            SCOPED_TRACE(testing::Message() << "TestID = " << First);
+            test_block_radix_rank_with_prefix_and_count<T,
+                                  BlockSize,
+                                  items_per_thread[First],
+                                  max_radix_bits_extractor[First],
+                                  max_radix_bits_extractor[First],
+                                  rank_desc[First],
+                                  Algorithm>();
+        }
+        static_for<First + 1, Last, T, BlockSize, Algorithm>::run_extractor_with_prefix_count();
+    }
 };
 
 template<unsigned int Last,
@@ -250,6 +711,8 @@ template<unsigned int Last,
 struct static_for<Last, Last, T, BlockSize, Algorithm>
 {
     static void run() {}
+    static void run_extractor() {}
+    static void run_extractor_with_prefix_count() {}
 };
 
 template<rocprim::block_radix_rank_algorithm Algorithm, typename TestFixture>
@@ -266,4 +729,32 @@ void test_block_radix_rank_algorithm()
     static_for<0, n_sizes, type, block_size, Algorithm>::run();
 }
 
+template<rocprim::block_radix_rank_algorithm Algorithm, typename TestFixture>
+void test_block_radix_rank_extractor_algorithm()
+{
+    using type                  = typename TestFixture::params::input_type;
+    constexpr size_t block_size = TestFixture::params::block_size;
+
+    if(block_size > test_utils::get_max_block_size())
+    {
+        GTEST_SKIP();
+    }
+
+    static_for<0, n_sizes, type, block_size, Algorithm>::run_extractor();
+}
+
+template<rocprim::block_radix_rank_algorithm Algorithm, typename TestFixture>
+void test_block_radix_rank_extractor_with_prefix_count_algorithm()
+{
+    using type                  = typename TestFixture::params::input_type;
+    constexpr size_t block_size = TestFixture::params::block_size;
+
+    if(block_size > test_utils::get_max_block_size())
+    {
+        GTEST_SKIP();
+    }
+
+    static_for<0, n_sizes, type, block_size, Algorithm>::run_extractor_with_prefix_count();
+}
+
 #endif // TEST_BLOCK_RADIX_RANK_KERNELS_HPP_
diff --git a/test/rocprim/test_block_scan.hpp b/test/rocprim/test_block_scan.hpp
index 32ac8c51f..97704a26c 100644
--- a/test/rocprim/test_block_scan.hpp
+++ b/test/rocprim/test_block_scan.hpp
@@ -664,6 +664,34 @@ typed_test_def(suite_name_array, name_suffix, InclusiveScanPrefixCallback)
     static_for_input_array<0, 2, T, 2, block_size>::run();
 }
 
+typed_test_def(suite_name_array, name_suffix, InclusiveScanWithStorage)
+{
+    using T = typename TestFixture::input_type;
+    constexpr size_t block_size = TestFixture::block_size;
+
+    static_for_input_array<0, 2, T, 6, block_size>::run();
+}
+typed_test_def(suite_name_array, name_suffix, InclusiveScanWithStorageAndInit)
+{
+    using T = typename TestFixture::input_type;
+    constexpr size_t block_size = TestFixture::block_size;
+
+    static_for_input_array<0, 2, T, 7, block_size>::run();
+}
+typed_test_def(suite_name_array, name_suffix, InclusiveScanReduceWithStorage)
+{
+    using T = typename TestFixture::input_type;
+    constexpr size_t block_size = TestFixture::block_size;
+
+    static_for_input_array<0, 2, T, 8, block_size>::run();
+}
+typed_test_def(suite_name_array, name_suffix, InclusiveScanReduceWithStorageAndInit)
+{
+    using T = typename TestFixture::input_type;
+    constexpr size_t block_size = TestFixture::block_size;
+
+    static_for_input_array<0, 2, T, 9, block_size>::run();
+}
 typed_test_def(suite_name_array, name_suffix, ExclusiveScan)
 {
     using T = typename TestFixture::input_type;
@@ -687,3 +715,19 @@ typed_test_def(suite_name_array, name_suffix, ExclusiveScanPrefixCallback)
 
     static_for_input_array<0, 2, T, 5, block_size>::run();
 }
+
+typed_test_def(suite_name_array, name_suffix, ExclusiveScanWithStorage)
+{
+    using T = typename TestFixture::input_type;
+    constexpr size_t block_size = TestFixture::block_size;
+
+    static_for_input_array<0, 2, T, 10, block_size>::run();
+}
+
+typed_test_def(suite_name_array, name_suffix, ExclusiveScanReduceWithStorage)
+{
+    using T = typename TestFixture::input_type;
+    constexpr size_t block_size = TestFixture::block_size;
+
+    static_for_input_array<0, 2, T, 11, block_size>::run();
+}
\ No newline at end of file
diff --git a/test/rocprim/test_block_scan.kernels.hpp b/test/rocprim/test_block_scan.kernels.hpp
index 4eca2c418..79cd7395b 100644
--- a/test/rocprim/test_block_scan.kernels.hpp
+++ b/test/rocprim/test_block_scan.kernels.hpp
@@ -400,6 +400,138 @@ void inclusive_scan_array_prefix_callback_kernel(T* device_output, T* device_out
     }
 }
 
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    rocprim::block_scan_algorithm Algorithm,
+    class T,
+    class BinaryOp
+>
+__global__
+__launch_bounds__(BlockSize)
+void inclusive_scan_array_with_storage_kernel(T * device_output){
+
+    const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread;
+    
+    T input[ItemsPerThread];
+    
+    for(size_t i = 0; i < ItemsPerThread; i++)
+        input[i] = device_output[i + index];
+    
+    
+    using bscan_t = rocprim::block_scan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::storage_type storage;
+    rocprim::block_scan<T, BlockSize, Algorithm> bscan;
+
+    bscan.inclusive_scan(input, input, storage, BinaryOp());
+
+    for(size_t i = 0; i < ItemsPerThread; i++)
+        device_output[i + index] = input[i];
+}
+
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    rocprim::block_scan_algorithm Algorithm,
+    class T,
+    class BinaryOp
+>
+__global__
+__launch_bounds__(BlockSize)
+void inclusive_scan_array_with_storage_kernel(T * device_output, T init){
+
+    const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread;
+    
+    T input[ItemsPerThread];
+    
+    for(size_t i = 0; i < ItemsPerThread; i++)
+        input[i] = device_output[i + index];
+    
+    
+    using bscan_t = rocprim::block_scan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::storage_type storage;
+    rocprim::block_scan<T, BlockSize, Algorithm> bscan;
+
+    bscan.inclusive_scan(input, init, input, storage, BinaryOp());
+
+    for(size_t i = 0; i < ItemsPerThread; i++)
+        device_output[i + index] = input[i];
+}
+
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    rocprim::block_scan_algorithm Algorithm,
+    class T,
+    class BinaryOp
+>
+__global__
+__launch_bounds__(BlockSize)
+void inclusive_scan_reduce_array_with_storage_kernel(T* device_output, T* device_output_reductions)
+{
+    const unsigned int index = ((blockIdx.x * BlockSize ) + threadIdx.x) * ItemsPerThread;
+
+    // load
+    T in_out[ItemsPerThread];
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        in_out[j] = device_output[index + j];
+    }
+    using bscan_t = rocprim::block_scan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::storage_type storage;
+    rocprim::block_scan<T, BlockSize, Algorithm> bscan;
+    T reduction;
+    bscan.inclusive_scan(in_out, in_out, reduction, storage, BinaryOp());
+
+    // store
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        device_output[index + j] = in_out[j];
+    }
+
+    if(threadIdx.x == 0)
+    {
+        device_output_reductions[blockIdx.x] = reduction;
+    }
+}
+
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    rocprim::block_scan_algorithm Algorithm,
+    class T,
+    class BinaryOp
+>
+__global__
+__launch_bounds__(BlockSize)
+void inclusive_scan_reduce_array_with_storage_kernel(T* device_output, T* device_output_reductions, T init)
+{
+    const unsigned int index = ((blockIdx.x * BlockSize ) + threadIdx.x) * ItemsPerThread;
+
+    // load
+    T in_out[ItemsPerThread];
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        in_out[j] = device_output[index + j];
+    }
+    using bscan_t = rocprim::block_scan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::storage_type storage;
+    rocprim::block_scan<T, BlockSize, Algorithm> bscan;
+    T reduction;
+    bscan.inclusive_scan(in_out, init, in_out, reduction, storage, BinaryOp());
+
+    // store
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        device_output[index + j] = in_out[j];
+    }
+
+    if(threadIdx.x == 0)
+    {
+        device_output_reductions[blockIdx.x] = reduction;
+    }
+}
+
 template<
     unsigned int BlockSize,
     unsigned int ItemsPerThread,
@@ -511,6 +643,72 @@ void exclusive_scan_prefix_callback_array_kernel(
     }
 }
 
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    rocprim::block_scan_algorithm Algorithm,
+    class T,
+    class BinaryOp
+>
+__global__
+__launch_bounds__(BlockSize)
+void exclusive_scan_array_with_storage_kernel(T * device_output, T init){
+
+    const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread;
+    
+    T input[ItemsPerThread];
+    
+    for(size_t i = 0; i < ItemsPerThread; i++)
+        input[i] = device_output[i + index];
+    
+    
+    using bscan_t = rocprim::block_scan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::storage_type storage;
+    rocprim::block_scan<T, BlockSize, Algorithm> bscan;
+
+    bscan.exclusive_scan(input, input, init, storage, BinaryOp());
+
+    for(size_t i = 0; i < ItemsPerThread; i++)
+        device_output[i + index] = input[i];
+}
+
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    rocprim::block_scan_algorithm Algorithm,
+    class T,
+    class BinaryOp
+>
+__global__
+__launch_bounds__(BlockSize)
+void exclusive_scan_reduce_array_with_storage_kernel(T* device_output, T* device_output_reductions, T init)
+{
+    const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread;
+    // load
+    T in_out[ItemsPerThread];
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        in_out[j] = device_output[index + j];
+    }
+    
+    using bscan_t = rocprim::block_scan<T, BlockSize, Algorithm>;
+    __shared__ typename bscan_t::storage_type storage;
+    rocprim::block_scan<T, BlockSize, Algorithm> bscan;
+    T reduction;
+    bscan.exclusive_scan(in_out, in_out, init, reduction, storage, BinaryOp());
+
+    // store
+    for(unsigned int j = 0; j < ItemsPerThread; j++)
+    {
+        device_output[index + j] = in_out[j];
+    }
+
+    if(threadIdx.x == 0)
+    {
+        device_output_reductions[blockIdx.x] = reduction;
+    }
+}
+
 // Test for scan
 template<
     class T,
@@ -772,7 +970,7 @@ template<
     rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan
 >
 auto test_block_scan_input_arrays()
--> typename std::enable_if<Method == 3>::type
+-> typename std::enable_if<Method == 6>::type
 {
     using binary_op_type = rocprim::maximum<T>;
 
@@ -787,8 +985,8 @@ auto test_block_scan_input_arrays()
     }
 
     const size_t items_per_block = block_size * items_per_thread;
-    const size_t size            = items_per_block * 19;
-    const size_t grid_size       = size / items_per_block;
+    const size_t size = items_per_block * 19;
+    const size_t grid_size = size / items_per_block;
 
     SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block);
     SCOPED_TRACE(testing::Message() << "with size = " << size);
@@ -801,18 +999,16 @@ auto test_block_scan_input_arrays()
 
         // Generate data
         std::vector<T> output = test_utils::get_random_data_wrapped<T>(size, 2, 100, seed_value);
-        const T init = test_utils::get_random_value<T>(0, 100, seed_value);
 
         // Calculate expected results on host
         std::vector<T> expected(output.size(), T(0));
         binary_op_type binary_op;
         for(size_t i = 0; i < output.size() / items_per_block; i++)
         {
-            expected[i * items_per_block] = init;
-            for(size_t j = 1; j < items_per_block; j++)
+            for(size_t j = 0; j < items_per_block; j++)
             {
                 auto idx = i * items_per_block + j;
-                expected[idx] = binary_op(output[idx-1], expected[idx-1]);
+                expected[idx] = binary_op(output[idx], expected[j > 0 ? idx-1 : idx]);
             }
         }
 
@@ -820,7 +1016,7 @@ auto test_block_scan_input_arrays()
         common::device_ptr<T> device_output(output);
 
         // Launching kernel
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_array_kernel<block_size,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_array_with_storage_kernel<block_size,
                                                                        items_per_thread,
                                                                        algorithm,
                                                                        T,
@@ -829,8 +1025,7 @@ auto test_block_scan_input_arrays()
                            dim3(block_size),
                            0,
                            0,
-                           device_output.get(),
-                           init);
+                           device_output.get());
 
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -841,7 +1036,6 @@ auto test_block_scan_input_arrays()
         // Validating results
         ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected));
     }
-
 }
 
 template<
@@ -852,7 +1046,7 @@ template<
     rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan
 >
 auto test_block_scan_input_arrays()
--> typename std::enable_if<Method == 4>::type
+-> typename std::enable_if<Method == 7>::type
 {
     using binary_op_type = rocprim::maximum<T>;
 
@@ -878,63 +1072,50 @@ auto test_block_scan_input_arrays()
     {
         unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
         SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
-
+        
         // Generate data
         std::vector<T> output = test_utils::get_random_data_wrapped<T>(size, 2, 100, seed_value);
-
-        // Output reduce results
-        std::vector<T> output_reductions(size / items_per_block);
-        const T init = test_utils::get_random_value<T>(0, 100, seed_value);
-
+        
         // Calculate expected results on host
         std::vector<T> expected(output.size(), T(0));
-        std::vector<T> expected_reductions(output_reductions.size(), T(0));
+        const T init = test_utils::get_random_value<T>(0, 100, seed_value);
         binary_op_type binary_op;
         for(size_t i = 0; i < output.size() / items_per_block; i++)
         {
             expected[i * items_per_block] = init;
-            for(size_t j = 1; j < items_per_block; j++)
-            {
-                auto idx = i * items_per_block + j;
-                expected[idx] = binary_op(output[idx-1], expected[idx-1]);
-            }
             for(size_t j = 0; j < items_per_block; j++)
             {
                 auto idx = i * items_per_block + j;
-                expected_reductions[i] = binary_op(expected_reductions[i], output[idx]);
+                expected[idx] = binary_op(output[idx], expected[j > 0 ? idx-1 : idx]);
             }
         }
 
         // Writing to device memory
         common::device_ptr<T> device_output(output);
-        common::device_ptr<T> device_output_reductions(output_reductions.size());
 
         // Launching kernel
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_reduce_array_kernel<block_size,
-                                                                              items_per_thread,
-                                                                              algorithm,
-                                                                              T,
-                                                                              binary_op_type>),
-                           dim3(grid_size),
-                           dim3(block_size),
-                           0,
-                           0,
-                           device_output.get(),
-                           device_output_reductions.get(),
-                           init);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_array_with_storage_kernel<block_size,
+                                                                       items_per_thread,
+                                                                       algorithm,
+                                                                       T,
+                                                                       binary_op_type>),
+                            dim3(grid_size),
+                            dim3(block_size),
+                            0,
+                            0,
+                            device_output.get(),
+                            init
+                        );
 
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
 
         // Read from device memory
-        output            = device_output.load();
-        output_reductions = device_output_reductions.load();
+        output = device_output.load();
 
         // Validating results
         ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected));
-        ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_reductions, expected_reductions));
     }
-
 }
 
 template<
@@ -945,7 +1126,7 @@ template<
     rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan
 >
 auto test_block_scan_input_arrays()
--> typename std::enable_if<Method == 5>::type
+-> typename std::enable_if<Method == 8>::type
 {
     using binary_op_type = rocprim::maximum<T>;
 
@@ -974,53 +1155,403 @@ auto test_block_scan_input_arrays()
 
         // Generate data
         std::vector<T> output = test_utils::get_random_data_wrapped<T>(size, 2, 100, seed_value);
-        std::vector<T> output_block_prefixes(size / items_per_block);
-        T block_prefix = test_utils::get_random_value<T>(0, 100, seed_value);
+
+        // Output reduce results
+        std::vector<T> output_reductions(size / block_size, T(0));
 
         // Calculate expected results on host
         std::vector<T> expected(output.size(), T(0));
-        std::vector<T> expected_block_prefixes(output_block_prefixes.size(), T(0));
+        std::vector<T> expected_reductions(output_reductions.size(), T(0));
         binary_op_type binary_op;
         for(size_t i = 0; i < output.size() / items_per_block; i++)
         {
-            expected[i * items_per_block] = block_prefix;
-            for(size_t j = 1; j < items_per_block; j++)
-            {
-                auto idx = i * items_per_block + j;
-                expected[idx] = binary_op(output[idx-1], expected[idx-1]);
-            }
-            expected_block_prefixes[i] = block_prefix;
             for(size_t j = 0; j < items_per_block; j++)
             {
                 auto idx = i * items_per_block + j;
-                expected_block_prefixes[i] = binary_op(expected_block_prefixes[i], output[idx]);
+                expected[idx] = binary_op(output[idx], expected[j > 0 ? idx-1 : idx]);
             }
+            expected_reductions[i] = expected[(i+1) * items_per_block - 1];
         }
 
         // Writing to device memory
         common::device_ptr<T> device_output(output);
-        common::device_ptr<T> device_output_bp(output_block_prefixes.size());
+        common::device_ptr<T> device_output_reductions(output_reductions);
 
         // Launching kernel
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(exclusive_scan_prefix_callback_array_kernel<block_size,
-                                                                        items_per_thread,
-                                                                        algorithm,
-                                                                        T,
-                                                                        binary_op_type>),
-            dim3(grid_size),
-            dim3(block_size),
-            0,
-            0,
-            device_output.get(),
-            device_output_bp.get(),
-            block_prefix);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_reduce_array_with_storage_kernel<block_size,
+                                                                              items_per_thread,
+                                                                              algorithm,
+                                                                              T,
+                                                                              binary_op_type>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_output.get(),
+                           device_output_reductions.get());
 
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
 
         // Read from device memory
-        output                = device_output.load();
+        output            = device_output.load();
+        output_reductions = device_output_reductions.load();
+
+        // Validating results
+        ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected));
+        ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_reductions, expected_reductions));
+    }
+
+}
+
+template<
+    class T,
+    int Method,
+    unsigned int BlockSize = 256U,
+    unsigned int ItemsPerThread = 1U,
+    rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan
+>
+auto test_block_scan_input_arrays()
+-> typename std::enable_if<Method == 9>::type
+{
+    using binary_op_type = rocprim::maximum<T>;
+
+    static constexpr auto algorithm = Algorithm;
+    static constexpr size_t block_size = BlockSize;
+    static constexpr size_t items_per_thread = ItemsPerThread;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size = items_per_block * 19;
+    const size_t grid_size = size / items_per_block;
+
+    SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block);
+    SCOPED_TRACE(testing::Message() << "with size = " << size);
+    SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size);
+
+    for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++)
+    {
+        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data_wrapped<T>(size, 2, 100, seed_value);
+
+        // Output reduce results
+        std::vector<T> output_reductions(size / block_size, T(0));
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), T(0));
+        std::vector<T> expected_reductions(output_reductions.size(), T(0));
+        const T init = test_utils::get_random_value<T>(0, 100, seed_value);
+        
+        binary_op_type binary_op;
+        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        {
+            expected[i * items_per_block] = init;
+            for(size_t j = 0; j < items_per_block; j++)
+            {
+                auto idx = i * items_per_block + j;
+                expected[idx] = binary_op(output[idx], expected[j > 0 ? idx-1 : idx]);
+            }
+            expected_reductions[i] = expected[(i+1) * items_per_block - 1];
+        }
+
+        // Writing to device memory
+        common::device_ptr<T> device_output(output);
+        common::device_ptr<T> device_output_reductions(output_reductions);
+
+        // Launching kernel
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_reduce_array_with_storage_kernel<block_size,
+                                                                              items_per_thread,
+                                                                              algorithm,
+                                                                              T,
+                                                                              binary_op_type>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_output.get(),
+                           device_output_reductions.get(),
+                           init
+                        );
+
+        HIP_CHECK(hipGetLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        output            = device_output.load();
+        output_reductions = device_output_reductions.load();
+
+        // Validating results
+        ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected));
+        ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_reductions, expected_reductions));
+    }
+
+}
+
+template<
+    class T,
+    int Method,
+    unsigned int BlockSize = 256U,
+    unsigned int ItemsPerThread = 1U,
+    rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan
+>
+auto test_block_scan_input_arrays()
+-> typename std::enable_if<Method == 3>::type
+{
+    using binary_op_type = rocprim::maximum<T>;
+
+    static constexpr auto algorithm = Algorithm;
+    static constexpr size_t block_size = BlockSize;
+    static constexpr size_t items_per_thread = ItemsPerThread;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size            = items_per_block * 19;
+    const size_t grid_size       = size / items_per_block;
+
+    SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block);
+    SCOPED_TRACE(testing::Message() << "with size = " << size);
+    SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size);
+
+    for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++)
+    {
+        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data_wrapped<T>(size, 2, 100, seed_value);
+        const T init = test_utils::get_random_value<T>(0, 100, seed_value);
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), T(0));
+        binary_op_type binary_op;
+        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        {
+            expected[i * items_per_block] = init;
+            for(size_t j = 1; j < items_per_block; j++)
+            {
+                auto idx = i * items_per_block + j;
+                expected[idx] = binary_op(output[idx-1], expected[idx-1]);
+            }
+        }
+
+        // Writing to device memory
+        common::device_ptr<T> device_output(output);
+
+        // Launching kernel
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_array_kernel<block_size,
+                                                                       items_per_thread,
+                                                                       algorithm,
+                                                                       T,
+                                                                       binary_op_type>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_output.get(),
+                           init);
+
+        HIP_CHECK(hipGetLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        output = device_output.load();
+
+        // Validating results
+        ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected));
+    }
+
+}
+
+template<
+    class T,
+    int Method,
+    unsigned int BlockSize = 256U,
+    unsigned int ItemsPerThread = 1U,
+    rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan
+>
+auto test_block_scan_input_arrays()
+-> typename std::enable_if<Method == 4>::type
+{
+    using binary_op_type = rocprim::maximum<T>;
+
+    static constexpr auto algorithm = Algorithm;
+    static constexpr size_t block_size = BlockSize;
+    static constexpr size_t items_per_thread = ItemsPerThread;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size = items_per_block * 19;
+    const size_t grid_size = size / items_per_block;
+
+    SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block);
+    SCOPED_TRACE(testing::Message() << "with size = " << size);
+    SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size);
+
+    for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++)
+    {
+        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data_wrapped<T>(size, 2, 100, seed_value);
+
+        // Output reduce results
+        std::vector<T> output_reductions(size / items_per_block);
+        const T init = test_utils::get_random_value<T>(0, 100, seed_value);
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), T(0));
+        std::vector<T> expected_reductions(output_reductions.size(), T(0));
+        binary_op_type binary_op;
+        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        {
+            expected[i * items_per_block] = init;
+            for(size_t j = 1; j < items_per_block; j++)
+            {
+                auto idx = i * items_per_block + j;
+                expected[idx] = binary_op(output[idx-1], expected[idx-1]);
+            }
+            for(size_t j = 0; j < items_per_block; j++)
+            {
+                auto idx = i * items_per_block + j;
+                expected_reductions[i] = binary_op(expected_reductions[i], output[idx]);
+            }
+        }
+
+        // Writing to device memory
+        common::device_ptr<T> device_output(output);
+        common::device_ptr<T> device_output_reductions(output_reductions.size());
+
+        // Launching kernel
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_reduce_array_kernel<block_size,
+                                                                              items_per_thread,
+                                                                              algorithm,
+                                                                              T,
+                                                                              binary_op_type>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_output.get(),
+                           device_output_reductions.get(),
+                           init);
+
+        HIP_CHECK(hipGetLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        output            = device_output.load();
+        output_reductions = device_output_reductions.load();
+
+        // Validating results
+        ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected));
+        ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_reductions, expected_reductions));
+    }
+
+}
+
+template<
+    class T,
+    int Method,
+    unsigned int BlockSize = 256U,
+    unsigned int ItemsPerThread = 1U,
+    rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan
+>
+auto test_block_scan_input_arrays()
+-> typename std::enable_if<Method == 5>::type
+{
+    using binary_op_type = rocprim::maximum<T>;
+
+    static constexpr auto algorithm = Algorithm;
+    static constexpr size_t block_size = BlockSize;
+    static constexpr size_t items_per_thread = ItemsPerThread;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size = items_per_block * 19;
+    const size_t grid_size = size / items_per_block;
+
+    SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block);
+    SCOPED_TRACE(testing::Message() << "with size = " << size);
+    SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size);
+
+    for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++)
+    {
+        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data_wrapped<T>(size, 2, 100, seed_value);
+        std::vector<T> output_block_prefixes(size / items_per_block);
+        T block_prefix = test_utils::get_random_value<T>(0, 100, seed_value);
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), T(0));
+        std::vector<T> expected_block_prefixes(output_block_prefixes.size(), T(0));
+        binary_op_type binary_op;
+        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        {
+            expected[i * items_per_block] = block_prefix;
+            for(size_t j = 1; j < items_per_block; j++)
+            {
+                auto idx = i * items_per_block + j;
+                expected[idx] = binary_op(output[idx-1], expected[idx-1]);
+            }
+            expected_block_prefixes[i] = block_prefix;
+            for(size_t j = 0; j < items_per_block; j++)
+            {
+                auto idx = i * items_per_block + j;
+                expected_block_prefixes[i] = binary_op(expected_block_prefixes[i], output[idx]);
+            }
+        }
+
+        // Writing to device memory
+        common::device_ptr<T> device_output(output);
+        common::device_ptr<T> device_output_bp(output_block_prefixes.size());
+
+        // Launching kernel
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(exclusive_scan_prefix_callback_array_kernel<block_size,
+                                                                        items_per_thread,
+                                                                        algorithm,
+                                                                        T,
+                                                                        binary_op_type>),
+            dim3(grid_size),
+            dim3(block_size),
+            0,
+            0,
+            device_output.get(),
+            device_output_bp.get(),
+            block_prefix);
+
+        HIP_CHECK(hipGetLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        output                = device_output.load();
         output_block_prefixes = device_output_bp.load();
 
         // Validating results
@@ -1031,6 +1562,179 @@ auto test_block_scan_input_arrays()
 
 }
 
+template<
+    class T,
+    int Method,
+    unsigned int BlockSize = 256U,
+    unsigned int ItemsPerThread = 1U,
+    rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan
+>
+auto test_block_scan_input_arrays()
+-> typename std::enable_if<Method == 10>::type
+{
+    using binary_op_type = rocprim::maximum<T>;
+
+    static constexpr auto algorithm = Algorithm;
+    static constexpr size_t block_size = BlockSize;
+    static constexpr size_t items_per_thread = ItemsPerThread;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size            = items_per_block * 19;
+    const size_t grid_size       = size / items_per_block;
+
+    SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block);
+    SCOPED_TRACE(testing::Message() << "with size = " << size);
+    SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size);
+
+    for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++)
+    {
+        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data_wrapped<T>(size, 2, 100, seed_value);
+        const T init = test_utils::get_random_value<T>(0, 100, seed_value);
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), T(0));
+        binary_op_type binary_op;
+        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        {
+            expected[i * items_per_block] = init;
+            for(size_t j = 1; j < items_per_block; j++)
+            {
+                auto idx = i * items_per_block + j;
+                expected[idx] = binary_op(output[idx-1], expected[idx-1]);
+            }
+        }
+
+        // Writing to device memory
+        common::device_ptr<T> device_output(output);
+
+        // Launching kernel
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_array_with_storage_kernel<block_size,
+                                                                       items_per_thread,
+                                                                       algorithm,
+                                                                       T,
+                                                                       binary_op_type>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_output.get(),
+                           init);
+
+        HIP_CHECK(hipGetLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        output = device_output.load();
+
+        // Validating results
+        ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected));
+    }
+
+}
+
+template<
+    class T,
+    int Method,
+    unsigned int BlockSize = 256U,
+    unsigned int ItemsPerThread = 1U,
+    rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan
+>
+auto test_block_scan_input_arrays()
+-> typename std::enable_if<Method == 11>::type
+{
+    using binary_op_type = rocprim::maximum<T>;
+
+    static constexpr auto algorithm = Algorithm;
+    static constexpr size_t block_size = BlockSize;
+    static constexpr size_t items_per_thread = ItemsPerThread;
+
+    // Given block size not supported
+    if(block_size > test_utils::get_max_block_size())
+    {
+        return;
+    }
+
+    const size_t items_per_block = block_size * items_per_thread;
+    const size_t size = items_per_block * 19;
+    const size_t grid_size = size / items_per_block;
+
+    SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block);
+    SCOPED_TRACE(testing::Message() << "with size = " << size);
+    SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size);
+
+    for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++)
+    {
+        unsigned int seed_value = seed_index < random_seeds_count  ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+
+        // Generate data
+        std::vector<T> output = test_utils::get_random_data_wrapped<T>(size, 2, 100, seed_value);
+
+        // Output reduce results
+        std::vector<T> output_reductions(size / items_per_block);
+        const T init = test_utils::get_random_value<T>(0, 100, seed_value);
+
+        // Calculate expected results on host
+        std::vector<T> expected(output.size(), T(0));
+        std::vector<T> expected_reductions(output_reductions.size(), T(0));
+        binary_op_type binary_op;
+        for(size_t i = 0; i < output.size() / items_per_block; i++)
+        {
+            expected[i * items_per_block] = init;
+            for(size_t j = 1; j < items_per_block; j++)
+            {
+                auto idx = i * items_per_block + j;
+                expected[idx] = binary_op(output[idx-1], expected[idx-1]);
+            }
+            for(size_t j = 0; j < items_per_block; j++)
+            {
+                auto idx = i * items_per_block + j;
+                expected_reductions[i] = binary_op(expected_reductions[i], output[idx]);
+            }
+        }
+
+        // Writing to device memory
+        common::device_ptr<T> device_output(output);
+        common::device_ptr<T> device_output_reductions(output_reductions.size());
+
+        // Launching kernel
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_reduce_array_with_storage_kernel<block_size,
+                                                                              items_per_thread,
+                                                                              algorithm,
+                                                                              T,
+                                                                              binary_op_type>),
+                           dim3(grid_size),
+                           dim3(block_size),
+                           0,
+                           0,
+                           device_output.get(),
+                           device_output_reductions.get(),
+                           init);
+
+        HIP_CHECK(hipGetLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Read from device memory
+        output            = device_output.load();
+        output_reductions = device_output_reductions.load();
+
+        // Validating results
+        ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected));
+        ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_reductions, expected_reductions));
+    }
+
+}
+
 // Static for-loop
 template<unsigned int First, unsigned int Last, class T, int Method, unsigned int BlockSize = 256U>
 struct static_for_input_array
diff --git a/test/rocprim/test_block_sort.hpp b/test/rocprim/test_block_sort.hpp
index 89fa0fde6..f57543a83 100644
--- a/test/rocprim/test_block_sort.hpp
+++ b/test/rocprim/test_block_sort.hpp
@@ -333,6 +333,287 @@ void TestSortStableKey(std::vector<size_t> sizes)
         }
     }
 }
+
+template<unsigned int block_size,
+         unsigned int items_per_thread,
+         class key_type,
+         class value_type,
+         rocprim::block_sort_algorithm algo,
+         class binary_op_type>
+void TestSortKeyNoSize()
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id = " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    static constexpr const unsigned int items_per_block = block_size * items_per_thread;
+    hipStream_t                         stream          = 0; // default
+
+    if(!is_buildable(block_size, items_per_thread, algo))
+    {
+        GTEST_SKIP();
+    }
+
+    for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+
+        size_t grid_size = 1134; 
+        size_t size = items_per_block * grid_size; 
+        SCOPED_TRACE(testing::Message() << "with size = " << size);
+        // Generate data
+        std::vector<key_type> output
+            = test_utils::get_random_data_wrapped<key_type>(size, -100, 100, seed_value);
+
+        // Calculate expected results on host
+        std::vector<key_type> expected(output);
+        binary_op_type        binary_op;
+        for(size_t i = 0; i < grid_size; i++)
+        {
+            std::sort(expected.begin() + (i * items_per_block),
+                        expected.begin() + std::min(size, ((i + 1) * items_per_block)),
+                        binary_op);
+        }
+
+        // Preparing device
+        common::device_ptr<key_type> device_key_output(output);
+
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_keys_kernel_no_size<block_size,
+                                                            items_per_thread,
+                                                            key_type*,
+                                                            algo,
+                                                            binary_op_type>),
+                            dim3(grid_size),
+                            dim3(block_size),
+                            0,
+                            stream,
+                            device_key_output.get()
+                        );
+
+        // Reading results back
+        output = device_key_output.load();
+
+        test_utils::assert_eq(output, expected);
+    }
+}
+
+template<unsigned int block_size,
+         unsigned int items_per_thread,
+         class key_type,
+         class value_type,
+         rocprim::block_sort_algorithm algo,
+         class binary_op_type>
+void TestSortKeyValueNoSize()
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id = " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    static constexpr const unsigned int items_per_block = block_size * items_per_thread;
+    static constexpr const size_t       grid_size       = 1134;
+    static constexpr const size_t       size            = items_per_block * grid_size;
+    hipStream_t                         stream          = 0; // default
+
+    if(!is_buildable(block_size, items_per_thread, algo))
+    {
+        GTEST_SKIP();
+    }
+
+    for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+        SCOPED_TRACE(testing::Message() << "with size = " << size);
+
+        // Generate data
+        std::vector<key_type> output_key
+            = test_utils::get_random_data_wrapped<key_type>(size, 0, 100, seed_value);
+        std::vector<value_type> output_value
+            = test_utils::get_random_data_wrapped<value_type>(size, -100, 100, seed_value);
+
+        // Combine vectors to form pairs with key and value
+        std::vector<std::pair<key_type, value_type>> target(size);
+        for(unsigned i = 0; i < target.size(); i++)
+            target[i] = std::make_pair(output_key[i], output_value[i]);
+
+        // Calculate expected results on host
+        using key_value = std::pair<key_type, value_type>;
+        std::vector<key_value> expected(target);
+        constexpr bool descending = !std::is_same<binary_op_type, rocprim::less<key_type>>::value;
+        for(size_t i = 0; i < expected.size() / items_per_block; i++)
+        {
+            std::sort(expected.begin() + (i * items_per_block),
+                      expected.begin() + ((i + 1) * items_per_block),
+                      test_utils::key_value_comparator<key_type,
+                                                       value_type,
+                                                       descending,
+                                                       0,
+                                                       sizeof(key_type) * 8>());
+        }
+
+        // Preparing device
+        common::device_ptr<key_type>   device_key_output(output_key);
+        common::device_ptr<value_type> device_value_output(output_value);
+
+        // Running kernel, ignored if invalid size
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_pairs_kernel_no_size<block_size,
+                                                                items_per_thread,
+                                                                key_type,
+                                                                value_type,
+                                                                algo,
+                                                                binary_op_type>),
+                            dim3(grid_size),
+                            dim3(block_size),
+                            0,
+                            stream,
+                            device_key_output.get(),
+                            device_value_output.get());
+        HIP_CHECK(hipGetLastError());
+
+        // Reading results back
+        output_key   = device_key_output.load();
+        output_value = device_value_output.load();
+
+        std::vector<key_type>   expected_key(expected.size());
+        std::vector<value_type> expected_value(expected.size());
+        for(size_t i = 0; i < expected.size(); i++)
+        {
+            expected_key[i]   = expected[i].first;
+            expected_value[i] = expected[i].second;
+        }
+
+        // Keys are sorted, Values order not guaranteed
+        // Sort subsets where key was the same to make sure all values are still present
+        using value_op_type = rocprim::less<value_type>;
+        using eq_op_type    = rocprim::equal_to<key_type>;
+        value_op_type value_op;
+        eq_op_type    eq_op;
+        for(size_t i = 0; i < output_key.size();)
+        {
+            auto j = i;
+            for(; j < output_key.size() && eq_op(output_key[j], output_key[i]); ++j)
+            {}
+            std::sort(output_value.begin() + i, output_value.begin() + j, value_op);
+            std::sort(expected_value.begin() + i, expected_value.begin() + j, value_op);
+            i = j;
+        }
+
+        test_utils::assert_eq(output_key, expected_key);
+        test_utils::assert_eq(output_value, expected_value);
+    }
+}
+
+template<unsigned int block_size,
+         unsigned int items_per_thread,
+         class key_type,
+         class value_type,
+         rocprim::block_sort_algorithm algo,
+         class binary_op_type>
+void TestSortKeyValueWithSize()
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id = " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    static constexpr const unsigned int items_per_block = block_size * items_per_thread;
+    static constexpr const size_t       grid_size       = 1134;
+    static constexpr const size_t       size            = items_per_block * grid_size;
+    hipStream_t                         stream          = 0; // default
+
+    if(!is_buildable(block_size, items_per_thread, algo))
+    {
+        GTEST_SKIP();
+    }
+
+    for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+        SCOPED_TRACE(testing::Message() << "with size = " << size);
+
+        // Generate data
+        std::vector<key_type> output_key
+            = test_utils::get_random_data_wrapped<key_type>(size, 0, 100, seed_value);
+        std::vector<value_type> output_value
+            = test_utils::get_random_data_wrapped<value_type>(size, -100, 100, seed_value);
+
+        // Combine vectors to form pairs with key and value
+        std::vector<std::pair<key_type, value_type>> target(size);
+        for(unsigned i = 0; i < target.size(); i++)
+            target[i] = std::make_pair(output_key[i], output_value[i]);
+
+        // Calculate expected results on host
+        using key_value = std::pair<key_type, value_type>;
+        std::vector<key_value> expected(target);
+        constexpr bool descending = !std::is_same<binary_op_type, rocprim::less<key_type>>::value;
+        for(size_t i = 0; i < expected.size() / items_per_block; i++)
+        {
+            std::sort(expected.begin() + (i * items_per_block),
+                      expected.begin() + ((i + 1) * items_per_block),
+                      test_utils::key_value_comparator<key_type,
+                                                       value_type,
+                                                       descending,
+                                                       0,
+                                                       sizeof(key_type) * 8>());
+        }
+
+        // Preparing device
+        common::device_ptr<key_type>   device_key_output(output_key);
+        common::device_ptr<value_type> device_value_output(output_value);
+
+        // Running kernel, ignored if invalid size
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_pairs_kernel_with_size<block_size,
+                                                                items_per_thread,
+                                                                key_type,
+                                                                value_type,
+                                                                algo,
+                                                                binary_op_type>),
+                            dim3(grid_size),
+                            dim3(block_size),
+                            0,
+                            stream,
+                            device_key_output.get(),
+                            device_value_output.get(),
+                            items_per_block);
+        HIP_CHECK(hipGetLastError());
+
+        // Reading results back
+        output_key   = device_key_output.load();
+        output_value = device_value_output.load();
+
+        std::vector<key_type>   expected_key(expected.size());
+        std::vector<value_type> expected_value(expected.size());
+        for(size_t i = 0; i < expected.size(); i++)
+        {
+            expected_key[i]   = expected[i].first;
+            expected_value[i] = expected[i].second;
+        }
+
+        // Keys are sorted, Values order not guaranteed
+        // Sort subsets where key was the same to make sure all values are still present
+        using value_op_type = rocprim::less<value_type>;
+        using eq_op_type    = rocprim::equal_to<key_type>;
+        value_op_type value_op;
+        eq_op_type    eq_op;
+        for(size_t i = 0; i < output_key.size();)
+        {
+            auto j = i;
+            for(; j < output_key.size() && eq_op(output_key[j], output_key[i]); ++j)
+            {}
+            std::sort(output_value.begin() + i, output_value.begin() + j, value_op);
+            std::sort(expected_value.begin() + i, expected_value.begin() + j, value_op);
+            i = j;
+        }
+
+        test_utils::assert_eq(output_key, expected_key);
+        test_utils::assert_eq(output_value, expected_value);
+    }
+}
+
 #endif // TEST_ROCPRIM_TEST_BLOCK_SORT_HPP_
 // This file is included multiple times in the test_block_sort_[algo].cpp file, because
 // the test definitions below this header guard need to be compiled for each test suites:
@@ -429,7 +710,7 @@ typed_test_def(suite_name,
     static constexpr const unsigned int                  block_size       = TestFixture::block_size;
     static constexpr const unsigned int                  items_per_thread = 4;
     std::vector<size_t> sizes
-        = {0, 53, 512, 5000, 34567, (1 << 17) - 1220, 1134 * 256, (1 << 20) - 123};
+        = {0,53, 512, 5000, 34567, (1 << 17) - 1220, 1134 * 256, (1 << 20) - 123};
     TestSortKey<block_size, items_per_thread, key_type, value_type, algo, binary_op_type>(sizes);
 }
 
@@ -443,3 +724,69 @@ typed_test_def(suite_name, name_suffix, SortKeyValueDesc)
     static constexpr const unsigned int                  items_per_thread = 1;
     TestSortKeyValue<block_size, items_per_thread, key_type, value_type, algo, binary_op_type>();
 }
+
+typed_test_def(suite_name, name_suffix, SortKeyNoSize)
+{
+    using key_type                                            = typename TestFixture::key_type;
+    using value_type                                          = typename TestFixture::value_type;
+    using binary_op_type                                      = typename rocprim::greater<key_type>;
+    static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM;
+    static constexpr const unsigned int                  block_size       = TestFixture::block_size;
+    static constexpr const unsigned int                  items_per_thread = 1;
+    TestSortKeyNoSize<block_size, items_per_thread, key_type, value_type, algo, binary_op_type>();
+}
+
+typed_test_def(suite_name, name_suffix, SortKeyNoSizeMultipleItemsPerThread)
+{
+    using key_type                                            = typename TestFixture::key_type;
+    using value_type                                          = typename TestFixture::value_type;
+    using binary_op_type                                      = typename rocprim::greater<key_type>;
+    static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM;
+    static constexpr const unsigned int                  block_size       = TestFixture::block_size;
+    static constexpr const unsigned int                  items_per_thread = 4;
+    TestSortKeyNoSize<block_size, items_per_thread, key_type, value_type, algo, binary_op_type>();
+}
+
+typed_test_def(suite_name, name_suffix, SortKeyValueNoSize)
+{
+    using key_type                                            = typename TestFixture::key_type;
+    using value_type                                          = typename TestFixture::value_type;
+    using binary_op_type                                      = typename rocprim::less<key_type>;
+    static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM;
+    static constexpr const unsigned int                  block_size       = TestFixture::block_size;
+    static constexpr const unsigned int                  items_per_thread = 1;
+    TestSortKeyValueNoSize<block_size, items_per_thread, key_type, value_type, algo, binary_op_type>();
+}
+
+typed_test_def(suite_name, name_suffix, SortKeyValueNoSizeMultipleItemsPerThread)
+{
+    using key_type                                            = typename TestFixture::key_type;
+    using value_type                                          = typename TestFixture::value_type;
+    using binary_op_type                                      = typename rocprim::less<key_type>;
+    static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM;
+    static constexpr const unsigned int                  block_size       = TestFixture::block_size;
+    static constexpr const unsigned int                  items_per_thread = 4;
+    TestSortKeyValueNoSize<block_size, items_per_thread, key_type, value_type, algo, binary_op_type>();
+}
+
+typed_test_def(suite_name, name_suffix, SortKeyValueWithSize)
+{
+    using key_type                                            = typename TestFixture::key_type;
+    using value_type                                          = typename TestFixture::value_type;
+    using binary_op_type                                      = typename rocprim::less<key_type>;
+    static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM;
+    static constexpr const unsigned int                  block_size       = TestFixture::block_size;
+    static constexpr const unsigned int                  items_per_thread = 1;
+    TestSortKeyValueWithSize<block_size, items_per_thread, key_type, value_type, algo, binary_op_type>();
+}
+
+typed_test_def(suite_name, name_suffix, SortKeyValueWithSizeMultipleItemsPerThread)
+{
+    using key_type                                            = typename TestFixture::key_type;
+    using value_type                                          = typename TestFixture::value_type;
+    using binary_op_type                                      = typename rocprim::less<key_type>;
+    static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM;
+    static constexpr const unsigned int                  block_size       = TestFixture::block_size;
+    static constexpr const unsigned int                  items_per_thread = 4;
+    TestSortKeyValueWithSize<block_size, items_per_thread, key_type, value_type, algo, binary_op_type>();
+}
diff --git a/test/rocprim/test_block_sort.kernels.hpp b/test/rocprim/test_block_sort.kernels.hpp
index d595471d3..6aab8eaee 100644
--- a/test/rocprim/test_block_sort.kernels.hpp
+++ b/test/rocprim/test_block_sort.kernels.hpp
@@ -237,4 +237,256 @@ __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(key_type* /*keys*
                                                                OffsetT /*size*/)
 {}
 
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    class KeyIterator,
+    rocprim::block_sort_algorithm algorithm,
+    class BinaryOp = rocprim::less<typename std::iterator_traits<KeyIterator>::value_type>,
+    std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)),
+                     int>
+    = 0>
+__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator keys)
+{
+    using key_type = typename std::iterator_traits<KeyIterator>::value_type;
+    
+    static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize;
+    const unsigned int                  block_offset  = blockIdx.x * ItemsPerBlock;
+    const unsigned int                  index         = block_offset + threadIdx.x * ItemsPerThread;
+
+    using bsort_type
+        = rocprim::block_sort<key_type, BlockSize, ItemsPerThread, rocprim::empty_type, algorithm>;
+    ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage;
+    
+    key_type thread_key = keys[index];
+
+    bsort_type().sort(thread_key,
+                        storage,
+                        BinaryOp());
+
+    keys[index] = thread_key;
+}
+
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    class KeyIterator,
+    rocprim::block_sort_algorithm algorithm,
+    class BinaryOp = rocprim::less<typename std::iterator_traits<KeyIterator>::value_type>,
+    std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)),
+                     int>
+    = 0>
+__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator keys)
+{
+    using key_type = typename std::iterator_traits<KeyIterator>::value_type;
+    
+    static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize;
+    const unsigned int                  block_offset  = blockIdx.x * ItemsPerBlock;
+    const unsigned int                  index         = block_offset + threadIdx.x * ItemsPerThread;
+
+    using bsort_type
+        = rocprim::block_sort<key_type, BlockSize, ItemsPerThread, rocprim::empty_type, algorithm>;
+    ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage;
+    
+    key_type thread_keys[ItemsPerThread];
+    for(size_t i = 0; i < ItemsPerThread; i++)
+        thread_keys[i] = keys[index + i];
+    // 
+    bsort_type().sort(thread_keys,
+                        storage,
+                        BinaryOp());
+    for(size_t i = 0; i < ItemsPerThread; i++)
+        keys[index + i] = thread_keys[i];
+}
+
+template<unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         class KeyIterator,
+         rocprim::block_sort_algorithm algorithm,
+         class BinaryOp = rocprim::less<typename std::iterator_traits<KeyIterator>::value_type>,
+         std::enable_if_t<!is_buildable(BlockSize, ItemsPerThread, algorithm), int> = 0>
+__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator)
+{}
+
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    class key_type,
+    class value_type,
+    rocprim::block_sort_algorithm algorithm,
+    class BinaryOp = rocprim::less<key_type>,
+    std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)),
+                    int> 
+    = 0>
+    
+__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type*   keys,
+                                                               value_type* values)
+{
+    using bsort_type
+        = rocprim::block_sort<key_type, BlockSize, ItemsPerThread, value_type, algorithm>;
+
+    ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage;
+
+    static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize;
+    const unsigned int                  block_offset  = blockIdx.x * ItemsPerBlock;
+    const unsigned int                  index         = block_offset + (threadIdx.x * ItemsPerThread);
+    key_type thread_key = keys[index];
+    value_type thread_value = values[index];
+
+    bsort_type().sort(thread_key,
+                        thread_value,
+                        storage,
+                        BinaryOp()
+    );
+    keys[index]   = thread_key;
+    values[index] = thread_value;
+}
+
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    class key_type,
+    class value_type,
+    rocprim::block_sort_algorithm algorithm,
+    class BinaryOp = rocprim::less<key_type>,
+    std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)),
+                    int> 
+    = 0>
+    
+__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type*   keys,
+                                                               value_type* values)
+{
+    using bsort_type
+        = rocprim::block_sort<key_type, BlockSize, ItemsPerThread, value_type, algorithm>;
+
+    ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage;
+
+    static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize;
+    const unsigned int                  block_offset  = blockIdx.x * ItemsPerBlock;
+    const unsigned int                  index         = block_offset + (threadIdx.x * ItemsPerThread);
+
+    key_type thread_keys[ItemsPerThread];
+    value_type thread_value[ItemsPerThread];
+    
+    for(size_t i = 0; i < ItemsPerThread; i++){
+        thread_keys[i] = keys[index + i];
+        thread_value[i] = values[index + i];
+    }
+    bsort_type().sort(thread_keys,
+                    thread_value,
+                    storage,
+                    BinaryOp());
+    for(size_t i = 0; i < ItemsPerThread; i++){
+        keys[index + i] = thread_keys[i];
+        values[index + i] = thread_value[i];
+    }
+}
+
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    class key_type,
+    class value_type,
+    rocprim::block_sort_algorithm algorithm,
+    class BinaryOp = rocprim::less<key_type>,
+             std::enable_if_t<!is_buildable(BlockSize, ItemsPerThread, algorithm), int> = 0>
+__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type*   keys,
+    value_type* values)
+{}
+
+
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    class key_type,
+    class value_type,
+    rocprim::block_sort_algorithm algorithm,
+    class BinaryOp = rocprim::less<key_type>,
+    std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)),
+                    int> 
+    = 0>
+    
+__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type*   keys,
+                                                                        value_type* values,
+                                                                        const unsigned int size
+                                                                    )
+{
+    using bsort_type
+        = rocprim::block_sort<key_type, BlockSize, ItemsPerThread, value_type, algorithm>;
+
+    ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage;
+
+    static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize;
+    const unsigned int                  block_offset  = blockIdx.x * ItemsPerBlock;
+    const unsigned int                  index         = block_offset + (threadIdx.x * ItemsPerThread);
+
+    key_type thread_keys = keys[index];
+    value_type thread_value = values[index];
+    
+    bsort_type().sort(thread_keys,
+                    thread_value,
+                    storage,
+                    size,
+                    BinaryOp());
+    
+    keys[index] = thread_keys;
+    values[index] = thread_value;
+}
+
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    class key_type,
+    class value_type,
+    rocprim::block_sort_algorithm algorithm,
+    class BinaryOp = rocprim::less<key_type>,
+    std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)),
+                    int> 
+    = 0>
+    
+__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type*   keys,
+                                                                        value_type* values,
+                                                                        const unsigned int size
+                                                                    )
+{
+    using bsort_type
+        = rocprim::block_sort<key_type, BlockSize, ItemsPerThread, value_type, algorithm>;
+
+    ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage;
+
+    static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize;
+    const unsigned int                  block_offset  = blockIdx.x * ItemsPerBlock;
+    const unsigned int                  index         = block_offset + (threadIdx.x * ItemsPerThread);
+
+    key_type thread_keys[ItemsPerThread]; 
+    value_type thread_value[ItemsPerThread];
+
+    for(size_t i = 0; i < ItemsPerThread; i++){
+        thread_keys[i] = keys[index + i];
+        thread_value[i] = values[index + i];
+    }
+    
+    bsort_type().sort(thread_keys,
+                    thread_value,
+                    storage,
+                    size,
+                    BinaryOp());
+    
+    for(size_t i = 0; i < ItemsPerThread; i++){
+        keys[index + i] = thread_keys[i];
+        values[index + i] = thread_value[i];
+    }
+}
+
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    class key_type,
+    class value_type,
+    rocprim::block_sort_algorithm algorithm,
+    class BinaryOp = rocprim::less<key_type>,
+             std::enable_if_t<!is_buildable(BlockSize, ItemsPerThread, algorithm), int> = 0>
+__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type*   keys,
+    value_type* values, const unsigned int size)
+{}
 #endif // TEST_BLOCK_SORT_KERNELS_HPP_
diff --git a/test/rocprim/test_utils_types.hpp b/test/rocprim/test_utils_types.hpp
index 25b97fb9f..ad1f57e27 100644
--- a/test/rocprim/test_utils_types.hpp
+++ b/test/rocprim/test_utils_types.hpp
@@ -271,8 +271,18 @@ static constexpr unsigned int items[n_items] = {
         static constexpr unsigned int block_size = Params::block_size; \
     };
 
+#define block_store_test_suite_type_def_helper(name, suffix) \
+    template<class Params> \
+    class name ## suffix : public ::testing::Test { \
+    public: \
+        using DataType = typename Params::input_type; \
+        static constexpr unsigned int block_size = Params::block_size; \
+    };
+
 #define block_sort_test_suite_type_def(name, suffix) block_sort_test_suite_type_def_helper(name, suffix)
 
+#define block_store_test_suite_type_def(name, suffix) block_store_test_suite_type_def_helper(name, suffix)
+
 #define typed_test_suite_def_helper(name, suffix, params) TYPED_TEST_SUITE(name ## suffix, params)
 
 #define typed_test_suite_def(name, suffix, params) typed_test_suite_def_helper(name, suffix, params)