From 24dbb7a7ec198209203906c0b17f60415a5e62c1 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 25 Mar 2025 13:11:45 -0600 Subject: [PATCH 01/14] completed missing inclusive scan tests --- test/rocprim/test_block_scan.hpp | 29 ++ test/rocprim/test_block_scan.kernels.hpp | 465 +++++++++++++++++++++++ 2 files changed, 494 insertions(+) diff --git a/test/rocprim/test_block_scan.hpp b/test/rocprim/test_block_scan.hpp index 32ac8c51f68..28be596eccd 100644 --- a/test/rocprim/test_block_scan.hpp +++ b/test/rocprim/test_block_scan.hpp @@ -664,6 +664,34 @@ typed_test_def(suite_name_array, name_suffix, InclusiveScanPrefixCallback) static_for_input_array<0, 2, T, 2, block_size>::run(); } +typed_test_def(suite_name_array, name_suffix, InclusiveScanWithStorage) +{ + using T = typename TestFixture::input_type; + constexpr size_t block_size = TestFixture::block_size; + + static_for_input_array<0, 2, T, 6, block_size>::run(); +} +typed_test_def(suite_name_array, name_suffix, InclusiveScanWithStorageAndInit) +{ + using T = typename TestFixture::input_type; + constexpr size_t block_size = TestFixture::block_size; + + static_for_input_array<0, 2, T, 7, block_size>::run(); +} +typed_test_def(suite_name_array, name_suffix, InclusiveScanReduceWithStorage) +{ + using T = typename TestFixture::input_type; + constexpr size_t block_size = TestFixture::block_size; + + static_for_input_array<0, 2, T, 8, block_size>::run(); +} +typed_test_def(suite_name_array, name_suffix, InclusiveScanReduceWithStorageAndInit) +{ + using T = typename TestFixture::input_type; + constexpr size_t block_size = TestFixture::block_size; + + static_for_input_array<0, 2, T, 9, block_size>::run(); +} typed_test_def(suite_name_array, name_suffix, ExclusiveScan) { using T = typename TestFixture::input_type; @@ -687,3 +715,4 @@ typed_test_def(suite_name_array, name_suffix, ExclusiveScanPrefixCallback) static_for_input_array<0, 2, T, 5, block_size>::run(); } + diff --git a/test/rocprim/test_block_scan.kernels.hpp b/test/rocprim/test_block_scan.kernels.hpp index 4eca2c418a5..cb140da758b 100644 --- a/test/rocprim/test_block_scan.kernels.hpp +++ b/test/rocprim/test_block_scan.kernels.hpp @@ -400,6 +400,138 @@ void inclusive_scan_array_prefix_callback_kernel(T* device_output, T* device_out } } +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + rocprim::block_scan_algorithm Algorithm, + class T, + class BinaryOp +> +__global__ +__launch_bounds__(BlockSize) +void inclusive_scan_array_with_storage_kernel(T * device_output){ + + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; + + T input[ItemsPerThread]; + + for(size_t i = 0; i < ItemsPerThread; i++) + input[i] = device_output[i + index]; + + + using bscan_t = rocprim::block_scan; + __shared__ typename bscan_t::storage_type storage; + rocprim::block_scan bscan; + + bscan.inclusive_scan(input, input, storage, BinaryOp()); + + for(size_t i = 0; i < ItemsPerThread; i++) + device_output[i + index] = input[i]; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + rocprim::block_scan_algorithm Algorithm, + class T, + class BinaryOp +> +__global__ +__launch_bounds__(BlockSize) +void inclusive_scan_array_with_storage_kernel(T * device_output, T init){ + + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; + + T input[ItemsPerThread]; + + for(size_t i = 0; i < ItemsPerThread; i++) + input[i] = device_output[i + index]; + + + using bscan_t = rocprim::block_scan; + __shared__ typename bscan_t::storage_type storage; + rocprim::block_scan bscan; + + bscan.inclusive_scan(input, init, input, storage, BinaryOp()); + + for(size_t i = 0; i < ItemsPerThread; i++) + device_output[i + index] = input[i]; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + rocprim::block_scan_algorithm Algorithm, + class T, + class BinaryOp +> +__global__ +__launch_bounds__(BlockSize) +void inclusive_scan_reduce_array_with_storage_kernel(T* device_output, T* device_output_reductions) +{ + const unsigned int index = ((blockIdx.x * BlockSize ) + threadIdx.x) * ItemsPerThread; + + // load + T in_out[ItemsPerThread]; + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + in_out[j] = device_output[index + j]; + } + using bscan_t = rocprim::block_scan; + __shared__ typename bscan_t::storage_type storage; + rocprim::block_scan bscan; + T reduction; + bscan.inclusive_scan(in_out, in_out, reduction, storage, BinaryOp()); + + // store + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + device_output[index + j] = in_out[j]; + } + + if(threadIdx.x == 0) + { + device_output_reductions[blockIdx.x] = reduction; + } +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + rocprim::block_scan_algorithm Algorithm, + class T, + class BinaryOp +> +__global__ +__launch_bounds__(BlockSize) +void inclusive_scan_reduce_array_with_storage_kernel(T* device_output, T* device_output_reductions, T init) +{ + const unsigned int index = ((blockIdx.x * BlockSize ) + threadIdx.x) * ItemsPerThread; + + // load + T in_out[ItemsPerThread]; + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + in_out[j] = device_output[index + j]; + } + using bscan_t = rocprim::block_scan; + __shared__ typename bscan_t::storage_type storage; + rocprim::block_scan bscan; + T reduction; + bscan.inclusive_scan(in_out, init, in_out, reduction, storage, BinaryOp()); + + // store + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + device_output[index + j] = in_out[j]; + } + + if(threadIdx.x == 0) + { + device_output_reductions[blockIdx.x] = reduction; + } +} + template< unsigned int BlockSize, unsigned int ItemsPerThread, @@ -764,6 +896,339 @@ auto test_block_scan_input_arrays() } +template< + class T, + int Method, + unsigned int BlockSize = 256U, + unsigned int ItemsPerThread = 1U, + rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan +> +auto test_block_scan_input_arrays() +-> typename std::enable_if::type +{ + using binary_op_type = rocprim::maximum; + + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 19; + const size_t grid_size = size / items_per_block; + + SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block); + SCOPED_TRACE(testing::Message() << "with size = " << size); + SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size); + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data_wrapped(size, 2, 100, seed_value); + + // Calculate expected results on host + std::vector expected(output.size(), T(0)); + binary_op_type binary_op; + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + expected[idx] = binary_op(output[idx], expected[j > 0 ? idx-1 : idx]); + } + } + + // Writing to device memory + common::device_ptr device_output(output); + + // Launching kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_array_with_storage_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output.get()); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + output = device_output.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + } +} + +template< + class T, + int Method, + unsigned int BlockSize = 256U, + unsigned int ItemsPerThread = 1U, + rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan +> +auto test_block_scan_input_arrays() +-> typename std::enable_if::type +{ + using binary_op_type = rocprim::maximum; + + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 19; + const size_t grid_size = size / items_per_block; + + SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block); + SCOPED_TRACE(testing::Message() << "with size = " << size); + SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size); + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data_wrapped(size, 2, 100, seed_value); + + // Calculate expected results on host + std::vector expected(output.size(), T(0)); + const T init = test_utils::get_random_value(0, 100, seed_value); + binary_op_type binary_op; + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + expected[i * items_per_block] = init; + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + expected[idx] = binary_op(output[idx], expected[j > 0 ? idx-1 : idx]); + } + } + + // Writing to device memory + common::device_ptr device_output(output); + + // Launching kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_array_with_storage_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output.get(), + init + ); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + output = device_output.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + } +} + +template< + class T, + int Method, + unsigned int BlockSize = 256U, + unsigned int ItemsPerThread = 1U, + rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan +> +auto test_block_scan_input_arrays() +-> typename std::enable_if::type +{ + using binary_op_type = rocprim::maximum; + + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 19; + const size_t grid_size = size / items_per_block; + + SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block); + SCOPED_TRACE(testing::Message() << "with size = " << size); + SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size); + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data_wrapped(size, 2, 100, seed_value); + + // Output reduce results + std::vector output_reductions(size / block_size, T(0)); + + // Calculate expected results on host + std::vector expected(output.size(), T(0)); + std::vector expected_reductions(output_reductions.size(), T(0)); + binary_op_type binary_op; + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + expected[idx] = binary_op(output[idx], expected[j > 0 ? idx-1 : idx]); + } + expected_reductions[i] = expected[(i+1) * items_per_block - 1]; + } + + // Writing to device memory + common::device_ptr device_output(output); + common::device_ptr device_output_reductions(output_reductions); + + // Launching kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_reduce_array_with_storage_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output.get(), + device_output_reductions.get()); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + output = device_output.load(); + output_reductions = device_output_reductions.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_reductions, expected_reductions)); + } + +} + +template< + class T, + int Method, + unsigned int BlockSize = 256U, + unsigned int ItemsPerThread = 1U, + rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan +> +auto test_block_scan_input_arrays() +-> typename std::enable_if::type +{ + using binary_op_type = rocprim::maximum; + + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 19; + const size_t grid_size = size / items_per_block; + + SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block); + SCOPED_TRACE(testing::Message() << "with size = " << size); + SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size); + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data_wrapped(size, 2, 100, seed_value); + + // Output reduce results + std::vector output_reductions(size / block_size, T(0)); + + // Calculate expected results on host + std::vector expected(output.size(), T(0)); + std::vector expected_reductions(output_reductions.size(), T(0)); + const T init = test_utils::get_random_value(0, 100, seed_value); + + binary_op_type binary_op; + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + expected[i * items_per_block] = init; + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + expected[idx] = binary_op(output[idx], expected[j > 0 ? idx-1 : idx]); + } + expected_reductions[i] = expected[(i+1) * items_per_block - 1]; + } + + // Writing to device memory + common::device_ptr device_output(output); + common::device_ptr device_output_reductions(output_reductions); + + // Launching kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_reduce_array_with_storage_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output.get(), + device_output_reductions.get(), + init + ); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + output = device_output.load(); + output_reductions = device_output_reductions.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_reductions, expected_reductions)); + } + +} + template< class T, int Method, From 85c657d8302dc03af61b7265a8ad93fe7d306509 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 25 Mar 2025 15:17:23 -0600 Subject: [PATCH 02/14] completed missing exclusive scan tests --- test/rocprim/test_block_scan.hpp | 15 ++ test/rocprim/test_block_scan.kernels.hpp | 239 +++++++++++++++++++++++ 2 files changed, 254 insertions(+) diff --git a/test/rocprim/test_block_scan.hpp b/test/rocprim/test_block_scan.hpp index 28be596eccd..97704a26cec 100644 --- a/test/rocprim/test_block_scan.hpp +++ b/test/rocprim/test_block_scan.hpp @@ -716,3 +716,18 @@ typed_test_def(suite_name_array, name_suffix, ExclusiveScanPrefixCallback) static_for_input_array<0, 2, T, 5, block_size>::run(); } +typed_test_def(suite_name_array, name_suffix, ExclusiveScanWithStorage) +{ + using T = typename TestFixture::input_type; + constexpr size_t block_size = TestFixture::block_size; + + static_for_input_array<0, 2, T, 10, block_size>::run(); +} + +typed_test_def(suite_name_array, name_suffix, ExclusiveScanReduceWithStorage) +{ + using T = typename TestFixture::input_type; + constexpr size_t block_size = TestFixture::block_size; + + static_for_input_array<0, 2, T, 11, block_size>::run(); +} \ No newline at end of file diff --git a/test/rocprim/test_block_scan.kernels.hpp b/test/rocprim/test_block_scan.kernels.hpp index cb140da758b..79cd7395bfb 100644 --- a/test/rocprim/test_block_scan.kernels.hpp +++ b/test/rocprim/test_block_scan.kernels.hpp @@ -643,6 +643,72 @@ void exclusive_scan_prefix_callback_array_kernel( } } +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + rocprim::block_scan_algorithm Algorithm, + class T, + class BinaryOp +> +__global__ +__launch_bounds__(BlockSize) +void exclusive_scan_array_with_storage_kernel(T * device_output, T init){ + + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; + + T input[ItemsPerThread]; + + for(size_t i = 0; i < ItemsPerThread; i++) + input[i] = device_output[i + index]; + + + using bscan_t = rocprim::block_scan; + __shared__ typename bscan_t::storage_type storage; + rocprim::block_scan bscan; + + bscan.exclusive_scan(input, input, init, storage, BinaryOp()); + + for(size_t i = 0; i < ItemsPerThread; i++) + device_output[i + index] = input[i]; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + rocprim::block_scan_algorithm Algorithm, + class T, + class BinaryOp +> +__global__ +__launch_bounds__(BlockSize) +void exclusive_scan_reduce_array_with_storage_kernel(T* device_output, T* device_output_reductions, T init) +{ + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; + // load + T in_out[ItemsPerThread]; + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + in_out[j] = device_output[index + j]; + } + + using bscan_t = rocprim::block_scan; + __shared__ typename bscan_t::storage_type storage; + rocprim::block_scan bscan; + T reduction; + bscan.exclusive_scan(in_out, in_out, init, reduction, storage, BinaryOp()); + + // store + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + device_output[index + j] = in_out[j]; + } + + if(threadIdx.x == 0) + { + device_output_reductions[blockIdx.x] = reduction; + } +} + // Test for scan template< class T, @@ -1496,6 +1562,179 @@ auto test_block_scan_input_arrays() } +template< + class T, + int Method, + unsigned int BlockSize = 256U, + unsigned int ItemsPerThread = 1U, + rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan +> +auto test_block_scan_input_arrays() +-> typename std::enable_if::type +{ + using binary_op_type = rocprim::maximum; + + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 19; + const size_t grid_size = size / items_per_block; + + SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block); + SCOPED_TRACE(testing::Message() << "with size = " << size); + SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size); + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data_wrapped(size, 2, 100, seed_value); + const T init = test_utils::get_random_value(0, 100, seed_value); + + // Calculate expected results on host + std::vector expected(output.size(), T(0)); + binary_op_type binary_op; + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + expected[i * items_per_block] = init; + for(size_t j = 1; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + expected[idx] = binary_op(output[idx-1], expected[idx-1]); + } + } + + // Writing to device memory + common::device_ptr device_output(output); + + // Launching kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_array_with_storage_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output.get(), + init); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + output = device_output.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + } + +} + +template< + class T, + int Method, + unsigned int BlockSize = 256U, + unsigned int ItemsPerThread = 1U, + rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan +> +auto test_block_scan_input_arrays() +-> typename std::enable_if::type +{ + using binary_op_type = rocprim::maximum; + + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 19; + const size_t grid_size = size / items_per_block; + + SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block); + SCOPED_TRACE(testing::Message() << "with size = " << size); + SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size); + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data_wrapped(size, 2, 100, seed_value); + + // Output reduce results + std::vector output_reductions(size / items_per_block); + const T init = test_utils::get_random_value(0, 100, seed_value); + + // Calculate expected results on host + std::vector expected(output.size(), T(0)); + std::vector expected_reductions(output_reductions.size(), T(0)); + binary_op_type binary_op; + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + expected[i * items_per_block] = init; + for(size_t j = 1; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + expected[idx] = binary_op(output[idx-1], expected[idx-1]); + } + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + expected_reductions[i] = binary_op(expected_reductions[i], output[idx]); + } + } + + // Writing to device memory + common::device_ptr device_output(output); + common::device_ptr device_output_reductions(output_reductions.size()); + + // Launching kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_reduce_array_with_storage_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output.get(), + device_output_reductions.get(), + init); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + output = device_output.load(); + output_reductions = device_output_reductions.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_reductions, expected_reductions)); + } + +} + // Static for-loop template struct static_for_input_array From cd2f48569d17d69f9dbbe662ba083c272fda1559 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 25 Mar 2025 15:21:28 -0600 Subject: [PATCH 03/14] updated changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 493215f1c3a..ea472afa042 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec * Added the `rocprim::merge_inplace` function for merging in-place. * Added initial value support for warp- and block-level inclusive scan. * Added support for building tests with device-side random data generation, making them finish faster. This requires rocRAND, and is enabled with the `WITH_ROCRAND=ON` build flag. +* Added additional unit tests for `test_block_scan.hpp` ### Changed From 83ffa6e3b31ca35dbffc2e09a1605a9dc7ceca8a Mon Sep 17 00:00:00 2001 From: Di Nguyen Date: Thu, 3 Apr 2025 16:51:15 -0600 Subject: [PATCH 04/14] Zenguyen/block sort missing unit test (#1) * finished sort keys no size tests * completed sort pair no size functions * completed sort pair with size functions * updated changelog --- CHANGELOG.md | 1 + test/rocprim/test_block_sort.hpp | 349 ++++++++++++++++++++++- test/rocprim/test_block_sort.kernels.hpp | 252 ++++++++++++++++ 3 files changed, 601 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea472afa042..b39c9af6e3b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec * Added initial value support for warp- and block-level inclusive scan. * Added support for building tests with device-side random data generation, making them finish faster. This requires rocRAND, and is enabled with the `WITH_ROCRAND=ON` build flag. * Added additional unit tests for `test_block_scan.hpp` +* Added additional unit tests for `test_block_sort.hpp` ### Changed diff --git a/test/rocprim/test_block_sort.hpp b/test/rocprim/test_block_sort.hpp index 89fa0fde6f9..f57543a83da 100644 --- a/test/rocprim/test_block_sort.hpp +++ b/test/rocprim/test_block_sort.hpp @@ -333,6 +333,287 @@ void TestSortStableKey(std::vector sizes) } } } + +template +void TestSortKeyNoSize() +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + static constexpr const unsigned int items_per_block = block_size * items_per_thread; + hipStream_t stream = 0; // default + + if(!is_buildable(block_size, items_per_thread, algo)) + { + GTEST_SKIP(); + } + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + size_t grid_size = 1134; + size_t size = items_per_block * grid_size; + SCOPED_TRACE(testing::Message() << "with size = " << size); + // Generate data + std::vector output + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + + // Calculate expected results on host + std::vector expected(output); + binary_op_type binary_op; + for(size_t i = 0; i < grid_size; i++) + { + std::sort(expected.begin() + (i * items_per_block), + expected.begin() + std::min(size, ((i + 1) * items_per_block)), + binary_op); + } + + // Preparing device + common::device_ptr device_key_output(output); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_keys_kernel_no_size), + dim3(grid_size), + dim3(block_size), + 0, + stream, + device_key_output.get() + ); + + // Reading results back + output = device_key_output.load(); + + test_utils::assert_eq(output, expected); + } +} + +template +void TestSortKeyValueNoSize() +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + static constexpr const unsigned int items_per_block = block_size * items_per_thread; + static constexpr const size_t grid_size = 1134; + static constexpr const size_t size = items_per_block * grid_size; + hipStream_t stream = 0; // default + + if(!is_buildable(block_size, items_per_thread, algo)) + { + GTEST_SKIP(); + } + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + SCOPED_TRACE(testing::Message() << "with size = " << size); + + // Generate data + std::vector output_key + = test_utils::get_random_data_wrapped(size, 0, 100, seed_value); + std::vector output_value + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + + // Combine vectors to form pairs with key and value + std::vector> target(size); + for(unsigned i = 0; i < target.size(); i++) + target[i] = std::make_pair(output_key[i], output_value[i]); + + // Calculate expected results on host + using key_value = std::pair; + std::vector expected(target); + constexpr bool descending = !std::is_same>::value; + for(size_t i = 0; i < expected.size() / items_per_block; i++) + { + std::sort(expected.begin() + (i * items_per_block), + expected.begin() + ((i + 1) * items_per_block), + test_utils::key_value_comparator()); + } + + // Preparing device + common::device_ptr device_key_output(output_key); + common::device_ptr device_value_output(output_value); + + // Running kernel, ignored if invalid size + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_pairs_kernel_no_size), + dim3(grid_size), + dim3(block_size), + 0, + stream, + device_key_output.get(), + device_value_output.get()); + HIP_CHECK(hipGetLastError()); + + // Reading results back + output_key = device_key_output.load(); + output_value = device_value_output.load(); + + std::vector expected_key(expected.size()); + std::vector expected_value(expected.size()); + for(size_t i = 0; i < expected.size(); i++) + { + expected_key[i] = expected[i].first; + expected_value[i] = expected[i].second; + } + + // Keys are sorted, Values order not guaranteed + // Sort subsets where key was the same to make sure all values are still present + using value_op_type = rocprim::less; + using eq_op_type = rocprim::equal_to; + value_op_type value_op; + eq_op_type eq_op; + for(size_t i = 0; i < output_key.size();) + { + auto j = i; + for(; j < output_key.size() && eq_op(output_key[j], output_key[i]); ++j) + {} + std::sort(output_value.begin() + i, output_value.begin() + j, value_op); + std::sort(expected_value.begin() + i, expected_value.begin() + j, value_op); + i = j; + } + + test_utils::assert_eq(output_key, expected_key); + test_utils::assert_eq(output_value, expected_value); + } +} + +template +void TestSortKeyValueWithSize() +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + static constexpr const unsigned int items_per_block = block_size * items_per_thread; + static constexpr const size_t grid_size = 1134; + static constexpr const size_t size = items_per_block * grid_size; + hipStream_t stream = 0; // default + + if(!is_buildable(block_size, items_per_thread, algo)) + { + GTEST_SKIP(); + } + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + SCOPED_TRACE(testing::Message() << "with size = " << size); + + // Generate data + std::vector output_key + = test_utils::get_random_data_wrapped(size, 0, 100, seed_value); + std::vector output_value + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + + // Combine vectors to form pairs with key and value + std::vector> target(size); + for(unsigned i = 0; i < target.size(); i++) + target[i] = std::make_pair(output_key[i], output_value[i]); + + // Calculate expected results on host + using key_value = std::pair; + std::vector expected(target); + constexpr bool descending = !std::is_same>::value; + for(size_t i = 0; i < expected.size() / items_per_block; i++) + { + std::sort(expected.begin() + (i * items_per_block), + expected.begin() + ((i + 1) * items_per_block), + test_utils::key_value_comparator()); + } + + // Preparing device + common::device_ptr device_key_output(output_key); + common::device_ptr device_value_output(output_value); + + // Running kernel, ignored if invalid size + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_pairs_kernel_with_size), + dim3(grid_size), + dim3(block_size), + 0, + stream, + device_key_output.get(), + device_value_output.get(), + items_per_block); + HIP_CHECK(hipGetLastError()); + + // Reading results back + output_key = device_key_output.load(); + output_value = device_value_output.load(); + + std::vector expected_key(expected.size()); + std::vector expected_value(expected.size()); + for(size_t i = 0; i < expected.size(); i++) + { + expected_key[i] = expected[i].first; + expected_value[i] = expected[i].second; + } + + // Keys are sorted, Values order not guaranteed + // Sort subsets where key was the same to make sure all values are still present + using value_op_type = rocprim::less; + using eq_op_type = rocprim::equal_to; + value_op_type value_op; + eq_op_type eq_op; + for(size_t i = 0; i < output_key.size();) + { + auto j = i; + for(; j < output_key.size() && eq_op(output_key[j], output_key[i]); ++j) + {} + std::sort(output_value.begin() + i, output_value.begin() + j, value_op); + std::sort(expected_value.begin() + i, expected_value.begin() + j, value_op); + i = j; + } + + test_utils::assert_eq(output_key, expected_key); + test_utils::assert_eq(output_value, expected_value); + } +} + #endif // TEST_ROCPRIM_TEST_BLOCK_SORT_HPP_ // This file is included multiple times in the test_block_sort_[algo].cpp file, because // the test definitions below this header guard need to be compiled for each test suites: @@ -429,7 +710,7 @@ typed_test_def(suite_name, static constexpr const unsigned int block_size = TestFixture::block_size; static constexpr const unsigned int items_per_thread = 4; std::vector sizes - = {0, 53, 512, 5000, 34567, (1 << 17) - 1220, 1134 * 256, (1 << 20) - 123}; + = {0,53, 512, 5000, 34567, (1 << 17) - 1220, 1134 * 256, (1 << 20) - 123}; TestSortKey(sizes); } @@ -443,3 +724,69 @@ typed_test_def(suite_name, name_suffix, SortKeyValueDesc) static constexpr const unsigned int items_per_thread = 1; TestSortKeyValue(); } + +typed_test_def(suite_name, name_suffix, SortKeyNoSize) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::greater; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 1; + TestSortKeyNoSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyNoSizeMultipleItemsPerThread) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::greater; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 4; + TestSortKeyNoSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyValueNoSize) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::less; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 1; + TestSortKeyValueNoSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyValueNoSizeMultipleItemsPerThread) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::less; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 4; + TestSortKeyValueNoSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyValueWithSize) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::less; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 1; + TestSortKeyValueWithSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyValueWithSizeMultipleItemsPerThread) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::less; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 4; + TestSortKeyValueWithSize(); +} diff --git a/test/rocprim/test_block_sort.kernels.hpp b/test/rocprim/test_block_sort.kernels.hpp index d595471d39a..6aab8eaee08 100644 --- a/test/rocprim/test_block_sort.kernels.hpp +++ b/test/rocprim/test_block_sort.kernels.hpp @@ -237,4 +237,256 @@ __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(key_type* /*keys* OffsetT /*size*/) {} +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class KeyIterator, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less::value_type>, + std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> +__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator keys) +{ + using key_type = typename std::iterator_traits::value_type; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + threadIdx.x * ItemsPerThread; + + using bsort_type + = rocprim::block_sort; + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + key_type thread_key = keys[index]; + + bsort_type().sort(thread_key, + storage, + BinaryOp()); + + keys[index] = thread_key; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class KeyIterator, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less::value_type>, + std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> +__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator keys) +{ + using key_type = typename std::iterator_traits::value_type; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + threadIdx.x * ItemsPerThread; + + using bsort_type + = rocprim::block_sort; + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + key_type thread_keys[ItemsPerThread]; + for(size_t i = 0; i < ItemsPerThread; i++) + thread_keys[i] = keys[index + i]; + // + bsort_type().sort(thread_keys, + storage, + BinaryOp()); + for(size_t i = 0; i < ItemsPerThread; i++) + keys[index + i] = thread_keys[i]; +} + +template::value_type>, + std::enable_if_t = 0> +__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator) +{} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> + +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type* keys, + value_type* values) +{ + using bsort_type + = rocprim::block_sort; + + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + key_type thread_key = keys[index]; + value_type thread_value = values[index]; + + bsort_type().sort(thread_key, + thread_value, + storage, + BinaryOp() + ); + keys[index] = thread_key; + values[index] = thread_value; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> + +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type* keys, + value_type* values) +{ + using bsort_type + = rocprim::block_sort; + + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + + key_type thread_keys[ItemsPerThread]; + value_type thread_value[ItemsPerThread]; + + for(size_t i = 0; i < ItemsPerThread; i++){ + thread_keys[i] = keys[index + i]; + thread_value[i] = values[index + i]; + } + bsort_type().sort(thread_keys, + thread_value, + storage, + BinaryOp()); + for(size_t i = 0; i < ItemsPerThread; i++){ + keys[index + i] = thread_keys[i]; + values[index + i] = thread_value[i]; + } +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t = 0> +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type* keys, + value_type* values) +{} + + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> + +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type* keys, + value_type* values, + const unsigned int size + ) +{ + using bsort_type + = rocprim::block_sort; + + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + + key_type thread_keys = keys[index]; + value_type thread_value = values[index]; + + bsort_type().sort(thread_keys, + thread_value, + storage, + size, + BinaryOp()); + + keys[index] = thread_keys; + values[index] = thread_value; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> + +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type* keys, + value_type* values, + const unsigned int size + ) +{ + using bsort_type + = rocprim::block_sort; + + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + + key_type thread_keys[ItemsPerThread]; + value_type thread_value[ItemsPerThread]; + + for(size_t i = 0; i < ItemsPerThread; i++){ + thread_keys[i] = keys[index + i]; + thread_value[i] = values[index + i]; + } + + bsort_type().sort(thread_keys, + thread_value, + storage, + size, + BinaryOp()); + + for(size_t i = 0; i < ItemsPerThread; i++){ + keys[index + i] = thread_keys[i]; + values[index + i] = thread_value[i]; + } +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t = 0> +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type* keys, + value_type* values, const unsigned int size) +{} #endif // TEST_BLOCK_SORT_KERNELS_HPP_ From 0e5677ee8c27a5ea4044aaa68a5cda81790169a1 Mon Sep 17 00:00:00 2001 From: Di Nguyen Date: Mon, 7 Apr 2025 16:55:57 -0600 Subject: [PATCH 05/14] implemented block store tests (#2) * inital implementation for block store * added transpose tests and store size * implemented storage tests * updated changelog ### NOTE ### This addition does not include block_store_warp_transpose tests --- CHANGELOG.md | 5 + rocprim/include/rocprim/block/block_store.hpp | 2 - test/rocprim/CMakeLists.txt | 4 + test/rocprim/test_block_store.hpp | 193 ++++++++++++++++++ test/rocprim/test_block_store.kernels.hpp | 154 ++++++++++++++ test/rocprim/test_block_store_direct.cpp | 58 ++++++ test/rocprim/test_block_store_striped.cpp | 58 ++++++ test/rocprim/test_block_store_transpose.cpp | 58 ++++++ test/rocprim/test_block_store_vectorize.cpp | 58 ++++++ test/rocprim/test_utils_types.hpp | 10 + 10 files changed, 598 insertions(+), 2 deletions(-) create mode 100644 test/rocprim/test_block_store.hpp create mode 100644 test/rocprim/test_block_store.kernels.hpp create mode 100644 test/rocprim/test_block_store_direct.cpp create mode 100644 test/rocprim/test_block_store_striped.cpp create mode 100644 test/rocprim/test_block_store_transpose.cpp create mode 100644 test/rocprim/test_block_store_vectorize.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index b39c9af6e3b..058d9cb9ee9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,11 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec * Added support for building tests with device-side random data generation, making them finish faster. This requires rocRAND, and is enabled with the `WITH_ROCRAND=ON` build flag. * Added additional unit tests for `test_block_scan.hpp` * Added additional unit tests for `test_block_sort.hpp` +* New unit tests for `include/rocprim/block_store.h`. New test files are: + * `test_block_store_direct.cpp` + * `test_block_store_striped.cpp` + * `test_block_store_transposed.cpp` + * `test_block_store_vectorize.cpp` ### Changed diff --git a/rocprim/include/rocprim/block/block_store.hpp b/rocprim/include/rocprim/block/block_store.hpp index 905cb0c01ad..faed25018e2 100644 --- a/rocprim/include/rocprim/block/block_store.hpp +++ b/rocprim/include/rocprim/block/block_store.hpp @@ -40,8 +40,6 @@ BEGIN_ROCPRIM_NAMESPACE /// of items into a blocked/striped arrangement on continuous memory enum class block_store_method { - /// A blocked arrangement of items is stored into a blocked arrangement on continuous - /// memory. /// \par Performance Notes: /// * Performance decreases with increasing number of items per thread (stride /// between reads), because of reduced memory coalescing. diff --git a/test/rocprim/CMakeLists.txt b/test/rocprim/CMakeLists.txt index ff9f06979e8..ddf9e6e3b88 100644 --- a/test/rocprim/CMakeLists.txt +++ b/test/rocprim/CMakeLists.txt @@ -269,6 +269,10 @@ add_rocprim_test("rocprim.block_histogram" test_block_histogram.cpp) add_rocprim_test("rocprim.block_load_store" test_block_load_store.cpp) add_rocprim_test("rocprim.block_sort_merge" test_block_sort_merge.cpp) add_rocprim_test("rocprim.block_sort_merge_stable" test_block_sort_merge_stable.cpp) +add_rocprim_test("rocprim.block_store_direct" test_block_store_direct.cpp) +add_rocprim_test("rocprim.block_store_striped" test_block_store_striped.cpp) +add_rocprim_test("rocprim.block_store_transpose" test_block_store_transpose.cpp) +add_rocprim_test("rocprim.block_store_vectorize" test_block_store_vectorize.cpp) add_rocprim_test_parallel("rocprim.block_radix_rank" test_block_radix_rank.cpp.in) add_rocprim_test_parallel("rocprim.block_radix_sort" test_block_radix_sort.cpp.in) add_rocprim_test("rocprim.block_reduce" test_block_reduce.cpp) diff --git a/test/rocprim/test_block_store.hpp b/test/rocprim/test_block_store.hpp new file mode 100644 index 00000000000..c37f51b50c6 --- /dev/null +++ b/test/rocprim/test_block_store.hpp @@ -0,0 +1,193 @@ +#include "../common_test_header.hpp" +#include "test_utils.hpp" + +#include "../../common/utils.hpp" +#include "../../common/utils_device_ptr.hpp" +#include "test_seed.hpp" +#include "test_utils_assertions.hpp" +#include "test_utils_data_generation.hpp" +#include "test_utils_sort_comparator.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +block_store_test_suite_type_def(suite_name, name_suffix); +typed_test_suite_def(suite_name, name_suffix, block_params); + +// using header guards for these test functions because this file is included multiple times: +// once for the integrals test suite and once for the floating point test suite. +#ifndef TEST_ROCPRIM_TEST_BLOCK_STORE_HPP_ + #define TEST_ROCPRIM_TEST_BLOCK_STORE_HPP_ + +template< + bool use_size, + unsigned int block_size, + unsigned int items_per_thread, + typename DataType, + rocprim::block_store_method algorithm +> +void TestStore(){ + + constexpr size_t items_per_block = block_size * items_per_thread; + constexpr size_t grid_size = 120; + constexpr size_t size = items_per_block * grid_size; + + + std::vector host_input(size); + common::device_ptr device_output(host_input); + + for(size_t i = 0; i < size; i++) host_input[i] = static_cast(i); + common::device_ptr device_input(host_input); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kernel< + use_size, + block_size, + items_per_thread, + DataType, + algorithm + >), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input.get(), + device_output.get() + ); + + HIP_CHECK(hipGetLastError()); + std::vector host_output = device_output.load(); + test_utils::assert_eq(host_input, host_output); +} + +template< + bool use_size, + unsigned int block_size, + unsigned int items_per_thread, + typename DataType, + rocprim::block_store_method algorithm +> +void TestStoreWithStorage(){ + + constexpr size_t items_per_block = block_size * items_per_thread; + constexpr size_t grid_size = 120; + constexpr size_t size = items_per_block * grid_size; + + + std::vector host_input(size); + common::device_ptr device_output(host_input); + + for(size_t i = 0; i < size; i++) host_input[i] = static_cast(i); + common::device_ptr device_input(host_input); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kernel_with_storage< + use_size, + block_size, + items_per_thread, + DataType, + algorithm + >), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input.get(), + device_output.get() + ); + + HIP_CHECK(hipGetLastError()); + std::vector host_output = device_output.load(); + test_utils::assert_eq(host_input, host_output); +} + +#endif + +typed_test_def(suite_name, name_suffix, Store) +{ + using DataType = typename TestFixture::DataType; + static constexpr const rocprim::block_store_method algo = TEST_BLOCK_STORE_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 1; + static constexpr const bool use_size = false; + TestStore(); +} + +typed_test_def(suite_name, name_suffix, StoreMultipleItemsPerThread) +{ + using DataType = typename TestFixture::DataType; + static constexpr const rocprim::block_store_method algo = TEST_BLOCK_STORE_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 4; + static constexpr const bool use_size = false; + TestStore(); +} + +typed_test_def(suite_name, name_suffix, StoreWithSize) +{ + using DataType = typename TestFixture::DataType; + static constexpr const rocprim::block_store_method algo = TEST_BLOCK_STORE_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 1; + static constexpr const bool use_size = true; + TestStore(); +} + +typed_test_def(suite_name, name_suffix, StoreWithSizeMultipleItemsPerThread) +{ + using DataType = typename TestFixture::DataType; + static constexpr const rocprim::block_store_method algo = TEST_BLOCK_STORE_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 4; + static constexpr const bool use_size = true; + TestStore(); +} + +typed_test_def(suite_name, name_suffix, StoreWithStorage) +{ + using DataType = typename TestFixture::DataType; + static constexpr const rocprim::block_store_method algo = TEST_BLOCK_STORE_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 1; + static constexpr const bool use_size = false; + TestStoreWithStorage(); +} + +typed_test_def(suite_name, name_suffix, StoreMultipleItemsPerThreadWithStorage) +{ + using DataType = typename TestFixture::DataType; + static constexpr const rocprim::block_store_method algo = TEST_BLOCK_STORE_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 4; + static constexpr const bool use_size = false; + TestStoreWithStorage(); +} + +typed_test_def(suite_name, name_suffix, StoreWithSizeWithStorage) +{ + using DataType = typename TestFixture::DataType; + static constexpr const rocprim::block_store_method algo = TEST_BLOCK_STORE_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 1; + static constexpr const bool use_size = true; + TestStoreWithStorage(); +} + +typed_test_def(suite_name, name_suffix, StoreWithSizeMultipleItemsPerThreadWithStorage) +{ + using DataType = typename TestFixture::DataType; + static constexpr const rocprim::block_store_method algo = TEST_BLOCK_STORE_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 4; + static constexpr const bool use_size = true; + TestStoreWithStorage(); +} diff --git a/test/rocprim/test_block_store.kernels.hpp b/test/rocprim/test_block_store.kernels.hpp new file mode 100644 index 00000000000..8d9303588fd --- /dev/null +++ b/test/rocprim/test_block_store.kernels.hpp @@ -0,0 +1,154 @@ +#ifndef TEST_BLOCK_STORE_KERNELS_HPP_ +#define TEST_BLOCK_STORE_KERNELS_HPP_ + + + +constexpr bool is_buildable(unsigned int BlockSize, + unsigned int ItemsPerThread, + rocprim::block_store_method algorithm + ) +{ + switch(algorithm) + { + case rocprim::block_store_method::block_store_direct: + case rocprim::block_store_method::block_store_striped: + case rocprim::block_store_method::block_store_transpose: + return true; + case rocprim::block_store_method::block_store_vectorize: + return (ItemsPerThread % 2 == 0) && ((BlockSize * ItemsPerThread) % 4 == 0); + case rocprim::block_store_method::block_store_warp_transpose: + return BlockSize % rocprim::device_warp_size() == 0; + } + return false; +} + +template< + bool useSize, + unsigned int BlockSize, + unsigned int ItemsPerThread, + typename DataType, + rocprim::block_store_method algorithm, + std::enable_if_t<(is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> = 0 + > +__global__ __launch_bounds__(BlockSize) void store_kernel(DataType * input, DataType * output){ + using bstore_type = rocprim::block_store; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + + DataType temp[ItemsPerThread]; + __shared__ DataType storage[ItemsPerBlock]; + + for(unsigned int i = 0; i < ItemsPerThread; i++){ + switch(algorithm){ + case rocprim::block_store_method::block_store_direct: + case rocprim::block_store_method::block_store_transpose: + case rocprim::block_store_method::block_store_vectorize: + temp[i] = input[index + i]; + break; + + case rocprim::block_store_method::block_store_striped: + temp[i] = input[block_offset + (threadIdx.x + i * BlockSize)]; + break; + } + } + + if(useSize) + bstore_type().store(storage, temp, ItemsPerBlock); + else + bstore_type().store(storage, temp); + + __syncthreads(); + + for(unsigned int i = 0; i < ItemsPerThread; i++){ + output[index + i] = storage[threadIdx.x * ItemsPerThread + i]; + } +} + +template< + bool useSize, + unsigned int BlockSize, + unsigned int ItemsPerThread, + typename DataType, + rocprim::block_store_method algorithm, + std::enable_if_t = 0 + > +__global__ __launch_bounds__(BlockSize) void store_kernel(DataType * input, DataType * output){ + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + + for(unsigned int i = 0; i < ItemsPerThread; i++){ + output[index + i] = input[index + i]; + } +} + +template< + bool useSize, + unsigned int BlockSize, + unsigned int ItemsPerThread, + typename DataType, + rocprim::block_store_method algorithm, + std::enable_if_t<(is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> = 0 + > +__global__ __launch_bounds__(BlockSize) void store_kernel_with_storage(DataType * input, DataType * output){ + using bstore_type = rocprim::block_store; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + + DataType temp[ItemsPerThread]; + __shared__ DataType temp_out[ItemsPerBlock]; + + for(unsigned int i = 0; i < ItemsPerThread; i++){ + switch(algorithm){ + case rocprim::block_store_method::block_store_direct: + case rocprim::block_store_method::block_store_transpose: + case rocprim::block_store_method::block_store_vectorize: + temp[i] = input[index + i]; + break; + + case rocprim::block_store_method::block_store_striped: + temp[i] = input[block_offset + (threadIdx.x + i * BlockSize)]; + break; + } + } + ROCPRIM_SHARED_MEMORY typename bstore_type::storage_type storage; + + if(useSize) + bstore_type().store(temp_out, temp, ItemsPerBlock, storage); + else + bstore_type().store(temp_out, temp, storage); + + __syncthreads(); + + for(unsigned int i = 0; i < ItemsPerThread; i++){ + output[index + i] = temp_out[threadIdx.x * ItemsPerThread + i]; + } +} + +template< + bool useSize, + unsigned int BlockSize, + unsigned int ItemsPerThread, + typename DataType, + rocprim::block_store_method algorithm, + std::enable_if_t = 0 + > +__global__ __launch_bounds__(BlockSize) void store_kernel_with_storage(DataType * input, DataType * output){ + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + + for(unsigned int i = 0; i < ItemsPerThread; i++){ + output[index + i] = input[index + i]; + } +} + +#endif \ No newline at end of file diff --git a/test/rocprim/test_block_store_direct.cpp b/test/rocprim/test_block_store_direct.cpp new file mode 100644 index 00000000000..ce23a7fa7ad --- /dev/null +++ b/test/rocprim/test_block_store_direct.cpp @@ -0,0 +1,58 @@ +// MIT License +// +// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "../common_test_header.hpp" + +// required rocprim headers +#include + +// required test headers +#include "test_utils_types.hpp" + +// required common headers +#include "../../common/utils_device_ptr.hpp" + +// kernel definitions +#include "test_block_store.kernels.hpp" + +// Start stamping out tests +struct RocprimBlockStoreDirectTests; +#ifndef TEST_BLOCK_STORE_ALGORITHM + #define TEST_BLOCK_STORE_ALGORITHM rocprim::block_store_method::block_store_direct +#endif +struct Integral; +#define suite_name RocprimBlockStoreDirectTests +#define block_params BlockParamsIntegral +#define name_suffix Integral + +#include "test_block_store.hpp" + +#undef suite_name +#undef block_params +#undef name_suffix + +struct Floating; +#define suite_name RocprimBlockStoreDirectTests +#define block_params BlockParamsFloating +#define name_suffix Floating + +#include "test_block_store.hpp" diff --git a/test/rocprim/test_block_store_striped.cpp b/test/rocprim/test_block_store_striped.cpp new file mode 100644 index 00000000000..cf01a40882c --- /dev/null +++ b/test/rocprim/test_block_store_striped.cpp @@ -0,0 +1,58 @@ +// MIT License +// +// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "../common_test_header.hpp" + +// required rocprim headers +#include + +// required test headers +#include "test_utils_types.hpp" + +// required common headers +#include "../../common/utils_device_ptr.hpp" + +// kernel definitions +#include "test_block_store.kernels.hpp" + +// Start stamping out tests +struct RocprimBlockStoreStripedTests; +#ifndef TEST_BLOCK_STORE_ALGORITHM + #define TEST_BLOCK_STORE_ALGORITHM rocprim::block_store_method::block_store_striped +#endif +struct Integral; +#define suite_name RocprimBlockStoreStripedTests +#define block_params BlockParamsIntegral +#define name_suffix Integral + +#include "test_block_store.hpp" + +#undef suite_name +#undef block_params +#undef name_suffix + +struct Floating; +#define suite_name RocprimBlockStoreStripedTests +#define block_params BlockParamsFloating +#define name_suffix Floating + +#include "test_block_store.hpp" diff --git a/test/rocprim/test_block_store_transpose.cpp b/test/rocprim/test_block_store_transpose.cpp new file mode 100644 index 00000000000..37f66782b59 --- /dev/null +++ b/test/rocprim/test_block_store_transpose.cpp @@ -0,0 +1,58 @@ +// MIT License +// +// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "../common_test_header.hpp" + +// required rocprim headers +#include + +// required test headers +#include "test_utils_types.hpp" + +// required common headers +#include "../../common/utils_device_ptr.hpp" + +// kernel definitions +#include "test_block_store.kernels.hpp" + +// Start stamping out tests +struct RocprimBlockStoreTransposeTests; +#ifndef TEST_BLOCK_STORE_ALGORITHM + #define TEST_BLOCK_STORE_ALGORITHM rocprim::block_store_method::block_store_transpose +#endif +struct Integral; +#define suite_name RocprimBlockStoreTransposeTests +#define block_params BlockParamsIntegral +#define name_suffix Integral + +#include "test_block_store.hpp" + +#undef suite_name +#undef block_params +#undef name_suffix + +struct Floating; +#define suite_name RocprimBlockStoreTransposeTests +#define block_params BlockParamsFloating +#define name_suffix Floating + +#include "test_block_store.hpp" diff --git a/test/rocprim/test_block_store_vectorize.cpp b/test/rocprim/test_block_store_vectorize.cpp new file mode 100644 index 00000000000..eee434a820a --- /dev/null +++ b/test/rocprim/test_block_store_vectorize.cpp @@ -0,0 +1,58 @@ +// MIT License +// +// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "../common_test_header.hpp" + +// required rocprim headers +#include + +// required test headers +#include "test_utils_types.hpp" + +// required common headers +#include "../../common/utils_device_ptr.hpp" + +// kernel definitions +#include "test_block_store.kernels.hpp" + +// Start stamping out tests +struct RocprimBlockStoreVectorizeTests; +#ifndef TEST_BLOCK_STORE_ALGORITHM + #define TEST_BLOCK_STORE_ALGORITHM rocprim::block_store_method::block_store_vectorize +#endif +struct Integral; +#define suite_name RocprimBlockStoreVectorizeTests +#define block_params BlockParamsIntegral +#define name_suffix Integral + +#include "test_block_store.hpp" + +#undef suite_name +#undef block_params +#undef name_suffix + +struct Floating; +#define suite_name RocprimBlockStoreVectorizeTests +#define block_params BlockParamsFloating +#define name_suffix Floating + +#include "test_block_store.hpp" diff --git a/test/rocprim/test_utils_types.hpp b/test/rocprim/test_utils_types.hpp index 25b97fb9f2f..ad1f57e273b 100644 --- a/test/rocprim/test_utils_types.hpp +++ b/test/rocprim/test_utils_types.hpp @@ -271,8 +271,18 @@ static constexpr unsigned int items[n_items] = { static constexpr unsigned int block_size = Params::block_size; \ }; +#define block_store_test_suite_type_def_helper(name, suffix) \ + template \ + class name ## suffix : public ::testing::Test { \ + public: \ + using DataType = typename Params::input_type; \ + static constexpr unsigned int block_size = Params::block_size; \ + }; + #define block_sort_test_suite_type_def(name, suffix) block_sort_test_suite_type_def_helper(name, suffix) +#define block_store_test_suite_type_def(name, suffix) block_store_test_suite_type_def_helper(name, suffix) + #define typed_test_suite_def_helper(name, suffix, params) TYPED_TEST_SUITE(name ## suffix, params) #define typed_test_suite_def(name, suffix, params) typed_test_suite_def_helper(name, suffix, params) From 90cb7ed04cf106c0152809a6e519a7788688a4e0 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 8 Apr 2025 10:35:00 -0600 Subject: [PATCH 06/14] revereted block store tests --- CHANGELOG.md | 6 - test/rocprim/test_block_sort.hpp | 349 +---------------------- test/rocprim/test_block_sort.kernels.hpp | 252 ---------------- 3 files changed, 1 insertion(+), 606 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 058d9cb9ee9..ea472afa042 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,12 +19,6 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec * Added initial value support for warp- and block-level inclusive scan. * Added support for building tests with device-side random data generation, making them finish faster. This requires rocRAND, and is enabled with the `WITH_ROCRAND=ON` build flag. * Added additional unit tests for `test_block_scan.hpp` -* Added additional unit tests for `test_block_sort.hpp` -* New unit tests for `include/rocprim/block_store.h`. New test files are: - * `test_block_store_direct.cpp` - * `test_block_store_striped.cpp` - * `test_block_store_transposed.cpp` - * `test_block_store_vectorize.cpp` ### Changed diff --git a/test/rocprim/test_block_sort.hpp b/test/rocprim/test_block_sort.hpp index f57543a83da..89fa0fde6f9 100644 --- a/test/rocprim/test_block_sort.hpp +++ b/test/rocprim/test_block_sort.hpp @@ -333,287 +333,6 @@ void TestSortStableKey(std::vector sizes) } } } - -template -void TestSortKeyNoSize() -{ - int device_id = test_common_utils::obtain_device_from_ctest(); - SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); - HIP_CHECK(hipSetDevice(device_id)); - - static constexpr const unsigned int items_per_block = block_size * items_per_thread; - hipStream_t stream = 0; // default - - if(!is_buildable(block_size, items_per_thread, algo)) - { - GTEST_SKIP(); - } - - for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) - { - unsigned int seed_value - = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; - SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); - - size_t grid_size = 1134; - size_t size = items_per_block * grid_size; - SCOPED_TRACE(testing::Message() << "with size = " << size); - // Generate data - std::vector output - = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); - - // Calculate expected results on host - std::vector expected(output); - binary_op_type binary_op; - for(size_t i = 0; i < grid_size; i++) - { - std::sort(expected.begin() + (i * items_per_block), - expected.begin() + std::min(size, ((i + 1) * items_per_block)), - binary_op); - } - - // Preparing device - common::device_ptr device_key_output(output); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_keys_kernel_no_size), - dim3(grid_size), - dim3(block_size), - 0, - stream, - device_key_output.get() - ); - - // Reading results back - output = device_key_output.load(); - - test_utils::assert_eq(output, expected); - } -} - -template -void TestSortKeyValueNoSize() -{ - int device_id = test_common_utils::obtain_device_from_ctest(); - SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); - HIP_CHECK(hipSetDevice(device_id)); - - static constexpr const unsigned int items_per_block = block_size * items_per_thread; - static constexpr const size_t grid_size = 1134; - static constexpr const size_t size = items_per_block * grid_size; - hipStream_t stream = 0; // default - - if(!is_buildable(block_size, items_per_thread, algo)) - { - GTEST_SKIP(); - } - - for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) - { - unsigned int seed_value - = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; - SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); - SCOPED_TRACE(testing::Message() << "with size = " << size); - - // Generate data - std::vector output_key - = test_utils::get_random_data_wrapped(size, 0, 100, seed_value); - std::vector output_value - = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); - - // Combine vectors to form pairs with key and value - std::vector> target(size); - for(unsigned i = 0; i < target.size(); i++) - target[i] = std::make_pair(output_key[i], output_value[i]); - - // Calculate expected results on host - using key_value = std::pair; - std::vector expected(target); - constexpr bool descending = !std::is_same>::value; - for(size_t i = 0; i < expected.size() / items_per_block; i++) - { - std::sort(expected.begin() + (i * items_per_block), - expected.begin() + ((i + 1) * items_per_block), - test_utils::key_value_comparator()); - } - - // Preparing device - common::device_ptr device_key_output(output_key); - common::device_ptr device_value_output(output_value); - - // Running kernel, ignored if invalid size - hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_pairs_kernel_no_size), - dim3(grid_size), - dim3(block_size), - 0, - stream, - device_key_output.get(), - device_value_output.get()); - HIP_CHECK(hipGetLastError()); - - // Reading results back - output_key = device_key_output.load(); - output_value = device_value_output.load(); - - std::vector expected_key(expected.size()); - std::vector expected_value(expected.size()); - for(size_t i = 0; i < expected.size(); i++) - { - expected_key[i] = expected[i].first; - expected_value[i] = expected[i].second; - } - - // Keys are sorted, Values order not guaranteed - // Sort subsets where key was the same to make sure all values are still present - using value_op_type = rocprim::less; - using eq_op_type = rocprim::equal_to; - value_op_type value_op; - eq_op_type eq_op; - for(size_t i = 0; i < output_key.size();) - { - auto j = i; - for(; j < output_key.size() && eq_op(output_key[j], output_key[i]); ++j) - {} - std::sort(output_value.begin() + i, output_value.begin() + j, value_op); - std::sort(expected_value.begin() + i, expected_value.begin() + j, value_op); - i = j; - } - - test_utils::assert_eq(output_key, expected_key); - test_utils::assert_eq(output_value, expected_value); - } -} - -template -void TestSortKeyValueWithSize() -{ - int device_id = test_common_utils::obtain_device_from_ctest(); - SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); - HIP_CHECK(hipSetDevice(device_id)); - - static constexpr const unsigned int items_per_block = block_size * items_per_thread; - static constexpr const size_t grid_size = 1134; - static constexpr const size_t size = items_per_block * grid_size; - hipStream_t stream = 0; // default - - if(!is_buildable(block_size, items_per_thread, algo)) - { - GTEST_SKIP(); - } - - for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) - { - unsigned int seed_value - = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; - SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); - SCOPED_TRACE(testing::Message() << "with size = " << size); - - // Generate data - std::vector output_key - = test_utils::get_random_data_wrapped(size, 0, 100, seed_value); - std::vector output_value - = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); - - // Combine vectors to form pairs with key and value - std::vector> target(size); - for(unsigned i = 0; i < target.size(); i++) - target[i] = std::make_pair(output_key[i], output_value[i]); - - // Calculate expected results on host - using key_value = std::pair; - std::vector expected(target); - constexpr bool descending = !std::is_same>::value; - for(size_t i = 0; i < expected.size() / items_per_block; i++) - { - std::sort(expected.begin() + (i * items_per_block), - expected.begin() + ((i + 1) * items_per_block), - test_utils::key_value_comparator()); - } - - // Preparing device - common::device_ptr device_key_output(output_key); - common::device_ptr device_value_output(output_value); - - // Running kernel, ignored if invalid size - hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_pairs_kernel_with_size), - dim3(grid_size), - dim3(block_size), - 0, - stream, - device_key_output.get(), - device_value_output.get(), - items_per_block); - HIP_CHECK(hipGetLastError()); - - // Reading results back - output_key = device_key_output.load(); - output_value = device_value_output.load(); - - std::vector expected_key(expected.size()); - std::vector expected_value(expected.size()); - for(size_t i = 0; i < expected.size(); i++) - { - expected_key[i] = expected[i].first; - expected_value[i] = expected[i].second; - } - - // Keys are sorted, Values order not guaranteed - // Sort subsets where key was the same to make sure all values are still present - using value_op_type = rocprim::less; - using eq_op_type = rocprim::equal_to; - value_op_type value_op; - eq_op_type eq_op; - for(size_t i = 0; i < output_key.size();) - { - auto j = i; - for(; j < output_key.size() && eq_op(output_key[j], output_key[i]); ++j) - {} - std::sort(output_value.begin() + i, output_value.begin() + j, value_op); - std::sort(expected_value.begin() + i, expected_value.begin() + j, value_op); - i = j; - } - - test_utils::assert_eq(output_key, expected_key); - test_utils::assert_eq(output_value, expected_value); - } -} - #endif // TEST_ROCPRIM_TEST_BLOCK_SORT_HPP_ // This file is included multiple times in the test_block_sort_[algo].cpp file, because // the test definitions below this header guard need to be compiled for each test suites: @@ -710,7 +429,7 @@ typed_test_def(suite_name, static constexpr const unsigned int block_size = TestFixture::block_size; static constexpr const unsigned int items_per_thread = 4; std::vector sizes - = {0,53, 512, 5000, 34567, (1 << 17) - 1220, 1134 * 256, (1 << 20) - 123}; + = {0, 53, 512, 5000, 34567, (1 << 17) - 1220, 1134 * 256, (1 << 20) - 123}; TestSortKey(sizes); } @@ -724,69 +443,3 @@ typed_test_def(suite_name, name_suffix, SortKeyValueDesc) static constexpr const unsigned int items_per_thread = 1; TestSortKeyValue(); } - -typed_test_def(suite_name, name_suffix, SortKeyNoSize) -{ - using key_type = typename TestFixture::key_type; - using value_type = typename TestFixture::value_type; - using binary_op_type = typename rocprim::greater; - static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; - static constexpr const unsigned int block_size = TestFixture::block_size; - static constexpr const unsigned int items_per_thread = 1; - TestSortKeyNoSize(); -} - -typed_test_def(suite_name, name_suffix, SortKeyNoSizeMultipleItemsPerThread) -{ - using key_type = typename TestFixture::key_type; - using value_type = typename TestFixture::value_type; - using binary_op_type = typename rocprim::greater; - static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; - static constexpr const unsigned int block_size = TestFixture::block_size; - static constexpr const unsigned int items_per_thread = 4; - TestSortKeyNoSize(); -} - -typed_test_def(suite_name, name_suffix, SortKeyValueNoSize) -{ - using key_type = typename TestFixture::key_type; - using value_type = typename TestFixture::value_type; - using binary_op_type = typename rocprim::less; - static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; - static constexpr const unsigned int block_size = TestFixture::block_size; - static constexpr const unsigned int items_per_thread = 1; - TestSortKeyValueNoSize(); -} - -typed_test_def(suite_name, name_suffix, SortKeyValueNoSizeMultipleItemsPerThread) -{ - using key_type = typename TestFixture::key_type; - using value_type = typename TestFixture::value_type; - using binary_op_type = typename rocprim::less; - static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; - static constexpr const unsigned int block_size = TestFixture::block_size; - static constexpr const unsigned int items_per_thread = 4; - TestSortKeyValueNoSize(); -} - -typed_test_def(suite_name, name_suffix, SortKeyValueWithSize) -{ - using key_type = typename TestFixture::key_type; - using value_type = typename TestFixture::value_type; - using binary_op_type = typename rocprim::less; - static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; - static constexpr const unsigned int block_size = TestFixture::block_size; - static constexpr const unsigned int items_per_thread = 1; - TestSortKeyValueWithSize(); -} - -typed_test_def(suite_name, name_suffix, SortKeyValueWithSizeMultipleItemsPerThread) -{ - using key_type = typename TestFixture::key_type; - using value_type = typename TestFixture::value_type; - using binary_op_type = typename rocprim::less; - static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; - static constexpr const unsigned int block_size = TestFixture::block_size; - static constexpr const unsigned int items_per_thread = 4; - TestSortKeyValueWithSize(); -} diff --git a/test/rocprim/test_block_sort.kernels.hpp b/test/rocprim/test_block_sort.kernels.hpp index 6aab8eaee08..d595471d39a 100644 --- a/test/rocprim/test_block_sort.kernels.hpp +++ b/test/rocprim/test_block_sort.kernels.hpp @@ -237,256 +237,4 @@ __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(key_type* /*keys* OffsetT /*size*/) {} -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class KeyIterator, - rocprim::block_sort_algorithm algorithm, - class BinaryOp = rocprim::less::value_type>, - std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), - int> - = 0> -__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator keys) -{ - using key_type = typename std::iterator_traits::value_type; - - static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; - const unsigned int block_offset = blockIdx.x * ItemsPerBlock; - const unsigned int index = block_offset + threadIdx.x * ItemsPerThread; - - using bsort_type - = rocprim::block_sort; - ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; - - key_type thread_key = keys[index]; - - bsort_type().sort(thread_key, - storage, - BinaryOp()); - - keys[index] = thread_key; -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class KeyIterator, - rocprim::block_sort_algorithm algorithm, - class BinaryOp = rocprim::less::value_type>, - std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), - int> - = 0> -__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator keys) -{ - using key_type = typename std::iterator_traits::value_type; - - static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; - const unsigned int block_offset = blockIdx.x * ItemsPerBlock; - const unsigned int index = block_offset + threadIdx.x * ItemsPerThread; - - using bsort_type - = rocprim::block_sort; - ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; - - key_type thread_keys[ItemsPerThread]; - for(size_t i = 0; i < ItemsPerThread; i++) - thread_keys[i] = keys[index + i]; - // - bsort_type().sort(thread_keys, - storage, - BinaryOp()); - for(size_t i = 0; i < ItemsPerThread; i++) - keys[index + i] = thread_keys[i]; -} - -template::value_type>, - std::enable_if_t = 0> -__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator) -{} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class key_type, - class value_type, - rocprim::block_sort_algorithm algorithm, - class BinaryOp = rocprim::less, - std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), - int> - = 0> - -__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type* keys, - value_type* values) -{ - using bsort_type - = rocprim::block_sort; - - ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; - - static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; - const unsigned int block_offset = blockIdx.x * ItemsPerBlock; - const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); - key_type thread_key = keys[index]; - value_type thread_value = values[index]; - - bsort_type().sort(thread_key, - thread_value, - storage, - BinaryOp() - ); - keys[index] = thread_key; - values[index] = thread_value; -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class key_type, - class value_type, - rocprim::block_sort_algorithm algorithm, - class BinaryOp = rocprim::less, - std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), - int> - = 0> - -__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type* keys, - value_type* values) -{ - using bsort_type - = rocprim::block_sort; - - ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; - - static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; - const unsigned int block_offset = blockIdx.x * ItemsPerBlock; - const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); - - key_type thread_keys[ItemsPerThread]; - value_type thread_value[ItemsPerThread]; - - for(size_t i = 0; i < ItemsPerThread; i++){ - thread_keys[i] = keys[index + i]; - thread_value[i] = values[index + i]; - } - bsort_type().sort(thread_keys, - thread_value, - storage, - BinaryOp()); - for(size_t i = 0; i < ItemsPerThread; i++){ - keys[index + i] = thread_keys[i]; - values[index + i] = thread_value[i]; - } -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class key_type, - class value_type, - rocprim::block_sort_algorithm algorithm, - class BinaryOp = rocprim::less, - std::enable_if_t = 0> -__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type* keys, - value_type* values) -{} - - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class key_type, - class value_type, - rocprim::block_sort_algorithm algorithm, - class BinaryOp = rocprim::less, - std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), - int> - = 0> - -__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type* keys, - value_type* values, - const unsigned int size - ) -{ - using bsort_type - = rocprim::block_sort; - - ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; - - static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; - const unsigned int block_offset = blockIdx.x * ItemsPerBlock; - const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); - - key_type thread_keys = keys[index]; - value_type thread_value = values[index]; - - bsort_type().sort(thread_keys, - thread_value, - storage, - size, - BinaryOp()); - - keys[index] = thread_keys; - values[index] = thread_value; -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class key_type, - class value_type, - rocprim::block_sort_algorithm algorithm, - class BinaryOp = rocprim::less, - std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), - int> - = 0> - -__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type* keys, - value_type* values, - const unsigned int size - ) -{ - using bsort_type - = rocprim::block_sort; - - ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; - - static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; - const unsigned int block_offset = blockIdx.x * ItemsPerBlock; - const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); - - key_type thread_keys[ItemsPerThread]; - value_type thread_value[ItemsPerThread]; - - for(size_t i = 0; i < ItemsPerThread; i++){ - thread_keys[i] = keys[index + i]; - thread_value[i] = values[index + i]; - } - - bsort_type().sort(thread_keys, - thread_value, - storage, - size, - BinaryOp()); - - for(size_t i = 0; i < ItemsPerThread; i++){ - keys[index + i] = thread_keys[i]; - values[index + i] = thread_value[i]; - } -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class key_type, - class value_type, - rocprim::block_sort_algorithm algorithm, - class BinaryOp = rocprim::less, - std::enable_if_t = 0> -__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type* keys, - value_type* values, const unsigned int size) -{} #endif // TEST_BLOCK_SORT_KERNELS_HPP_ From ce5a2b2d5de024bc3c809713ae9f6401ddc6b13d Mon Sep 17 00:00:00 2001 From: Di Nguyen Date: Tue, 8 Apr 2025 10:39:41 -0600 Subject: [PATCH 07/14] Zenguyen/block sort missing unit test (#3) * finished sort keys no size tests * completed sort pair no size functions * completed sort pair with size functions * updated changelog --- CHANGELOG.md | 1 + test/rocprim/test_block_sort.hpp | 349 ++++++++++++++++++++++- test/rocprim/test_block_sort.kernels.hpp | 252 ++++++++++++++++ 3 files changed, 601 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea472afa042..b39c9af6e3b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec * Added initial value support for warp- and block-level inclusive scan. * Added support for building tests with device-side random data generation, making them finish faster. This requires rocRAND, and is enabled with the `WITH_ROCRAND=ON` build flag. * Added additional unit tests for `test_block_scan.hpp` +* Added additional unit tests for `test_block_sort.hpp` ### Changed diff --git a/test/rocprim/test_block_sort.hpp b/test/rocprim/test_block_sort.hpp index 89fa0fde6f9..f57543a83da 100644 --- a/test/rocprim/test_block_sort.hpp +++ b/test/rocprim/test_block_sort.hpp @@ -333,6 +333,287 @@ void TestSortStableKey(std::vector sizes) } } } + +template +void TestSortKeyNoSize() +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + static constexpr const unsigned int items_per_block = block_size * items_per_thread; + hipStream_t stream = 0; // default + + if(!is_buildable(block_size, items_per_thread, algo)) + { + GTEST_SKIP(); + } + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + size_t grid_size = 1134; + size_t size = items_per_block * grid_size; + SCOPED_TRACE(testing::Message() << "with size = " << size); + // Generate data + std::vector output + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + + // Calculate expected results on host + std::vector expected(output); + binary_op_type binary_op; + for(size_t i = 0; i < grid_size; i++) + { + std::sort(expected.begin() + (i * items_per_block), + expected.begin() + std::min(size, ((i + 1) * items_per_block)), + binary_op); + } + + // Preparing device + common::device_ptr device_key_output(output); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_keys_kernel_no_size), + dim3(grid_size), + dim3(block_size), + 0, + stream, + device_key_output.get() + ); + + // Reading results back + output = device_key_output.load(); + + test_utils::assert_eq(output, expected); + } +} + +template +void TestSortKeyValueNoSize() +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + static constexpr const unsigned int items_per_block = block_size * items_per_thread; + static constexpr const size_t grid_size = 1134; + static constexpr const size_t size = items_per_block * grid_size; + hipStream_t stream = 0; // default + + if(!is_buildable(block_size, items_per_thread, algo)) + { + GTEST_SKIP(); + } + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + SCOPED_TRACE(testing::Message() << "with size = " << size); + + // Generate data + std::vector output_key + = test_utils::get_random_data_wrapped(size, 0, 100, seed_value); + std::vector output_value + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + + // Combine vectors to form pairs with key and value + std::vector> target(size); + for(unsigned i = 0; i < target.size(); i++) + target[i] = std::make_pair(output_key[i], output_value[i]); + + // Calculate expected results on host + using key_value = std::pair; + std::vector expected(target); + constexpr bool descending = !std::is_same>::value; + for(size_t i = 0; i < expected.size() / items_per_block; i++) + { + std::sort(expected.begin() + (i * items_per_block), + expected.begin() + ((i + 1) * items_per_block), + test_utils::key_value_comparator()); + } + + // Preparing device + common::device_ptr device_key_output(output_key); + common::device_ptr device_value_output(output_value); + + // Running kernel, ignored if invalid size + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_pairs_kernel_no_size), + dim3(grid_size), + dim3(block_size), + 0, + stream, + device_key_output.get(), + device_value_output.get()); + HIP_CHECK(hipGetLastError()); + + // Reading results back + output_key = device_key_output.load(); + output_value = device_value_output.load(); + + std::vector expected_key(expected.size()); + std::vector expected_value(expected.size()); + for(size_t i = 0; i < expected.size(); i++) + { + expected_key[i] = expected[i].first; + expected_value[i] = expected[i].second; + } + + // Keys are sorted, Values order not guaranteed + // Sort subsets where key was the same to make sure all values are still present + using value_op_type = rocprim::less; + using eq_op_type = rocprim::equal_to; + value_op_type value_op; + eq_op_type eq_op; + for(size_t i = 0; i < output_key.size();) + { + auto j = i; + for(; j < output_key.size() && eq_op(output_key[j], output_key[i]); ++j) + {} + std::sort(output_value.begin() + i, output_value.begin() + j, value_op); + std::sort(expected_value.begin() + i, expected_value.begin() + j, value_op); + i = j; + } + + test_utils::assert_eq(output_key, expected_key); + test_utils::assert_eq(output_value, expected_value); + } +} + +template +void TestSortKeyValueWithSize() +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + static constexpr const unsigned int items_per_block = block_size * items_per_thread; + static constexpr const size_t grid_size = 1134; + static constexpr const size_t size = items_per_block * grid_size; + hipStream_t stream = 0; // default + + if(!is_buildable(block_size, items_per_thread, algo)) + { + GTEST_SKIP(); + } + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + SCOPED_TRACE(testing::Message() << "with size = " << size); + + // Generate data + std::vector output_key + = test_utils::get_random_data_wrapped(size, 0, 100, seed_value); + std::vector output_value + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + + // Combine vectors to form pairs with key and value + std::vector> target(size); + for(unsigned i = 0; i < target.size(); i++) + target[i] = std::make_pair(output_key[i], output_value[i]); + + // Calculate expected results on host + using key_value = std::pair; + std::vector expected(target); + constexpr bool descending = !std::is_same>::value; + for(size_t i = 0; i < expected.size() / items_per_block; i++) + { + std::sort(expected.begin() + (i * items_per_block), + expected.begin() + ((i + 1) * items_per_block), + test_utils::key_value_comparator()); + } + + // Preparing device + common::device_ptr device_key_output(output_key); + common::device_ptr device_value_output(output_value); + + // Running kernel, ignored if invalid size + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_pairs_kernel_with_size), + dim3(grid_size), + dim3(block_size), + 0, + stream, + device_key_output.get(), + device_value_output.get(), + items_per_block); + HIP_CHECK(hipGetLastError()); + + // Reading results back + output_key = device_key_output.load(); + output_value = device_value_output.load(); + + std::vector expected_key(expected.size()); + std::vector expected_value(expected.size()); + for(size_t i = 0; i < expected.size(); i++) + { + expected_key[i] = expected[i].first; + expected_value[i] = expected[i].second; + } + + // Keys are sorted, Values order not guaranteed + // Sort subsets where key was the same to make sure all values are still present + using value_op_type = rocprim::less; + using eq_op_type = rocprim::equal_to; + value_op_type value_op; + eq_op_type eq_op; + for(size_t i = 0; i < output_key.size();) + { + auto j = i; + for(; j < output_key.size() && eq_op(output_key[j], output_key[i]); ++j) + {} + std::sort(output_value.begin() + i, output_value.begin() + j, value_op); + std::sort(expected_value.begin() + i, expected_value.begin() + j, value_op); + i = j; + } + + test_utils::assert_eq(output_key, expected_key); + test_utils::assert_eq(output_value, expected_value); + } +} + #endif // TEST_ROCPRIM_TEST_BLOCK_SORT_HPP_ // This file is included multiple times in the test_block_sort_[algo].cpp file, because // the test definitions below this header guard need to be compiled for each test suites: @@ -429,7 +710,7 @@ typed_test_def(suite_name, static constexpr const unsigned int block_size = TestFixture::block_size; static constexpr const unsigned int items_per_thread = 4; std::vector sizes - = {0, 53, 512, 5000, 34567, (1 << 17) - 1220, 1134 * 256, (1 << 20) - 123}; + = {0,53, 512, 5000, 34567, (1 << 17) - 1220, 1134 * 256, (1 << 20) - 123}; TestSortKey(sizes); } @@ -443,3 +724,69 @@ typed_test_def(suite_name, name_suffix, SortKeyValueDesc) static constexpr const unsigned int items_per_thread = 1; TestSortKeyValue(); } + +typed_test_def(suite_name, name_suffix, SortKeyNoSize) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::greater; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 1; + TestSortKeyNoSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyNoSizeMultipleItemsPerThread) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::greater; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 4; + TestSortKeyNoSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyValueNoSize) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::less; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 1; + TestSortKeyValueNoSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyValueNoSizeMultipleItemsPerThread) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::less; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 4; + TestSortKeyValueNoSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyValueWithSize) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::less; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 1; + TestSortKeyValueWithSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyValueWithSizeMultipleItemsPerThread) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::less; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 4; + TestSortKeyValueWithSize(); +} diff --git a/test/rocprim/test_block_sort.kernels.hpp b/test/rocprim/test_block_sort.kernels.hpp index d595471d39a..6aab8eaee08 100644 --- a/test/rocprim/test_block_sort.kernels.hpp +++ b/test/rocprim/test_block_sort.kernels.hpp @@ -237,4 +237,256 @@ __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(key_type* /*keys* OffsetT /*size*/) {} +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class KeyIterator, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less::value_type>, + std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> +__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator keys) +{ + using key_type = typename std::iterator_traits::value_type; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + threadIdx.x * ItemsPerThread; + + using bsort_type + = rocprim::block_sort; + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + key_type thread_key = keys[index]; + + bsort_type().sort(thread_key, + storage, + BinaryOp()); + + keys[index] = thread_key; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class KeyIterator, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less::value_type>, + std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> +__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator keys) +{ + using key_type = typename std::iterator_traits::value_type; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + threadIdx.x * ItemsPerThread; + + using bsort_type + = rocprim::block_sort; + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + key_type thread_keys[ItemsPerThread]; + for(size_t i = 0; i < ItemsPerThread; i++) + thread_keys[i] = keys[index + i]; + // + bsort_type().sort(thread_keys, + storage, + BinaryOp()); + for(size_t i = 0; i < ItemsPerThread; i++) + keys[index + i] = thread_keys[i]; +} + +template::value_type>, + std::enable_if_t = 0> +__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator) +{} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> + +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type* keys, + value_type* values) +{ + using bsort_type + = rocprim::block_sort; + + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + key_type thread_key = keys[index]; + value_type thread_value = values[index]; + + bsort_type().sort(thread_key, + thread_value, + storage, + BinaryOp() + ); + keys[index] = thread_key; + values[index] = thread_value; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> + +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type* keys, + value_type* values) +{ + using bsort_type + = rocprim::block_sort; + + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + + key_type thread_keys[ItemsPerThread]; + value_type thread_value[ItemsPerThread]; + + for(size_t i = 0; i < ItemsPerThread; i++){ + thread_keys[i] = keys[index + i]; + thread_value[i] = values[index + i]; + } + bsort_type().sort(thread_keys, + thread_value, + storage, + BinaryOp()); + for(size_t i = 0; i < ItemsPerThread; i++){ + keys[index + i] = thread_keys[i]; + values[index + i] = thread_value[i]; + } +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t = 0> +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type* keys, + value_type* values) +{} + + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> + +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type* keys, + value_type* values, + const unsigned int size + ) +{ + using bsort_type + = rocprim::block_sort; + + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + + key_type thread_keys = keys[index]; + value_type thread_value = values[index]; + + bsort_type().sort(thread_keys, + thread_value, + storage, + size, + BinaryOp()); + + keys[index] = thread_keys; + values[index] = thread_value; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> + +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type* keys, + value_type* values, + const unsigned int size + ) +{ + using bsort_type + = rocprim::block_sort; + + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + + key_type thread_keys[ItemsPerThread]; + value_type thread_value[ItemsPerThread]; + + for(size_t i = 0; i < ItemsPerThread; i++){ + thread_keys[i] = keys[index + i]; + thread_value[i] = values[index + i]; + } + + bsort_type().sort(thread_keys, + thread_value, + storage, + size, + BinaryOp()); + + for(size_t i = 0; i < ItemsPerThread; i++){ + keys[index + i] = thread_keys[i]; + values[index + i] = thread_value[i]; + } +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t = 0> +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type* keys, + value_type* values, const unsigned int size) +{} #endif // TEST_BLOCK_SORT_KERNELS_HPP_ From 092328169507f1d1e61f3c285b4e1540ca7fdae3 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 8 Apr 2025 10:39:53 -0600 Subject: [PATCH 08/14] removed redundant tests --- test/rocprim/test_block_store.hpp | 193 -------------------- test/rocprim/test_block_store.kernels.hpp | 154 ---------------- test/rocprim/test_block_store_direct.cpp | 58 ------ test/rocprim/test_block_store_striped.cpp | 58 ------ test/rocprim/test_block_store_transpose.cpp | 58 ------ test/rocprim/test_block_store_vectorize.cpp | 58 ------ 6 files changed, 579 deletions(-) delete mode 100644 test/rocprim/test_block_store.hpp delete mode 100644 test/rocprim/test_block_store.kernels.hpp delete mode 100644 test/rocprim/test_block_store_direct.cpp delete mode 100644 test/rocprim/test_block_store_striped.cpp delete mode 100644 test/rocprim/test_block_store_transpose.cpp delete mode 100644 test/rocprim/test_block_store_vectorize.cpp diff --git a/test/rocprim/test_block_store.hpp b/test/rocprim/test_block_store.hpp deleted file mode 100644 index c37f51b50c6..00000000000 --- a/test/rocprim/test_block_store.hpp +++ /dev/null @@ -1,193 +0,0 @@ -#include "../common_test_header.hpp" -#include "test_utils.hpp" - -#include "../../common/utils.hpp" -#include "../../common/utils_device_ptr.hpp" -#include "test_seed.hpp" -#include "test_utils_assertions.hpp" -#include "test_utils_data_generation.hpp" -#include "test_utils_sort_comparator.hpp" - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -block_store_test_suite_type_def(suite_name, name_suffix); -typed_test_suite_def(suite_name, name_suffix, block_params); - -// using header guards for these test functions because this file is included multiple times: -// once for the integrals test suite and once for the floating point test suite. -#ifndef TEST_ROCPRIM_TEST_BLOCK_STORE_HPP_ - #define TEST_ROCPRIM_TEST_BLOCK_STORE_HPP_ - -template< - bool use_size, - unsigned int block_size, - unsigned int items_per_thread, - typename DataType, - rocprim::block_store_method algorithm -> -void TestStore(){ - - constexpr size_t items_per_block = block_size * items_per_thread; - constexpr size_t grid_size = 120; - constexpr size_t size = items_per_block * grid_size; - - - std::vector host_input(size); - common::device_ptr device_output(host_input); - - for(size_t i = 0; i < size; i++) host_input[i] = static_cast(i); - common::device_ptr device_input(host_input); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kernel< - use_size, - block_size, - items_per_thread, - DataType, - algorithm - >), - dim3(grid_size), - dim3(block_size), - 0, - 0, - device_input.get(), - device_output.get() - ); - - HIP_CHECK(hipGetLastError()); - std::vector host_output = device_output.load(); - test_utils::assert_eq(host_input, host_output); -} - -template< - bool use_size, - unsigned int block_size, - unsigned int items_per_thread, - typename DataType, - rocprim::block_store_method algorithm -> -void TestStoreWithStorage(){ - - constexpr size_t items_per_block = block_size * items_per_thread; - constexpr size_t grid_size = 120; - constexpr size_t size = items_per_block * grid_size; - - - std::vector host_input(size); - common::device_ptr device_output(host_input); - - for(size_t i = 0; i < size; i++) host_input[i] = static_cast(i); - common::device_ptr device_input(host_input); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kernel_with_storage< - use_size, - block_size, - items_per_thread, - DataType, - algorithm - >), - dim3(grid_size), - dim3(block_size), - 0, - 0, - device_input.get(), - device_output.get() - ); - - HIP_CHECK(hipGetLastError()); - std::vector host_output = device_output.load(); - test_utils::assert_eq(host_input, host_output); -} - -#endif - -typed_test_def(suite_name, name_suffix, Store) -{ - using DataType = typename TestFixture::DataType; - static constexpr const rocprim::block_store_method algo = TEST_BLOCK_STORE_ALGORITHM; - static constexpr const unsigned int block_size = TestFixture::block_size; - static constexpr const unsigned int items_per_thread = 1; - static constexpr const bool use_size = false; - TestStore(); -} - -typed_test_def(suite_name, name_suffix, StoreMultipleItemsPerThread) -{ - using DataType = typename TestFixture::DataType; - static constexpr const rocprim::block_store_method algo = TEST_BLOCK_STORE_ALGORITHM; - static constexpr const unsigned int block_size = TestFixture::block_size; - static constexpr const unsigned int items_per_thread = 4; - static constexpr const bool use_size = false; - TestStore(); -} - -typed_test_def(suite_name, name_suffix, StoreWithSize) -{ - using DataType = typename TestFixture::DataType; - static constexpr const rocprim::block_store_method algo = TEST_BLOCK_STORE_ALGORITHM; - static constexpr const unsigned int block_size = TestFixture::block_size; - static constexpr const unsigned int items_per_thread = 1; - static constexpr const bool use_size = true; - TestStore(); -} - -typed_test_def(suite_name, name_suffix, StoreWithSizeMultipleItemsPerThread) -{ - using DataType = typename TestFixture::DataType; - static constexpr const rocprim::block_store_method algo = TEST_BLOCK_STORE_ALGORITHM; - static constexpr const unsigned int block_size = TestFixture::block_size; - static constexpr const unsigned int items_per_thread = 4; - static constexpr const bool use_size = true; - TestStore(); -} - -typed_test_def(suite_name, name_suffix, StoreWithStorage) -{ - using DataType = typename TestFixture::DataType; - static constexpr const rocprim::block_store_method algo = TEST_BLOCK_STORE_ALGORITHM; - static constexpr const unsigned int block_size = TestFixture::block_size; - static constexpr const unsigned int items_per_thread = 1; - static constexpr const bool use_size = false; - TestStoreWithStorage(); -} - -typed_test_def(suite_name, name_suffix, StoreMultipleItemsPerThreadWithStorage) -{ - using DataType = typename TestFixture::DataType; - static constexpr const rocprim::block_store_method algo = TEST_BLOCK_STORE_ALGORITHM; - static constexpr const unsigned int block_size = TestFixture::block_size; - static constexpr const unsigned int items_per_thread = 4; - static constexpr const bool use_size = false; - TestStoreWithStorage(); -} - -typed_test_def(suite_name, name_suffix, StoreWithSizeWithStorage) -{ - using DataType = typename TestFixture::DataType; - static constexpr const rocprim::block_store_method algo = TEST_BLOCK_STORE_ALGORITHM; - static constexpr const unsigned int block_size = TestFixture::block_size; - static constexpr const unsigned int items_per_thread = 1; - static constexpr const bool use_size = true; - TestStoreWithStorage(); -} - -typed_test_def(suite_name, name_suffix, StoreWithSizeMultipleItemsPerThreadWithStorage) -{ - using DataType = typename TestFixture::DataType; - static constexpr const rocprim::block_store_method algo = TEST_BLOCK_STORE_ALGORITHM; - static constexpr const unsigned int block_size = TestFixture::block_size; - static constexpr const unsigned int items_per_thread = 4; - static constexpr const bool use_size = true; - TestStoreWithStorage(); -} diff --git a/test/rocprim/test_block_store.kernels.hpp b/test/rocprim/test_block_store.kernels.hpp deleted file mode 100644 index 8d9303588fd..00000000000 --- a/test/rocprim/test_block_store.kernels.hpp +++ /dev/null @@ -1,154 +0,0 @@ -#ifndef TEST_BLOCK_STORE_KERNELS_HPP_ -#define TEST_BLOCK_STORE_KERNELS_HPP_ - - - -constexpr bool is_buildable(unsigned int BlockSize, - unsigned int ItemsPerThread, - rocprim::block_store_method algorithm - ) -{ - switch(algorithm) - { - case rocprim::block_store_method::block_store_direct: - case rocprim::block_store_method::block_store_striped: - case rocprim::block_store_method::block_store_transpose: - return true; - case rocprim::block_store_method::block_store_vectorize: - return (ItemsPerThread % 2 == 0) && ((BlockSize * ItemsPerThread) % 4 == 0); - case rocprim::block_store_method::block_store_warp_transpose: - return BlockSize % rocprim::device_warp_size() == 0; - } - return false; -} - -template< - bool useSize, - unsigned int BlockSize, - unsigned int ItemsPerThread, - typename DataType, - rocprim::block_store_method algorithm, - std::enable_if_t<(is_buildable(BlockSize, ItemsPerThread, algorithm)), - int> = 0 - > -__global__ __launch_bounds__(BlockSize) void store_kernel(DataType * input, DataType * output){ - using bstore_type = rocprim::block_store; - - static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; - const unsigned int block_offset = blockIdx.x * ItemsPerBlock; - const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); - - DataType temp[ItemsPerThread]; - __shared__ DataType storage[ItemsPerBlock]; - - for(unsigned int i = 0; i < ItemsPerThread; i++){ - switch(algorithm){ - case rocprim::block_store_method::block_store_direct: - case rocprim::block_store_method::block_store_transpose: - case rocprim::block_store_method::block_store_vectorize: - temp[i] = input[index + i]; - break; - - case rocprim::block_store_method::block_store_striped: - temp[i] = input[block_offset + (threadIdx.x + i * BlockSize)]; - break; - } - } - - if(useSize) - bstore_type().store(storage, temp, ItemsPerBlock); - else - bstore_type().store(storage, temp); - - __syncthreads(); - - for(unsigned int i = 0; i < ItemsPerThread; i++){ - output[index + i] = storage[threadIdx.x * ItemsPerThread + i]; - } -} - -template< - bool useSize, - unsigned int BlockSize, - unsigned int ItemsPerThread, - typename DataType, - rocprim::block_store_method algorithm, - std::enable_if_t = 0 - > -__global__ __launch_bounds__(BlockSize) void store_kernel(DataType * input, DataType * output){ - static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; - const unsigned int block_offset = blockIdx.x * ItemsPerBlock; - const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); - - for(unsigned int i = 0; i < ItemsPerThread; i++){ - output[index + i] = input[index + i]; - } -} - -template< - bool useSize, - unsigned int BlockSize, - unsigned int ItemsPerThread, - typename DataType, - rocprim::block_store_method algorithm, - std::enable_if_t<(is_buildable(BlockSize, ItemsPerThread, algorithm)), - int> = 0 - > -__global__ __launch_bounds__(BlockSize) void store_kernel_with_storage(DataType * input, DataType * output){ - using bstore_type = rocprim::block_store; - - static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; - const unsigned int block_offset = blockIdx.x * ItemsPerBlock; - const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); - - DataType temp[ItemsPerThread]; - __shared__ DataType temp_out[ItemsPerBlock]; - - for(unsigned int i = 0; i < ItemsPerThread; i++){ - switch(algorithm){ - case rocprim::block_store_method::block_store_direct: - case rocprim::block_store_method::block_store_transpose: - case rocprim::block_store_method::block_store_vectorize: - temp[i] = input[index + i]; - break; - - case rocprim::block_store_method::block_store_striped: - temp[i] = input[block_offset + (threadIdx.x + i * BlockSize)]; - break; - } - } - ROCPRIM_SHARED_MEMORY typename bstore_type::storage_type storage; - - if(useSize) - bstore_type().store(temp_out, temp, ItemsPerBlock, storage); - else - bstore_type().store(temp_out, temp, storage); - - __syncthreads(); - - for(unsigned int i = 0; i < ItemsPerThread; i++){ - output[index + i] = temp_out[threadIdx.x * ItemsPerThread + i]; - } -} - -template< - bool useSize, - unsigned int BlockSize, - unsigned int ItemsPerThread, - typename DataType, - rocprim::block_store_method algorithm, - std::enable_if_t = 0 - > -__global__ __launch_bounds__(BlockSize) void store_kernel_with_storage(DataType * input, DataType * output){ - static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; - const unsigned int block_offset = blockIdx.x * ItemsPerBlock; - const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); - - for(unsigned int i = 0; i < ItemsPerThread; i++){ - output[index + i] = input[index + i]; - } -} - -#endif \ No newline at end of file diff --git a/test/rocprim/test_block_store_direct.cpp b/test/rocprim/test_block_store_direct.cpp deleted file mode 100644 index ce23a7fa7ad..00000000000 --- a/test/rocprim/test_block_store_direct.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// MIT License -// -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include "../common_test_header.hpp" - -// required rocprim headers -#include - -// required test headers -#include "test_utils_types.hpp" - -// required common headers -#include "../../common/utils_device_ptr.hpp" - -// kernel definitions -#include "test_block_store.kernels.hpp" - -// Start stamping out tests -struct RocprimBlockStoreDirectTests; -#ifndef TEST_BLOCK_STORE_ALGORITHM - #define TEST_BLOCK_STORE_ALGORITHM rocprim::block_store_method::block_store_direct -#endif -struct Integral; -#define suite_name RocprimBlockStoreDirectTests -#define block_params BlockParamsIntegral -#define name_suffix Integral - -#include "test_block_store.hpp" - -#undef suite_name -#undef block_params -#undef name_suffix - -struct Floating; -#define suite_name RocprimBlockStoreDirectTests -#define block_params BlockParamsFloating -#define name_suffix Floating - -#include "test_block_store.hpp" diff --git a/test/rocprim/test_block_store_striped.cpp b/test/rocprim/test_block_store_striped.cpp deleted file mode 100644 index cf01a40882c..00000000000 --- a/test/rocprim/test_block_store_striped.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// MIT License -// -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include "../common_test_header.hpp" - -// required rocprim headers -#include - -// required test headers -#include "test_utils_types.hpp" - -// required common headers -#include "../../common/utils_device_ptr.hpp" - -// kernel definitions -#include "test_block_store.kernels.hpp" - -// Start stamping out tests -struct RocprimBlockStoreStripedTests; -#ifndef TEST_BLOCK_STORE_ALGORITHM - #define TEST_BLOCK_STORE_ALGORITHM rocprim::block_store_method::block_store_striped -#endif -struct Integral; -#define suite_name RocprimBlockStoreStripedTests -#define block_params BlockParamsIntegral -#define name_suffix Integral - -#include "test_block_store.hpp" - -#undef suite_name -#undef block_params -#undef name_suffix - -struct Floating; -#define suite_name RocprimBlockStoreStripedTests -#define block_params BlockParamsFloating -#define name_suffix Floating - -#include "test_block_store.hpp" diff --git a/test/rocprim/test_block_store_transpose.cpp b/test/rocprim/test_block_store_transpose.cpp deleted file mode 100644 index 37f66782b59..00000000000 --- a/test/rocprim/test_block_store_transpose.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// MIT License -// -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include "../common_test_header.hpp" - -// required rocprim headers -#include - -// required test headers -#include "test_utils_types.hpp" - -// required common headers -#include "../../common/utils_device_ptr.hpp" - -// kernel definitions -#include "test_block_store.kernels.hpp" - -// Start stamping out tests -struct RocprimBlockStoreTransposeTests; -#ifndef TEST_BLOCK_STORE_ALGORITHM - #define TEST_BLOCK_STORE_ALGORITHM rocprim::block_store_method::block_store_transpose -#endif -struct Integral; -#define suite_name RocprimBlockStoreTransposeTests -#define block_params BlockParamsIntegral -#define name_suffix Integral - -#include "test_block_store.hpp" - -#undef suite_name -#undef block_params -#undef name_suffix - -struct Floating; -#define suite_name RocprimBlockStoreTransposeTests -#define block_params BlockParamsFloating -#define name_suffix Floating - -#include "test_block_store.hpp" diff --git a/test/rocprim/test_block_store_vectorize.cpp b/test/rocprim/test_block_store_vectorize.cpp deleted file mode 100644 index eee434a820a..00000000000 --- a/test/rocprim/test_block_store_vectorize.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// MIT License -// -// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include "../common_test_header.hpp" - -// required rocprim headers -#include - -// required test headers -#include "test_utils_types.hpp" - -// required common headers -#include "../../common/utils_device_ptr.hpp" - -// kernel definitions -#include "test_block_store.kernels.hpp" - -// Start stamping out tests -struct RocprimBlockStoreVectorizeTests; -#ifndef TEST_BLOCK_STORE_ALGORITHM - #define TEST_BLOCK_STORE_ALGORITHM rocprim::block_store_method::block_store_vectorize -#endif -struct Integral; -#define suite_name RocprimBlockStoreVectorizeTests -#define block_params BlockParamsIntegral -#define name_suffix Integral - -#include "test_block_store.hpp" - -#undef suite_name -#undef block_params -#undef name_suffix - -struct Floating; -#define suite_name RocprimBlockStoreVectorizeTests -#define block_params BlockParamsFloating -#define name_suffix Floating - -#include "test_block_store.hpp" From 1e9f8beb6099de6edeabf9ccab1260e5c14a99c9 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 8 Apr 2025 10:41:46 -0600 Subject: [PATCH 09/14] added back deleted documentation --- rocprim/include/rocprim/block/block_store.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rocprim/include/rocprim/block/block_store.hpp b/rocprim/include/rocprim/block/block_store.hpp index faed25018e2..905cb0c01ad 100644 --- a/rocprim/include/rocprim/block/block_store.hpp +++ b/rocprim/include/rocprim/block/block_store.hpp @@ -40,6 +40,8 @@ BEGIN_ROCPRIM_NAMESPACE /// of items into a blocked/striped arrangement on continuous memory enum class block_store_method { + /// A blocked arrangement of items is stored into a blocked arrangement on continuous + /// memory. /// \par Performance Notes: /// * Performance decreases with increasing number of items per thread (stride /// between reads), because of reduced memory coalescing. From 25525deac1432c4b8e590abef19d1b895de555dc Mon Sep 17 00:00:00 2001 From: Di Nguyen Date: Tue, 8 Apr 2025 15:53:25 -0600 Subject: [PATCH 10/14] Implement additional missing block store/load tests (#4) * implemented storage tests * added default test with storage * updated changelog --- CHANGELOG.md | 1 + test/rocprim/CMakeLists.txt | 4 - test/rocprim/test_block_load_store.hpp | 259 ++++++++++++++++++ .../rocprim/test_block_load_store.kernels.hpp | 143 ++++++++++ 4 files changed, 403 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b39c9af6e3b..94a6549b756 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec * Added support for building tests with device-side random data generation, making them finish faster. This requires rocRAND, and is enabled with the `WITH_ROCRAND=ON` build flag. * Added additional unit tests for `test_block_scan.hpp` * Added additional unit tests for `test_block_sort.hpp` +* Added additional unit tests for `test_block_load.hpp` ### Changed diff --git a/test/rocprim/CMakeLists.txt b/test/rocprim/CMakeLists.txt index ddf9e6e3b88..ff9f06979e8 100644 --- a/test/rocprim/CMakeLists.txt +++ b/test/rocprim/CMakeLists.txt @@ -269,10 +269,6 @@ add_rocprim_test("rocprim.block_histogram" test_block_histogram.cpp) add_rocprim_test("rocprim.block_load_store" test_block_load_store.cpp) add_rocprim_test("rocprim.block_sort_merge" test_block_sort_merge.cpp) add_rocprim_test("rocprim.block_sort_merge_stable" test_block_sort_merge_stable.cpp) -add_rocprim_test("rocprim.block_store_direct" test_block_store_direct.cpp) -add_rocprim_test("rocprim.block_store_striped" test_block_store_striped.cpp) -add_rocprim_test("rocprim.block_store_transpose" test_block_store_transpose.cpp) -add_rocprim_test("rocprim.block_store_vectorize" test_block_store_vectorize.cpp) add_rocprim_test_parallel("rocprim.block_radix_rank" test_block_radix_rank.cpp.in) add_rocprim_test_parallel("rocprim.block_radix_sort" test_block_radix_sort.cpp.in) add_rocprim_test("rocprim.block_reduce" test_block_reduce.cpp) diff --git a/test/rocprim/test_block_load_store.hpp b/test/rocprim/test_block_load_store.hpp index 382a8061f08..bafb9b95b57 100644 --- a/test/rocprim/test_block_load_store.hpp +++ b/test/rocprim/test_block_load_store.hpp @@ -192,6 +192,179 @@ typed_test_def(suite_name, name_suffix, LoadStoreClassValid) } +typed_test_def(suite_name, name_suffix, LoadStoreClassWithStorage) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using Type = typename TestFixture::params::type; + static constexpr size_t block_size = TestFixture::params::block_size; + static constexpr rocprim::block_load_method load_method = TestFixture::params::load_method; + static constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; + static constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 113; + const auto grid_size = size / items_per_block; + // Given block size not supported + if(block_size > test_utils::get_max_block_size() || (block_size & (block_size - 1)) != 0) + { + return; + } + + if(load_method == rocprim::block_load_method::block_load_warp_transpose + || store_method == rocprim::block_store_method::block_store_warp_transpose) + { + unsigned int host_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, host_warp_size)); + if(block_size % host_warp_size != 0) + { + GTEST_SKIP() << "Cannot run test of block size " << block_size + << " on a device with warp size " << host_warp_size; + } + } + + const size_t valid = items_per_block; + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector input + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + std::vector output(input.size(), (Type)0); + + // Calculate expected results on host + std::vector expected(input.size(), (Type)0); + for (size_t i = 0; i < 113; i++) + { + size_t block_offset = i * items_per_block; + for (size_t j = 0; j < items_per_block; j++) + { + if (j < valid) + { + expected[j + block_offset] = input[j + block_offset]; + } + } + } + + // Preparing device + common::device_ptr device_input(input); + // Have to initialize output for unvalid data to make sure they are not changed + common::device_ptr device_output(output); + + // Running kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(load_store_storage_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input.get(), + device_output.get()); + HIP_CHECK(hipGetLastError()); + + // Reading results from device + output = device_output.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + } + +} + +typed_test_def(suite_name, name_suffix, LoadStoreClassValidWithStorage) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using Type = typename TestFixture::params::type; + static constexpr size_t block_size = TestFixture::params::block_size; + static constexpr rocprim::block_load_method load_method = TestFixture::params::load_method; + static constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; + static constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 113; + const auto grid_size = size / items_per_block; + // Given block size not supported + if(block_size > test_utils::get_max_block_size() || (block_size & (block_size - 1)) != 0) + { + return; + } + + if(load_method == rocprim::block_load_method::block_load_warp_transpose + || store_method == rocprim::block_store_method::block_store_warp_transpose) + { + unsigned int host_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, host_warp_size)); + if(block_size % host_warp_size != 0) + { + GTEST_SKIP() << "Cannot run test of block size " << block_size + << " on a device with warp size " << host_warp_size; + } + } + + const size_t valid = items_per_block - 32; + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector input + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + std::vector output(input.size(), (Type)0); + + // Calculate expected results on host + std::vector expected(input.size(), (Type)0); + for (size_t i = 0; i < 113; i++) + { + size_t block_offset = i * items_per_block; + for (size_t j = 0; j < items_per_block; j++) + { + if (j < valid) + { + expected[j + block_offset] = input[j + block_offset]; + } + } + } + + // Preparing device + common::device_ptr device_input(input); + // Have to initialize output for unvalid data to make sure they are not changed + common::device_ptr device_output(output); + + // Running kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(load_store_storage_valid_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input.get(), + device_output.get(), + valid); + HIP_CHECK(hipGetLastError()); + + // Reading results from device + output = device_output.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + } + +} + typed_test_def(suite_name, name_suffix, LoadStoreClassDefault) { int device_id = test_common_utils::obtain_device_from_ctest(); @@ -277,3 +450,89 @@ typed_test_def(suite_name, name_suffix, LoadStoreClassDefault) ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); } } + +typed_test_def(suite_name, name_suffix, LoadStoreClassDefaultWithStorage) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using Type = typename TestFixture::params::type; + static constexpr size_t block_size = TestFixture::params::block_size; + static constexpr rocprim::block_load_method load_method = TestFixture::params::load_method; + static constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; + static constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 113; + const auto grid_size = size / items_per_block; + // Given block size not supported + if(block_size > test_utils::get_max_block_size() || (block_size & (block_size - 1)) != 0) + { + return; + } + + if(load_method == rocprim::block_load_method::block_load_warp_transpose + || store_method == rocprim::block_store_method::block_store_warp_transpose) + { + unsigned int host_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, host_warp_size)); + if(block_size % host_warp_size != 0) + { + GTEST_SKIP() << "Cannot run test of block size " << block_size + << " on a device with warp size " << host_warp_size; + } + } + + const size_t valid = items_per_thread + 1; + Type _default = (Type)-1; + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector input + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + + // Calculate expected results on host + std::vector expected(input.size(), _default); + for (size_t i = 0; i < 113; i++) + { + size_t block_offset = i * items_per_block; + for (size_t j = 0; j < items_per_block; j++) + { + if (j < valid) + { + expected[j + block_offset] = input[j + block_offset]; + } + } + } + + // Preparing device + common::device_ptr device_input(input); + common::device_ptr device_output(size); + + // Running kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(load_store_valid_default_storage_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input.get(), + device_output.get(), + valid, + _default); + HIP_CHECK(hipGetLastError()); + + // Reading results from device + const auto output = device_output.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + } +} diff --git a/test/rocprim/test_block_load_store.kernels.hpp b/test/rocprim/test_block_load_store.kernels.hpp index 44b81ad7d4b..a0a7dc10634 100644 --- a/test/rocprim/test_block_load_store.kernels.hpp +++ b/test/rocprim/test_block_load_store.kernels.hpp @@ -553,6 +553,100 @@ void load_store_valid_kernel(Type* device_input, Type* device_output, size_t val store.store(device_output + offset, _items, (unsigned int)valid); } +template< + class Type, + rocprim::block_load_method LoadMethod, + rocprim::block_store_method StoreMethod, + unsigned int BlockSize, + unsigned int ItemsPerThread, + std::enable_if_t<(enable_block_load_store_test::value), int> = 0 +> +__global__ +__launch_bounds__(BlockSize) +void load_store_storage_kernel(Type* device_input, Type* device_output) +{ + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; + using impl = get_block_load_store; + typename impl::block_load load; + typename impl::block_store store; + + ROCPRIM_SHARED_MEMORY typename impl::block_load::storage_type load_storage; + ROCPRIM_SHARED_MEMORY typename impl::block_store::storage_type store_storage; + + load.load(device_input + offset, _items, load_storage); + store.store(device_output + offset, _items, store_storage); +} + +// The default kernel in case load store storage kernel fails to build +template< + class Type, + rocprim::block_load_method LoadMethod, + rocprim::block_store_method StoreMethod, + unsigned int BlockSize, + unsigned int ItemsPerThread, + std::enable_if_t::value), int> = 0 +> +__global__ +__launch_bounds__(BlockSize) +void load_store_storage_kernel(Type* device_input, Type* device_output) +{ + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; + using impl = get_block_load_store; + typename impl::block_load load; + typename impl::block_store store; + load.load(device_input + offset, _items); + store.store(device_output + offset, _items); +} + +template< + class Type, + rocprim::block_load_method LoadMethod, + rocprim::block_store_method StoreMethod, + unsigned int BlockSize, + unsigned int ItemsPerThread, + std::enable_if_t<(enable_block_load_store_test::value), int> = 0 +> +__global__ +__launch_bounds__(BlockSize) +void load_store_storage_valid_kernel(Type* device_input, Type* device_output, size_t valid) +{ + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; + using impl = get_block_load_store; + typename impl::block_load load; + typename impl::block_store store; + + ROCPRIM_SHARED_MEMORY typename impl::block_load::storage_type load_storage; + ROCPRIM_SHARED_MEMORY typename impl::block_store::storage_type store_storage; + + load.load(device_input + offset, _items, (unsigned int)valid, load_storage); + store.store(device_output + offset, _items, (unsigned int)valid, store_storage); +} + +// The default kernel in case load store storage valid kernel fails to build +template< + class Type, + rocprim::block_load_method LoadMethod, + rocprim::block_store_method StoreMethod, + unsigned int BlockSize, + unsigned int ItemsPerThread, + std::enable_if_t::value), int> = 0 +> +__global__ +__launch_bounds__(BlockSize) +void load_store_storage_valid_kernel(Type* device_input, Type* device_output, size_t valid) +{ + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; + using impl = get_block_load_store; + typename impl::block_load load; + typename impl::block_store store; + load.load(device_input + offset, _items, (unsigned int)valid); + store.store(device_output + offset, _items, (unsigned int)valid); +} + template< class Type, rocprim::block_load_method LoadMethod, @@ -574,4 +668,53 @@ void load_store_valid_default_kernel(Type* device_input, Type* device_output, si store.store(device_output + offset, _items); } +template< + class Type, + rocprim::block_load_method LoadMethod, + rocprim::block_store_method StoreMethod, + unsigned int BlockSize, + unsigned int ItemsPerThread, + class Def, + std::enable_if_t<(enable_block_load_store_test::value), int> = 0 +> +__global__ +__launch_bounds__(BlockSize) +void load_store_valid_default_storage_kernel(Type* device_input, Type* device_output, size_t valid, Def _default) +{ + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; + using impl = get_block_load_store; + typename impl::block_load load; + typename impl::block_store store; + + ROCPRIM_SHARED_MEMORY typename impl::block_load::storage_type load_storage; + ROCPRIM_SHARED_MEMORY typename impl::block_store::storage_type store_storage; + + load.load(device_input + offset, _items, (unsigned int)valid, _default, load_storage); + store.store(device_output + offset, _items, store_storage); +} + +// The default kernel in case load_store_valid_default_storage_kernel fails to build +template< + class Type, + rocprim::block_load_method LoadMethod, + rocprim::block_store_method StoreMethod, + unsigned int BlockSize, + unsigned int ItemsPerThread, + class Def, + std::enable_if_t::value), int> = 0 +> +__global__ +__launch_bounds__(BlockSize) +void load_store_valid_default_storage_kernel(Type* device_input, Type* device_output, size_t valid, Def _default) +{ + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; + using impl = get_block_load_store; + typename impl::block_load load; + typename impl::block_store store; + load.load(device_input + offset, _items, (unsigned int)valid, _default); + store.store(device_output + offset, _items); +} + #endif // TEST_BLOCK_LOAD_STORE_KERNELS_HPP_ From 6fbb542d04e87e4a8107340d07de47e1f410f988 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 8 Apr 2025 15:54:43 -0600 Subject: [PATCH 11/14] updated changelog to include block store --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94a6549b756..e00641f08e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,9 +18,10 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec * Added the `rocprim::merge_inplace` function for merging in-place. * Added initial value support for warp- and block-level inclusive scan. * Added support for building tests with device-side random data generation, making them finish faster. This requires rocRAND, and is enabled with the `WITH_ROCRAND=ON` build flag. +* Added additional unit tests for `test_block_load.hpp` * Added additional unit tests for `test_block_scan.hpp` * Added additional unit tests for `test_block_sort.hpp` -* Added additional unit tests for `test_block_load.hpp` +* Added additional unit tests for `test_block_store.hpp` ### Changed From ca394e8b3c6e38179e041bb895ffb7ab46648331 Mon Sep 17 00:00:00 2001 From: Di Nguyen Date: Wed, 16 Apr 2025 11:26:16 -0600 Subject: [PATCH 12/14] implement missing radix rank tests and missing `rank_keys_desc` function (#5) * added addition unit tests and implement missing function in match * updated changelog --- CHANGELOG.md | 2 + .../block/detail/block_radix_rank_match.hpp | 13 + test/rocprim/test_block_radix_rank.hpp | 235 +++++++++++++++++- 3 files changed, 248 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e00641f08e2..d04be22adfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,9 +19,11 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec * Added initial value support for warp- and block-level inclusive scan. * Added support for building tests with device-side random data generation, making them finish faster. This requires rocRAND, and is enabled with the `WITH_ROCRAND=ON` build flag. * Added additional unit tests for `test_block_load.hpp` +* Added additional unit tests for `test_block_rank.hpp` * Added additional unit tests for `test_block_scan.hpp` * Added additional unit tests for `test_block_sort.hpp` * Added additional unit tests for `test_block_store.hpp` +* Added missing `rank_keys_desc` with `digit_extractor` parameter for `block_radix_rank_match.hpp` ### Changed diff --git a/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp b/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp index f92b3bad199..5690e05bf71 100644 --- a/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp +++ b/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp @@ -269,6 +269,19 @@ class block_radix_rank_match rank_keys_impl(keys, ranks, storage.get(), begin_bit, pass_bits); } + template + ROCPRIM_DEVICE void rank_keys_desc(const Key (&keys)[ItemsPerThread], + unsigned int (&ranks)[ItemsPerThread], + storage_type& storage, + DigitExtractor digit_extractor) + { + rank_keys_impl(keys, ranks, storage.get(), + [&digit_extractor](const Key & key){ + const unsigned int digit = digit_extractor(key); + return radix_digits - 1 - digit; + }); + } + template ROCPRIM_DEVICE void rank_keys(const Key (&keys)[ItemsPerThread], unsigned int (&ranks)[ItemsPerThread], diff --git a/test/rocprim/test_block_radix_rank.hpp b/test/rocprim/test_block_radix_rank.hpp index e078fdcfad2..9e0bc35b545 100644 --- a/test/rocprim/test_block_radix_rank.hpp +++ b/test/rocprim/test_block_radix_rank.hpp @@ -59,8 +59,14 @@ static constexpr size_t n_sizes = 12; static constexpr unsigned int items_per_thread[n_sizes] = {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3}; static constexpr unsigned int rank_desc[n_sizes] = {false, false, false, false, false, false, true, true, true, true, true, true}; + // = {false, false, false, false, false, false, false, false, false, false, false, false}; +static constexpr unsigned int use_storage[n_sizes] + = {false, true, false, true, false, true, false, true, false, true, false, true}; +static constexpr unsigned int end_bits[n_sizes] + = {0x1, 0x3, 0x7, 0xf, 0x1, 0x3, 0x7, 0xf, 0x1, 0x3, 0x7, 0xf}; static constexpr unsigned int pass_start_bit[n_sizes] = {0, 0, 0, 6, 2, 1, 0, 0, 0, 1, 4, 7}; static constexpr unsigned int max_radix_bits[n_sizes] = {4, 3, 5, 3, 1, 5, 4, 2, 4, 3, 1, 2}; +static constexpr unsigned int max_radix_bits_extractor[n_sizes] = {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4}; static constexpr unsigned int pass_radix_bits[n_sizes] = {0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 1}; template +__global__ __launch_bounds__(BlockSize) void rank_kernel(const T* const items_input, + unsigned int* const ranks_output, + const bool descending, + const bool use_storage, + const unsigned int last_bits) +{ + using block_rank_type = rocprim::block_radix_rank; + using keys_exchange_type = rocprim::block_exchange; + using ranks_exchange_type = rocprim::block_exchange; + + constexpr bool warp_striped = Algorithm == rocprim::block_radix_rank_algorithm::match; + + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * items_per_block; + + ROCPRIM_SHARED_MEMORY union + { + typename keys_exchange_type::storage_type keys_exchange; + typename block_rank_type::storage_type rank; + typename ranks_exchange_type::storage_type ranks_exchange; + } storage; + + T keys[ItemsPerThread]; + unsigned int ranks[ItemsPerThread]; + + rocprim::block_load_direct_blocked(lid, items_input + block_offset, keys); + if ROCPRIM_IF_CONSTEXPR(warp_striped) + { + // block_radix_rank_match requires warp striped input and output. Instead of using + // rocprim::block_load_direct_warp_striped though, we load directly and exchange the + // values manually, as we can also test with block sizes that do not divide the hardware + // warp size that way. + keys_exchange_type().blocked_to_warp_striped(keys, keys, storage.keys_exchange); + rocprim::syncthreads(); + } + + union converter{ + T in; + uint64_t out; + }; + + if(descending) + { + if (use_storage) + block_rank_type().rank_keys_desc(keys, ranks, storage.rank, [=](const T & key){ + converter c; + c.in = key; + uint64_t out = c.out & last_bits; + return out; + }); + else + block_rank_type().rank_keys_desc(keys, ranks, [=](const T & key){ + converter c; + c.in = key; + uint64_t out = c.out & last_bits; + return out; + }); + } + else + { + if (use_storage) + block_rank_type().rank_keys(keys, ranks, storage.rank, [=](const T & key){ + converter c; + c.in = key; + uint64_t out = c.out & last_bits; + return out; + }); + else + block_rank_type().rank_keys(keys, ranks, [=](const T & key){ + converter c; + c.in = key; + uint64_t out = c.out & last_bits; + return out; + }); } if ROCPRIM_IF_CONSTEXPR(warp_striped) @@ -130,6 +234,7 @@ template void test_block_radix_rank() { @@ -141,6 +246,7 @@ void test_block_radix_rank() constexpr size_t radix_bits = RadixBits; constexpr size_t end_bit = start_bit + radix_bits; constexpr bool descending = Descending; + constexpr bool use_storage = UseStorage; constexpr rocprim::block_radix_rank_algorithm algorithm = Algorithm; const size_t grid_size = 23; @@ -205,6 +311,7 @@ void test_block_radix_rank() d_keys_input.get(), d_ranks_output.get(), descending, + use_storage, start_bit, radix_bits); HIP_CHECK(hipGetLastError()); @@ -216,6 +323,111 @@ void test_block_radix_rank() } } +template +void test_block_radix_extractor_rank() +{ + constexpr size_t block_size = BlockSize; + constexpr size_t items_per_thread = ItemsPerThread; + constexpr size_t items_per_block = block_size * items_per_thread; + constexpr size_t max_radix_bits = MaxRadixBits; + constexpr size_t end_bits = EndBits; + constexpr bool descending = Descending; + constexpr bool use_storage = UseStorage; + constexpr rocprim::block_radix_rank_algorithm algorithm = Algorithm; + + const size_t grid_size = 23; + const size_t size = items_per_block * grid_size; + + SCOPED_TRACE(testing::Message() << "with block_size = " << block_size); + SCOPED_TRACE(testing::Message() << "with items_per_thread = " << items_per_thread); + SCOPED_TRACE(testing::Message() << "with descending = " << (descending ? "true" : "false")); + SCOPED_TRACE(testing::Message() << "with max_radix_bits = " << MaxRadixBits); + SCOPED_TRACE(testing::Message() << "with grid_size = " << size); + SCOPED_TRACE(testing::Message() << "with size = " << size); + + for(size_t seed_index = 0; seed_index < number_of_runs; ++seed_index) + { + seed_type seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector keys_input + = test_utils::get_random_data_wrapped(size, + common::generate_limits::min(), + common::generate_limits::max(), + seed_value); + + + union converter{ + T in; + uint64_t out; + }; + // Calculated expected results on host + std::vector expected(size); + for(size_t i = 0; i < grid_size; ++i) + { + size_t block_offset = i * items_per_block; + + // Perform an 'argsort', which gives a sorted sequence of indices into `keys_input`. + std::vector indices(items_per_block); + std::iota(indices.begin(), indices.end(), 0); + + std::stable_sort( + indices.begin(), + indices.end(), + [&](const int& i, const int& j) + { + converter c; + c.in = keys_input[block_offset + i]; + uint64_t left = c.out & end_bits; + + c.in = keys_input[block_offset + j]; + + uint64_t right = c.out & end_bits; + + return descending ? right < left : left < right; + }); + + // Invert the sorted indices sequence to obtain the ranks. + for(size_t j = 0; j < items_per_block; ++j) + { + expected[block_offset + indices[j]] = static_cast(j); + } + } + + common::device_ptr d_keys_input(keys_input); + common::device_ptr d_ranks_output(size); + + // Running kernel + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + rank_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + d_keys_input.get(), + d_ranks_output.get(), + descending, + use_storage, + end_bits); + HIP_CHECK(hipGetLastError()); + + // Getting results to host + auto ranks_output = d_ranks_output.load(); + + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(ranks_output, expected)); + } +} + template(); } static_for::run(); } + + static void run_extractor() + { + { + SCOPED_TRACE(testing::Message() << "TestID = " << First); + test_block_radix_extractor_rank(); + } + static_for::run_extractor(); + } }; template { static void run() {} + static void run_extractor() {} }; template @@ -264,6 +494,7 @@ void test_block_radix_rank_algorithm() } static_for<0, n_sizes, type, block_size, Algorithm>::run(); + static_for<0, n_sizes, type, block_size, Algorithm>::run_extractor(); } #endif // TEST_BLOCK_RADIX_RANK_KERNELS_HPP_ From 5f39ea32d7583c2c4109c2062dea6a128b645116 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 16 Apr 2025 11:27:46 -0600 Subject: [PATCH 13/14] removed debug comments --- test/rocprim/test_block_radix_rank.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/rocprim/test_block_radix_rank.hpp b/test/rocprim/test_block_radix_rank.hpp index 9e0bc35b545..7366d72ac85 100644 --- a/test/rocprim/test_block_radix_rank.hpp +++ b/test/rocprim/test_block_radix_rank.hpp @@ -59,7 +59,6 @@ static constexpr size_t n_sizes = 12; static constexpr unsigned int items_per_thread[n_sizes] = {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3}; static constexpr unsigned int rank_desc[n_sizes] = {false, false, false, false, false, false, true, true, true, true, true, true}; - // = {false, false, false, false, false, false, false, false, false, false, false, false}; static constexpr unsigned int use_storage[n_sizes] = {false, true, false, true, false, true, false, true, false, true, false, true}; static constexpr unsigned int end_bits[n_sizes] From 9d88959b0567519e532b02a6ac1f65d8140df247 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 7 May 2025 12:31:09 -0600 Subject: [PATCH 14/14] implemented additional unit test for radix rank with prefix and count output --- test/rocprim/test_block_radix_rank.cpp.in | 13 +- test/rocprim/test_block_radix_rank.hpp | 263 +++++++++++++++++++++- 2 files changed, 272 insertions(+), 4 deletions(-) diff --git a/test/rocprim/test_block_radix_rank.cpp.in b/test/rocprim/test_block_radix_rank.cpp.in index 8e793b5f02c..656035aefd4 100644 --- a/test/rocprim/test_block_radix_rank.cpp.in +++ b/test/rocprim/test_block_radix_rank.cpp.in @@ -47,13 +47,20 @@ #if ROCPRIM_TEST_SUITE_SLICE == 0 TYPED_TEST_P(SUITE, RankBasic ) { test_block_radix_rank_algorithm(); } - REGISTER_TYPED_TEST_SUITE_P(SUITE, RankBasic); + TYPED_TEST_P(SUITE, RankBasicExtractor ) { test_block_radix_rank_extractor_algorithm(); } + TYPED_TEST_P(SUITE, RankBasicExtractorWithPrefixCount ) { test_block_radix_rank_extractor_with_prefix_count_algorithm(); } + REGISTER_TYPED_TEST_SUITE_P(SUITE, RankBasic, RankBasicExtractor, RankBasicExtractorWithPrefixCount); + #elif ROCPRIM_TEST_SUITE_SLICE == 1 TYPED_TEST_P(SUITE, RankMemoize) { test_block_radix_rank_algorithm(); } - REGISTER_TYPED_TEST_SUITE_P(SUITE, RankMemoize); + TYPED_TEST_P(SUITE, RankMemoizeExtractor) { test_block_radix_rank_extractor_algorithm(); } + TYPED_TEST_P(SUITE, RankMemoizeExtractorWithPrefixCount) { test_block_radix_rank_extractor_with_prefix_count_algorithm(); } + REGISTER_TYPED_TEST_SUITE_P(SUITE, RankMemoize, RankMemoizeExtractor, RankMemoizeExtractorWithPrefixCount); #elif ROCPRIM_TEST_SUITE_SLICE == 2 TYPED_TEST_P(SUITE, RankMatch ) { test_block_radix_rank_algorithm(); } - REGISTER_TYPED_TEST_SUITE_P(SUITE, RankMatch); + TYPED_TEST_P(SUITE, RankMatchExtractor ) { test_block_radix_rank_extractor_algorithm(); } + TYPED_TEST_P(SUITE, RankMatchExtractorWithPrefixCount ) { test_block_radix_rank_extractor_with_prefix_count_algorithm(); } + REGISTER_TYPED_TEST_SUITE_P(SUITE, RankMatch, RankMatchExtractor, RankMatchExtractorWithPrefixCount); #endif #if ROCPRIM_TEST_TYPE_SLICE == 0 diff --git a/test/rocprim/test_block_radix_rank.hpp b/test/rocprim/test_block_radix_rank.hpp index 7366d72ac85..e3b2b7b4f19 100644 --- a/test/rocprim/test_block_radix_rank.hpp +++ b/test/rocprim/test_block_radix_rank.hpp @@ -226,6 +226,76 @@ __global__ __launch_bounds__(BlockSize) void rank_kernel(const T* const ite rocprim::block_store_direct_blocked(lid, ranks_output + block_offset, ranks); } +template +__global__ __launch_bounds__(BlockSize) void rank_kernel(const T* const items_input, + unsigned int* const ranks_output, + unsigned int* prefix_output, + unsigned int* counts_output, + Extractor digit_extractor + ) +{ + using block_rank_type = rocprim::block_radix_rank; + using keys_exchange_type = rocprim::block_exchange; + using ranks_exchange_type = rocprim::block_exchange; + + constexpr bool warp_striped = Algorithm == rocprim::block_radix_rank_algorithm::match; + + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * items_per_block; + + ROCPRIM_SHARED_MEMORY union + { + typename keys_exchange_type::storage_type keys_exchange; + typename block_rank_type::storage_type rank; + typename ranks_exchange_type::storage_type ranks_exchange; + } storage; + + T keys[ItemsPerThread]; + unsigned int ranks[ItemsPerThread]; + + const unsigned int digits_per_thread = block_rank_type().digits_per_thread; + + unsigned int prefix[digits_per_thread]; + unsigned int counts[digits_per_thread]; + rocprim::block_load_direct_blocked(lid, items_input + block_offset, keys); + if ROCPRIM_IF_CONSTEXPR(warp_striped) + { + // block_radix_rank_match requires warp striped input and output. Instead of using + // rocprim::block_load_direct_warp_striped though, we load directly and exchange the + // values manually, as we can also test with block sizes that do not divide the hardware + // warp size that way. + keys_exchange_type().blocked_to_warp_striped(keys, keys, storage.keys_exchange); + rocprim::syncthreads(); + } + + block_rank_type().rank_keys(keys, ranks, storage.rank, digit_extractor, prefix, counts); + + if ROCPRIM_IF_CONSTEXPR(warp_striped) + { + // See the comment above. + rocprim::syncthreads(); + ranks_exchange_type().warp_striped_to_blocked(ranks, ranks, storage.ranks_exchange); + } + rocprim::block_store_direct_blocked(lid, ranks_output + block_offset, ranks); + + // storing count and prefix output + const size_t pc_offset = (threadIdx.x * digits_per_thread) + (blockIdx.x * (1 << RadixBits)); + + for(size_t i = 0; i < digits_per_thread; i++){ + if((threadIdx.x * digits_per_thread) + i < (1 << RadixBits)){ + prefix_output[pc_offset + i] = prefix[i]; + counts_output[pc_offset + i] = counts[i]; + } + } +} + template::run_extractor(); } + + static void run_extractor_with_prefix_count() + { + { + SCOPED_TRACE(testing::Message() << "TestID = " << First); + test_block_radix_rank_with_prefix_and_count(); + } + static_for::run_extractor_with_prefix_count(); + } }; template { static void run() {} static void run_extractor() {} + static void run_extractor_with_prefix_count() {} }; template @@ -493,7 +727,34 @@ void test_block_radix_rank_algorithm() } static_for<0, n_sizes, type, block_size, Algorithm>::run(); +} + +template +void test_block_radix_rank_extractor_algorithm() +{ + using type = typename TestFixture::params::input_type; + constexpr size_t block_size = TestFixture::params::block_size; + + if(block_size > test_utils::get_max_block_size()) + { + GTEST_SKIP(); + } + static_for<0, n_sizes, type, block_size, Algorithm>::run_extractor(); } +template +void test_block_radix_rank_extractor_with_prefix_count_algorithm() +{ + using type = typename TestFixture::params::input_type; + constexpr size_t block_size = TestFixture::params::block_size; + + if(block_size > test_utils::get_max_block_size()) + { + GTEST_SKIP(); + } + + static_for<0, n_sizes, type, block_size, Algorithm>::run_extractor_with_prefix_count(); +} + #endif // TEST_BLOCK_RADIX_RANK_KERNELS_HPP_