From d1573068806c1cba20061c78c9abfade9893bbc9 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 6 Aug 2025 14:03:40 -0700 Subject: [PATCH 1/6] ported over fixes --- .../hipcub/test_hipcub_block_radix_rank.cpp | 2 +- .../test/hipcub/test_hipcub_block_scan.cpp | 6 + .../test/hipcub/test_hipcub_device_copy.cpp | 19 +- .../test/hipcub/test_hipcub_device_for.cpp | 9 + .../test/hipcub/test_hipcub_device_memcpy.cpp | 10 + .../test/hipcub/test_hipcub_device_spmv.cpp | 121 +-- .../hipcub/test/hipcub/test_hipcub_grid.cpp | 1 + .../test/hipcub/test_hipcub_iterators.cpp | 11 + .../hipcub/test_hipcub_thread_operators.cpp | 1 + .../test/hipcub/test_hipcub_util_ptx.cpp | 716 +++++++++--------- .../hipcub/test/hipcub/test_hipcub_vector.cpp | 3 + 11 files changed, 493 insertions(+), 406 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp index cca727699d2..5e7c47188b2 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp @@ -780,4 +780,4 @@ TYPED_TEST(HipcubBlockRadixRank, BlockRadixRankMemoizeWithPrefixSumOutput) TYPED_TEST(HipcubBlockRadixRank, BlockRadixRankMatchWithPrefixSumOutput) { test_radix_rank_with_prefix_sum_output(); -} +} \ No newline at end of file diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp index 6892222fcca..f689bbff70f 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_scan.cpp @@ -2575,6 +2575,9 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveScanReduce) test_utils::assert_near(output_reductions, expected_reductions, test_utils::precision::value * block_size); + + HIP_CHECK(hipFree(device_output)); + HIP_CHECK(hipFree(device_output_reductions)); } } @@ -3416,6 +3419,9 @@ TYPED_TEST(HipcubBlockScanInputArrayTests, ExclusiveSumReduce) test_utils::assert_near(output_reductions, expected_reductions, test_utils::precision::value * block_size); + + HIP_CHECK(hipFree(device_output)); + HIP_CHECK(hipFree(device_output_reductions)); } } diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_copy.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_copy.cpp index d2192a62f72..50c91e09acb 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_copy.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_copy.cpp @@ -183,12 +183,14 @@ TYPED_TEST(DeviceBatchCopyTests, SizeAndTypeVariation) if(i < num_tlev_buffers) { size = test_utils::get_random_value(1, wlev_min_size - 1, rng()); - } else if(i < num_tlev_buffers + num_wlev_buffers) + } + else if(i < num_tlev_buffers + num_wlev_buffers) { size = test_utils::get_random_value(wlev_min_size, blev_min_size - 1, rng()); - } else + } + else { size = test_utils::get_random_value(blev_min_size, max_size, rng()); } @@ -252,7 +254,8 @@ TYPED_TEST(DeviceBatchCopyTests, SizeAndTypeVariation) { src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); - } else + } + else { src_offsets = std::vector(num_buffers); dst_offsets = std::vector(num_buffers); @@ -316,4 +319,14 @@ TYPED_TEST(DeviceBatchCopyTests, SizeAndTypeVariation) ASSERT_TRUE(test_utils::bit_equal(h_input[input_index], h_output[output_index])); } } + + // De-allocate memory. + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + + HIP_CHECK(hipFree(d_buffer_srcs)); + HIP_CHECK(hipFree(d_buffer_dsts)); + HIP_CHECK(hipFree(d_buffer_sizes)); + + HIP_CHECK(hipFree(d_temp_storage)); } diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp index a69241ffee7..39f89203128 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_for.cpp @@ -252,6 +252,7 @@ TEST(HipcubDeviceForTests, ForEachTempStore) HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipFree(d_count)); } } } @@ -492,6 +493,7 @@ TYPED_TEST(HipcubDeviceForTests, ForEachCopy) } HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_count)); } } @@ -576,6 +578,7 @@ TEST(HipcubDeviceForTests, ForEachCopyTempStore) HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipFree(d_count)); } } } @@ -654,6 +657,7 @@ TYPED_TEST(HipcubDeviceForTests, ForEachCopyN) } HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_count)); } } @@ -709,6 +713,8 @@ TEST(HipcubDeviceForTests, ForCountingIterator) // Check if have same number of odd numbers ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(h_count, expected)); + + HIP_CHECK(hipFree(d_count)); } } } @@ -761,6 +767,8 @@ TEST(HipcubDeviceForTests, ForCopyCountingIterator) // Check if have same number of odd numbers ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(h_count, expected)); + + HIP_CHECK(hipFree(d_count)); } } } @@ -840,6 +848,7 @@ TEST(HipcubDeviceForTests, ForEachCopyNTempStore) HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipFree(d_count)); } } } diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp index 338ca0ec638..fe95f2fdde0 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_memcpy.cpp @@ -323,4 +323,14 @@ TYPED_TEST(DeviceBatchMemcpyTests, SizeAndTypeVariation) ASSERT_TRUE(test_utils::bit_equal(h_input[input_index], h_output[output_index])); } } + + // De-allocate memory. + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + + HIP_CHECK(hipFree(d_buffer_srcs)); + HIP_CHECK(hipFree(d_buffer_dsts)); + HIP_CHECK(hipFree(d_buffer_sizes)); + + HIP_CHECK(hipFree(d_temp_storage)); } diff --git a/projects/hipcub/test/hipcub/test_hipcub_device_spmv.cpp b/projects/hipcub/test/hipcub/test_hipcub_device_spmv.cpp index 9dec692afcc..7fb73de3e62 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_device_spmv.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_device_spmv.cpp @@ -70,31 +70,30 @@ using HipcubDeviceSpmvTestsParams = ::testing::Types>; template -static void -generate_matrix(CooMatrix &coo_matrix, - int32_t grid2d, - int32_t grid3d, - int32_t wheel, - int32_t dense) +static void generate_matrix(CooMatrix& coo_matrix, + int32_t grid2d, + int32_t grid3d, + int32_t wheel, + int32_t dense) { - if (grid2d > 0) + if(grid2d > 0) { // Generate 2D lattice coo_matrix.InitGrid2d(grid2d, false); } - else if (grid3d > 0) + else if(grid3d > 0) { // Generate 3D lattice coo_matrix.InitGrid3d(grid3d, false); } - else if (wheel > 0) + else if(wheel > 0) { // Generate wheel graph coo_matrix.InitWheel(wheel); } - else if (dense > 0) + else if(dense > 0) { - #if 0 +#if 0 // Generate dense graph OffsetType size = 1 << 24; // 16M nnz args.GetCmdLineArgument("size", size); @@ -102,27 +101,22 @@ generate_matrix(CooMatrix &coo_matrix, OffsetType rows = size / dense; printf("dense_%d_x_%d, ", rows, dense); fflush(stdout); coo_matrix.InitDense(rows, dense); - #endif +#endif } } -template < - typename T, - typename OffsetType> -void SpmvGold( - CsrMatrix& a, - const T* vector_x, - const T* vector_y_in, - T* vector_y_out, - T alpha, - T beta) +template +void SpmvGold(CsrMatrix& a, + const T* vector_x, + const T* vector_y_in, + T* vector_y_out, + T alpha, + T beta) { - for (OffsetType row = 0; row < a.num_rows; ++row) + for(OffsetType row = 0; row < a.num_rows; ++row) { T partial = beta * vector_y_in[row]; - for (OffsetType offset = a.row_offsets[row]; - offset < a.row_offsets[row + 1]; - ++offset) + for(OffsetType offset = a.row_offsets[row]; offset < a.row_offsets[row + 1]; ++offset) { partial += alpha * a.values[offset] * vector_x[a.column_indices[offset]]; } @@ -138,8 +132,8 @@ TYPED_TEST(HipcubDeviceSpmvTests, Spmv) SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); HIP_CHECK(hipSetDevice(device_id)); - using T = typename TestFixture::value_type; - using OffsetType = int32_t; + using T = typename TestFixture::value_type; + using OffsetType = int32_t; constexpr int32_t grid_2d = TestFixture::grid_2d; constexpr int32_t grid_3d = TestFixture::grid_3d; constexpr int32_t wheel = TestFixture::wheel; @@ -160,14 +154,14 @@ TYPED_TEST(HipcubDeviceSpmvTests, Spmv) csr_matrix.FromCoo(coo_matrix); // Allocate input and output vectors - T* vector_x = new T[csr_matrix.num_cols]; - T* vector_y_in = new T[csr_matrix.num_rows]; - T* vector_y_out = new T[csr_matrix.num_rows]; + T* vector_x = new T[csr_matrix.num_cols]; + T* vector_y_in = new T[csr_matrix.num_rows]; + T* vector_y_out = new T[csr_matrix.num_rows]; - for (int col = 0; col < csr_matrix.num_cols; ++col) + for(int col = 0; col < csr_matrix.num_cols; ++col) vector_x[col] = 1.0; - for (int row = 0; row < csr_matrix.num_rows; ++row) + for(int row = 0; row < csr_matrix.num_rows; ++row) vector_y_in[row] = 1.0; // Compute reference answer @@ -176,11 +170,16 @@ TYPED_TEST(HipcubDeviceSpmvTests, Spmv) // Allocate and initialize GPU problem hipcub::DeviceSpmv::SpmvParams params; - HIP_CHECK(g_allocator.DeviceAllocate((void **) ¶ms.d_values, sizeof(T) * csr_matrix.num_nonzeros)); - HIP_CHECK(g_allocator.DeviceAllocate((void **) ¶ms.d_row_end_offsets, sizeof(OffsetType) * (csr_matrix.num_rows + 1))); - HIP_CHECK(g_allocator.DeviceAllocate((void **) ¶ms.d_column_indices, sizeof(OffsetType) * csr_matrix.num_nonzeros)); - HIP_CHECK(g_allocator.DeviceAllocate((void **) ¶ms.d_vector_x, sizeof(T) * csr_matrix.num_cols)); - HIP_CHECK(g_allocator.DeviceAllocate((void **) ¶ms.d_vector_y, sizeof(T) * csr_matrix.num_rows)); + HIP_CHECK( + g_allocator.DeviceAllocate((void**)¶ms.d_values, sizeof(T) * csr_matrix.num_nonzeros)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)¶ms.d_row_end_offsets, + sizeof(OffsetType) * (csr_matrix.num_rows + 1))); + HIP_CHECK(g_allocator.DeviceAllocate((void**)¶ms.d_column_indices, + sizeof(OffsetType) * csr_matrix.num_nonzeros)); + HIP_CHECK( + g_allocator.DeviceAllocate((void**)¶ms.d_vector_x, sizeof(T) * csr_matrix.num_cols)); + HIP_CHECK( + g_allocator.DeviceAllocate((void**)¶ms.d_vector_y, sizeof(T) * csr_matrix.num_rows)); params.num_rows = csr_matrix.num_rows; params.num_cols = csr_matrix.num_cols; @@ -188,11 +187,26 @@ TYPED_TEST(HipcubDeviceSpmvTests, Spmv) params.alpha = alpha_const; params.beta = beta_const; - HIP_CHECK(hipMemcpy(params.d_values, csr_matrix.values, sizeof(T) * csr_matrix.num_nonzeros, hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(params.d_row_end_offsets, csr_matrix.row_offsets, sizeof(OffsetType) * (csr_matrix.num_rows + 1), hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(params.d_column_indices, csr_matrix.column_indices, sizeof(OffsetType) * csr_matrix.num_nonzeros, hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(params.d_vector_x, vector_x, sizeof(T) * csr_matrix.num_cols, hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(params.d_vector_y, vector_y_in, sizeof(T) * csr_matrix.num_rows, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(params.d_values, + csr_matrix.values, + sizeof(T) * csr_matrix.num_nonzeros, + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(params.d_row_end_offsets, + csr_matrix.row_offsets, + sizeof(OffsetType) * (csr_matrix.num_rows + 1), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(params.d_column_indices, + csr_matrix.column_indices, + sizeof(OffsetType) * csr_matrix.num_nonzeros, + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(params.d_vector_x, + vector_x, + sizeof(T) * csr_matrix.num_cols, + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(params.d_vector_y, + vector_y_in, + sizeof(T) * csr_matrix.num_rows, + hipMemcpyHostToDevice)); // Allocate temporary storage size_t temp_storage_bytes = 0; @@ -217,7 +231,7 @@ TYPED_TEST(HipcubDeviceSpmvTests, Spmv) HIP_CHECK(hipDeviceSynchronize()); test_utils::GraphHelper gHelper; - if (TestFixture::use_graphs) + if(TestFixture::use_graphs) gHelper.startStreamCapture(stream); HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage, @@ -232,10 +246,13 @@ TYPED_TEST(HipcubDeviceSpmvTests, Spmv) params.num_nonzeros, stream)); - if (TestFixture::use_graphs) + if(TestFixture::use_graphs) gHelper.createAndLaunchGraph(stream); - HIP_CHECK(hipMemcpy(vector_y_in, params.d_vector_y, sizeof(T) * params.num_rows, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(vector_y_in, + params.d_vector_y, + sizeof(T) * params.num_rows, + hipMemcpyDeviceToHost)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); @@ -254,4 +271,16 @@ TYPED_TEST(HipcubDeviceSpmvTests, Spmv) gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } + + // De-allocate input and output vectors + delete[] vector_x; + delete[] vector_y_in; + delete[] vector_y_out; + + HIP_CHECK(g_allocator.DeviceFree(params.d_values)); + HIP_CHECK(g_allocator.DeviceFree(params.d_row_end_offsets)); + HIP_CHECK(g_allocator.DeviceFree(params.d_column_indices)); + HIP_CHECK(g_allocator.DeviceFree(params.d_vector_x)); + HIP_CHECK(g_allocator.DeviceFree(params.d_vector_y)); + HIP_CHECK(g_allocator.DeviceFree(d_temp_storage)); } diff --git a/projects/hipcub/test/hipcub/test_hipcub_grid.cpp b/projects/hipcub/test/hipcub/test_hipcub_grid.cpp index ef64ba2de9f..38d810ca95f 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_grid.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_grid.cpp @@ -314,5 +314,6 @@ TEST(HipcubGridTests, GridQueue) HIP_CHECK(hipFree(device_output)); HIP_CHECK(hipFree(device_output_reductions)); + HIP_CHECK(hipFree(queue_allocations)); } } diff --git a/projects/hipcub/test/hipcub/test_hipcub_iterators.cpp b/projects/hipcub/test/hipcub/test_hipcub_iterators.cpp index 14dcb1fed81..c0dea56251b 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_iterators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_iterators.cpp @@ -181,6 +181,9 @@ void iterator_test_function(IteratorType d_itr, std::vector &h_reference) ASSERT_TRUE(d_itr == h_itrs[1]); HIP_CHECK(g_allocator.DeviceFree(device_output)); + HIP_CHECK(hipFree(d_itrs)); + free(h_itrs); + } TYPED_TEST_SUITE(HipcubIteratorTests, HipcubIteratorTestsParams); @@ -404,6 +407,9 @@ TYPED_TEST(HipcubIteratorTests, TestTexObj) iterator_test_function(d_obj_itr, h_reference); + HIP_CHECK(d_obj_itr.UnbindTexture()); + HIP_CHECK(d_obj_itr2.UnbindTexture()); + HIP_CHECK(g_allocator.DeviceFree(d_data)); HIP_CHECK(g_allocator.DeviceFree(d_dummy)); } @@ -478,6 +484,9 @@ TYPED_TEST(HipcubIteratorTests, TestTexRef) iterator_test_function(d_ref_itr, h_reference); + HIP_CHECK(d_ref_itr.UnbindTexture()); + HIP_CHECK(d_ref_itr2.UnbindTexture()); + HIP_CHECK(g_allocator.DeviceFree(d_data)); HIP_CHECK(g_allocator.DeviceFree(d_dummy)); } @@ -542,6 +551,8 @@ TYPED_TEST(HipcubIteratorTests, TestTexTransform) hipcub::TransformInputIterator, TextureIteratorType>, T>(xform_itr, h_reference); + HIP_CHECK(d_tex_itr.UnbindTexture()); + HIP_CHECK(g_allocator.DeviceFree(d_data)); } } diff --git a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp index 1de4ae7199d..db351fbedf7 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_thread_operators.cpp @@ -692,6 +692,7 @@ TYPED_TEST(HipcubNCThreadOperatorsTests, ReduceByKeyOp) HIP_CHECK(hipFree(d_keys_expected)); HIP_CHECK(hipFree(d_expected)); HIP_CHECK(hipFree(d_unique_keys_expected)); + HIP_CHECK(hipFree(d_temp_storage)); } } diff --git a/projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp b/projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp index 7e806a63525..5a6b1436e78 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_util_ptx.cpp @@ -33,36 +33,31 @@ // Custom structure struct custom_notaligned { - short i; - double d; - float f; + short i; + double d; + float f; unsigned int u; - HIPCUB_HOST_DEVICE - custom_notaligned() : i(0), d(0), f(0), u(0) {}; - HIPCUB_HOST_DEVICE - ~custom_notaligned() {}; + HIPCUB_HOST_DEVICE custom_notaligned() : i(0), d(0), f(0), u(0){}; + HIPCUB_HOST_DEVICE ~custom_notaligned(){}; }; HIPCUB_HOST_DEVICE -inline bool operator==(const custom_notaligned& lhs, - const custom_notaligned& rhs) +inline bool + operator==(const custom_notaligned& lhs, const custom_notaligned& rhs) { - return lhs.i == rhs.i && lhs.d == rhs.d - && lhs.f == rhs.f &&lhs.u == rhs.u; + return lhs.i == rhs.i && lhs.d == rhs.d && lhs.f == rhs.f && lhs.u == rhs.u; } // Custom structure aligned to 16 bytes struct custom_16aligned { - int i; + int i; unsigned int u; - float f; + float f; - HIPCUB_HOST_DEVICE - custom_16aligned() {}; - HIPCUB_HOST_DEVICE - ~custom_16aligned() {}; + HIPCUB_HOST_DEVICE custom_16aligned(){}; + HIPCUB_HOST_DEVICE ~custom_16aligned(){}; } __attribute__((aligned(16))); inline HIPCUB_HOST_DEVICE @@ -75,7 +70,7 @@ bool operator==(const custom_16aligned& lhs, const custom_16aligned& rhs) template struct params { - using type = T; + using type = T; static constexpr unsigned int logical_warp_size = LogicalWarpSize; }; @@ -83,7 +78,7 @@ template class HipcubUtilPtxTests : public ::testing::Test { public: - using type = typename Params::type; + using type = typename Params::type; static constexpr unsigned int logical_warp_size = Params::logical_warp_size; }; @@ -114,15 +109,13 @@ __global__ void shuffle_up_kernel(T* data, unsigned int src_offset) { const unsigned int index = (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; - T value = data[index]; + T value = data[index]; // first_thread argument is ignored in hipCUB with rocPRIM-backend const unsigned int first_thread = 0; // Using mask is not supported in rocPRIM, so we don't test other masks const unsigned int member_mask = 0xffffffff; - value = hipcub::ShuffleUp( - value, src_offset, first_thread, member_mask - ); + value = hipcub::ShuffleUp(value, src_offset, first_thread, member_mask); data[index] = value; } @@ -133,22 +126,26 @@ TYPED_TEST(HipcubUtilPtxTests, ShuffleUp) SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); HIP_CHECK(hipSetDevice(device_id)); - using T = typename TestFixture::type; - constexpr unsigned int logical_warp_size = TestFixture::logical_warp_size; - const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; - const size_t hardware_warp_size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) ? HIPCUB_WARP_SIZE_32 : HIPCUB_WARP_SIZE_64; - const size_t size = hardware_warp_size; + using T = typename TestFixture::type; + constexpr unsigned int logical_warp_size = TestFixture::logical_warp_size; + const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; + const size_t hardware_warp_size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) + ? HIPCUB_WARP_SIZE_32 + : HIPCUB_WARP_SIZE_64; + const size_t size = hardware_warp_size; - if (logical_warp_size > current_device_warp_size) + if(logical_warp_size > current_device_warp_size) { printf("Unsupported test warp size: %u Current device warp size: %u. Skipping test\n", - logical_warp_size, current_device_warp_size); + logical_warp_size, + current_device_warp_size); GTEST_SKIP(); } - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate input @@ -159,61 +156,53 @@ TYPED_TEST(HipcubUtilPtxTests, ShuffleUp) std::vector output(input.size()); auto src_offsets = test_utils::get_random_data( - std::max(1, logical_warp_size/2), + std::max(1, logical_warp_size / 2), 1U, std::max(1, logical_warp_size - 1), - seed_value + seed_value_addition - ); + seed_value + seed_value_addition); T* device_data; - HIP_CHECK( - test_common_utils::hipMallocHelper( - &device_data, - input.size() * sizeof(typename decltype(input)::value_type) - ) - ); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_data, + input.size() * sizeof(typename decltype(input)::value_type))); for(auto src_offset : src_offsets) { SCOPED_TRACE(testing::Message() << "where src_offset = " << src_offset); // Calculate expected results on host std::vector expected(size, test_utils::convert_to_device(0)); - for(size_t i = 0; i < input.size()/logical_warp_size; i++) + for(size_t i = 0; i < input.size() / logical_warp_size; i++) { for(size_t j = 0; j < logical_warp_size; j++) { - size_t index = j + logical_warp_size * i; - auto up_index = j > src_offset-1 ? index-src_offset : index; + size_t index = j + logical_warp_size * i; + auto up_index = j > src_offset - 1 ? index - src_offset : index; expected[index] = input[up_index]; } } // Writing to device memory - HIP_CHECK( - hipMemcpy( - device_data, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_data, + input.data(), + input.size() * sizeof(T), + hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(shuffle_up_kernel), - dim3(1), dim3(hardware_warp_size), 0, 0, - device_data, src_offset - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(shuffle_up_kernel), + dim3(1), + dim3(hardware_warp_size), + 0, + 0, + device_data, + src_offset); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory - HIP_CHECK( - hipMemcpy( - output.data(), device_data, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_data, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { @@ -231,15 +220,13 @@ __global__ void shuffle_down_kernel(T* data, unsigned int src_offset) { const unsigned int index = (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; - T value = data[index]; + T value = data[index]; // last_thread argument is ignored in hipCUB with rocPRIM-backend const unsigned int last_thread = LOGICAL_WARP_THREADS - 1; // Using mask is not supported in rocPRIM, so we don't test other masks const unsigned int member_mask = 0xffffffff; - value = hipcub::ShuffleDown( - value, src_offset, last_thread, member_mask - ); + value = hipcub::ShuffleDown(value, src_offset, last_thread, member_mask); data[index] = value; } @@ -250,22 +237,26 @@ TYPED_TEST(HipcubUtilPtxTests, ShuffleDown) SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); HIP_CHECK(hipSetDevice(device_id)); - using T = typename TestFixture::type; - constexpr unsigned int logical_warp_size = TestFixture::logical_warp_size; - const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; - const size_t hardware_warp_size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) ? HIPCUB_WARP_SIZE_32 : HIPCUB_WARP_SIZE_64; - const size_t size = hardware_warp_size; + using T = typename TestFixture::type; + constexpr unsigned int logical_warp_size = TestFixture::logical_warp_size; + const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; + const size_t hardware_warp_size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) + ? HIPCUB_WARP_SIZE_32 + : HIPCUB_WARP_SIZE_64; + const size_t size = hardware_warp_size; - if (logical_warp_size > current_device_warp_size) + if(logical_warp_size > current_device_warp_size) { printf("Unsupported test warp size: %u Current device warp size: %u. Skipping test\n", - logical_warp_size, current_device_warp_size); + logical_warp_size, + current_device_warp_size); GTEST_SKIP(); } - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate input @@ -276,61 +267,54 @@ TYPED_TEST(HipcubUtilPtxTests, ShuffleDown) std::vector output(input.size()); auto src_offsets = test_utils::get_random_data( - std::max(1, logical_warp_size/2), + std::max(1, logical_warp_size / 2), 1U, std::max(1, logical_warp_size - 1), - seed_value + seed_value_addition - ); + seed_value + seed_value_addition); - T * device_data; - HIP_CHECK( - test_common_utils::hipMallocHelper( - &device_data, - input.size() * sizeof(typename decltype(input)::value_type) - ) - ); + T* device_data; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_data, + input.size() * sizeof(typename decltype(input)::value_type))); for(auto src_offset : src_offsets) { SCOPED_TRACE(testing::Message() << "where src_offset = " << src_offset); // Calculate expected results on host std::vector expected(size, test_utils::convert_to_device(0)); - for(size_t i = 0; i < input.size()/logical_warp_size; i++) + for(size_t i = 0; i < input.size() / logical_warp_size; i++) { for(size_t j = 0; j < logical_warp_size; j++) { size_t index = j + logical_warp_size * i; - auto down_index = j+src_offset < logical_warp_size ? index+src_offset : index; + auto down_index + = j + src_offset < logical_warp_size ? index + src_offset : index; expected[index] = input[down_index]; } } // Writing to device memory - HIP_CHECK( - hipMemcpy( - device_data, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_data, + input.data(), + input.size() * sizeof(T), + hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(shuffle_down_kernel), - dim3(1), dim3(hardware_warp_size), 0, 0, - device_data, src_offset - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(shuffle_down_kernel), + dim3(1), + dim3(hardware_warp_size), + 0, + 0, + device_data, + src_offset); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory - HIP_CHECK( - hipMemcpy( - output.data(), device_data, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_data, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { @@ -348,13 +332,14 @@ __global__ void shuffle_index_kernel(T* data, int* src_offsets) { const unsigned int index = (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; - T value = data[index]; + T value = data[index]; // Using mask is not supported in rocPRIM, so we don't test other masks const unsigned int member_mask = 0xffffffff; - value = hipcub::ShuffleIndex( - value, src_offsets[hipThreadIdx_x/LOGICAL_WARP_THREADS], member_mask - ); + value = hipcub::ShuffleIndex( + value, + src_offsets[hipThreadIdx_x / LOGICAL_WARP_THREADS], + member_mask); data[index] = value; } @@ -365,22 +350,26 @@ TYPED_TEST(HipcubUtilPtxTests, ShuffleIndex) SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); HIP_CHECK(hipSetDevice(device_id)); - using T = typename TestFixture::type; - constexpr unsigned int logical_warp_size = TestFixture::logical_warp_size; - const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; - const size_t hardware_warp_size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) ? HIPCUB_WARP_SIZE_32 : HIPCUB_WARP_SIZE_64; - const size_t size = hardware_warp_size; + using T = typename TestFixture::type; + constexpr unsigned int logical_warp_size = TestFixture::logical_warp_size; + const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; + const size_t hardware_warp_size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) + ? HIPCUB_WARP_SIZE_32 + : HIPCUB_WARP_SIZE_64; + const size_t size = hardware_warp_size; - if (logical_warp_size > current_device_warp_size) + if(logical_warp_size > current_device_warp_size) { printf("Unsupported test warp size: %u Current device warp size: %u. Skipping test\n", - logical_warp_size, current_device_warp_size); + logical_warp_size, + current_device_warp_size); GTEST_SKIP(); } - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate input @@ -390,73 +379,59 @@ TYPED_TEST(HipcubUtilPtxTests, ShuffleIndex) seed_value); std::vector output(input.size()); - auto src_offsets = test_utils::get_random_data( - hardware_warp_size/logical_warp_size, - 0, - std::max(1, logical_warp_size - 1), - seed_value + seed_value_addition - ); + auto src_offsets = test_utils::get_random_data(hardware_warp_size / logical_warp_size, + 0, + std::max(1, logical_warp_size - 1), + seed_value + seed_value_addition); // Calculate expected results on host std::vector expected(size, test_utils::convert_to_device(0)); - for(size_t i = 0; i < input.size()/logical_warp_size; i++) + for(size_t i = 0; i < input.size() / logical_warp_size; i++) { int src_index = src_offsets[i]; for(size_t j = 0; j < logical_warp_size; j++) { size_t index = j + logical_warp_size * i; - if(src_index >= int(logical_warp_size) || src_index < 0) src_index = index; + if(src_index >= int(logical_warp_size) || src_index < 0) + src_index = index; expected[index] = input[src_index + logical_warp_size * i]; } } // Writing to device memory - T* device_data; - int * device_src_offsets; - HIP_CHECK( - test_common_utils::hipMallocHelper( - &device_data, - input.size() * sizeof(typename decltype(input)::value_type) - ) - ); - HIP_CHECK( - test_common_utils::hipMallocHelper( - &device_src_offsets, - src_offsets.size() * sizeof(typename decltype(src_offsets)::value_type) - ) - ); - HIP_CHECK( - hipMemcpy( - device_data, input.data(), - input.size() * sizeof(typename decltype(input)::value_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - device_src_offsets, src_offsets.data(), - src_offsets.size() * sizeof(typename decltype(src_offsets)::value_type), - hipMemcpyHostToDevice - ) - ); + T* device_data; + int* device_src_offsets; + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_data, + input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_src_offsets, + src_offsets.size() * sizeof(typename decltype(src_offsets)::value_type))); + HIP_CHECK(hipMemcpy(device_data, + input.data(), + input.size() * sizeof(typename decltype(input)::value_type), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(device_src_offsets, + src_offsets.data(), + src_offsets.size() * sizeof(typename decltype(src_offsets)::value_type), + hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(shuffle_index_kernel), - dim3(1), dim3(hardware_warp_size), 0, 0, - device_data, device_src_offsets - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(shuffle_index_kernel), + dim3(1), + dim3(hardware_warp_size), + 0, + 0, + device_data, + device_src_offsets); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory - HIP_CHECK( - hipMemcpy( - output.data(), device_data, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_data, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { @@ -472,103 +447,100 @@ TYPED_TEST(HipcubUtilPtxTests, ShuffleIndex) TEST(HipcubUtilPtxTests, ShuffleUpCustomStruct) { - using T = custom_notaligned; + using T = custom_notaligned; constexpr unsigned int logical_warp_size_32 = HIPCUB_WARP_SIZE_32; constexpr unsigned int logical_warp_size_64 = HIPCUB_WARP_SIZE_64; const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; - const unsigned int logical_warp_size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) ? logical_warp_size_32 : logical_warp_size_64; - const size_t size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) ? logical_warp_size_32 : logical_warp_size_64; + const unsigned int logical_warp_size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) + ? logical_warp_size_32 + : logical_warp_size_64; + const size_t size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) ? logical_warp_size_32 + : logical_warp_size_64; - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector random_data = test_utils::get_random_data( - 4 * size, - static_cast(-100), - static_cast(100), - seed_value - ); + std::vector random_data + = test_utils::get_random_data(4 * size, + static_cast(-100), + static_cast(100), + seed_value); std::vector input(size); std::vector output(input.size()); - for(size_t i = 0; i < 4 * input.size(); i+=4) + for(size_t i = 0; i < 4 * input.size(); i += 4) { - input[i/4].i = random_data[i]; - input[i/4].d = random_data[i+1]; - input[i/4].f = random_data[i+2]; - input[i/4].u = random_data[i+3]; + input[i / 4].i = random_data[i]; + input[i / 4].d = random_data[i + 1]; + input[i / 4].f = random_data[i + 2]; + input[i / 4].u = random_data[i + 3]; } auto src_offsets = test_utils::get_random_data( - std::max(1, logical_warp_size/2), + std::max(1, logical_warp_size / 2), 1U, std::max(1, logical_warp_size - 1), - seed_value + seed_value_addition - ); + seed_value + seed_value_addition); T* device_data; - HIP_CHECK( - test_common_utils::hipMallocHelper( - &device_data, - input.size() * sizeof(typename decltype(input)::value_type) - ) - ); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_data, + input.size() * sizeof(typename decltype(input)::value_type))); for(auto src_offset : src_offsets) { // Calculate expected results on host std::vector expected(size); - for(size_t i = 0; i < input.size()/logical_warp_size; i++) + for(size_t i = 0; i < input.size() / logical_warp_size; i++) { for(size_t j = 0; j < logical_warp_size; j++) { - size_t index = j + logical_warp_size * i; - auto up_index = j > src_offset-1 ? index-src_offset : index; + size_t index = j + logical_warp_size * i; + auto up_index = j > src_offset - 1 ? index - src_offset : index; expected[index] = input[up_index]; } } // Writing to device memory - HIP_CHECK( - hipMemcpy( - device_data, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); - - if (logical_warp_size == logical_warp_size_32) + HIP_CHECK(hipMemcpy(device_data, + input.data(), + input.size() * sizeof(T), + hipMemcpyHostToDevice)); + + if(logical_warp_size == logical_warp_size_32) { // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(shuffle_up_kernel), - dim3(1), dim3(HIPCUB_WARP_SIZE_32), 0, 0, - device_data, src_offset - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(shuffle_up_kernel), + dim3(1), + dim3(HIPCUB_WARP_SIZE_32), + 0, + 0, + device_data, + src_offset); } - else if (logical_warp_size == logical_warp_size_64) + else if(logical_warp_size == logical_warp_size_64) { // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(shuffle_up_kernel), - dim3(1), dim3(HIPCUB_WARP_SIZE_64), 0, 0, - device_data, src_offset - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(shuffle_up_kernel), + dim3(1), + dim3(HIPCUB_WARP_SIZE_64), + 0, + 0, + device_data, + src_offset); } HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory - HIP_CHECK( - hipMemcpy( - output.data(), device_data, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_data, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { @@ -583,104 +555,103 @@ TEST(HipcubUtilPtxTests, ShuffleUpCustomStruct) TEST(HipcubUtilPtxTests, ShuffleUpCustomAlignedStruct) { - using T = custom_16aligned; + using T = custom_16aligned; constexpr unsigned int logical_warp_size_32 = HIPCUB_WARP_SIZE_32; constexpr unsigned int logical_warp_size_64 = HIPCUB_WARP_SIZE_64; const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; - const unsigned int hardware_warp_size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) ? HIPCUB_WARP_SIZE_32 : HIPCUB_WARP_SIZE_64; - const unsigned int logical_warp_size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) ? logical_warp_size_32 : logical_warp_size_64; - const size_t size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) ? logical_warp_size_32 : logical_warp_size_64; - - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + const unsigned int hardware_warp_size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) + ? HIPCUB_WARP_SIZE_32 + : HIPCUB_WARP_SIZE_64; + const unsigned int logical_warp_size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) + ? logical_warp_size_32 + : logical_warp_size_64; + const size_t size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) ? logical_warp_size_32 + : logical_warp_size_64; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector random_data = test_utils::get_random_data( - 3 * size, - static_cast(-100), - static_cast(100), - seed_value - ); + std::vector random_data + = test_utils::get_random_data(3 * size, + static_cast(-100), + static_cast(100), + seed_value); std::vector input(size); std::vector output(input.size()); - for(size_t i = 0; i < 3 * input.size(); i+=3) + for(size_t i = 0; i < 3 * input.size(); i += 3) { - input[i/3].i = random_data[i]; - input[i/3].u = random_data[i+1]; - input[i/3].f = random_data[i+2]; + input[i / 3].i = random_data[i]; + input[i / 3].u = random_data[i + 1]; + input[i / 3].f = random_data[i + 2]; } auto src_offsets = test_utils::get_random_data( - std::max(1, logical_warp_size/2), + std::max(1, logical_warp_size / 2), 1U, std::max(1, logical_warp_size - 1), - seed_value + seed_value_addition - ); + seed_value + seed_value_addition); T* device_data; - HIP_CHECK( - test_common_utils::hipMallocHelper( - &device_data, - input.size() * sizeof(typename decltype(input)::value_type) - ) - ); + HIP_CHECK(test_common_utils::hipMallocHelper( + &device_data, + input.size() * sizeof(typename decltype(input)::value_type))); for(auto src_offset : src_offsets) { // Calculate expected results on host std::vector expected(size); - for(size_t i = 0; i < input.size()/logical_warp_size; i++) + for(size_t i = 0; i < input.size() / logical_warp_size; i++) { for(size_t j = 0; j < logical_warp_size; j++) { - size_t index = j + logical_warp_size * i; - auto up_index = j > src_offset-1 ? index-src_offset : index; + size_t index = j + logical_warp_size * i; + auto up_index = j > src_offset - 1 ? index - src_offset : index; expected[index] = input[up_index]; } } // Writing to device memory - HIP_CHECK( - hipMemcpy( - device_data, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); - - if (logical_warp_size == logical_warp_size_32) + HIP_CHECK(hipMemcpy(device_data, + input.data(), + input.size() * sizeof(T), + hipMemcpyHostToDevice)); + + if(logical_warp_size == logical_warp_size_32) { // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(shuffle_up_kernel), - dim3(1), dim3(hardware_warp_size), 0, 0, - device_data, src_offset - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(shuffle_up_kernel), + dim3(1), + dim3(hardware_warp_size), + 0, + 0, + device_data, + src_offset); } - else if (logical_warp_size == logical_warp_size_64) + else if(logical_warp_size == logical_warp_size_64) { // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(shuffle_up_kernel), - dim3(1), dim3(hardware_warp_size), 0, 0, - device_data, src_offset - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(shuffle_up_kernel), + dim3(1), + dim3(hardware_warp_size), + 0, + 0, + device_data, + src_offset); } HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory - HIP_CHECK( - hipMemcpy( - output.data(), device_data, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_data, + output.size() * sizeof(T), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { @@ -697,52 +668,50 @@ __global__ void warp_id_kernel(unsigned int* output) { const unsigned int index = (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; - output[index] = hipcub::WarpId(); + output[index] = hipcub::WarpId(); } TEST(HipcubUtilPtxTests, WarpId) { const unsigned int current_device_warp_size = HIPCUB_HOST_WARP_THREADS; - const unsigned int hardware_warp_size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) ? HIPCUB_WARP_SIZE_32 : HIPCUB_WARP_SIZE_64; - const size_t block_size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) ? 4 * HIPCUB_WARP_SIZE_32 : 4 * HIPCUB_WARP_SIZE_64; - const size_t size = 16 * block_size; + const unsigned int hardware_warp_size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) + ? HIPCUB_WARP_SIZE_32 + : HIPCUB_WARP_SIZE_64; + const size_t block_size = (current_device_warp_size == HIPCUB_WARP_SIZE_32) + ? 4 * HIPCUB_WARP_SIZE_32 + : 4 * HIPCUB_WARP_SIZE_64; + const size_t size = 16 * block_size; std::vector output(size); - unsigned int* device_output; + unsigned int* device_output; HIP_CHECK( - test_common_utils::hipMallocHelper( - &device_output, - output.size() * sizeof(unsigned int) - ) - ); + test_common_utils::hipMallocHelper(&device_output, output.size() * sizeof(unsigned int))); // Launching kernel - hipLaunchKernelGGL( - warp_id_kernel, - dim3(size/block_size), dim3(block_size), 0, 0, - device_output - ); + hipLaunchKernelGGL(warp_id_kernel, + dim3(size / block_size), + dim3(block_size), + 0, + 0, + device_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory - HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(unsigned int), - hipMemcpyDeviceToHost - ) - ); - - std::vector warp_ids(block_size/hardware_warp_size, 0); - for(size_t i = 0; i < output.size()/hardware_warp_size; i++) + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(unsigned int), + hipMemcpyDeviceToHost)); + + std::vector warp_ids(block_size / hardware_warp_size, 0); + for(size_t i = 0; i < output.size() / hardware_warp_size; i++) { auto prev = output[i * hardware_warp_size]; for(size_t j = 0; j < hardware_warp_size; j++) { auto index = j + i * hardware_warp_size; // less than number of warps in thread block - ASSERT_LT(output[index], block_size/hardware_warp_size); + ASSERT_LT(output[index], block_size / hardware_warp_size); ASSERT_GE(output[index], 0U); // > 0 ASSERT_EQ(output[index], prev); // all in warp_ids in warp are the same } @@ -751,146 +720,181 @@ TEST(HipcubUtilPtxTests, WarpId) // Check if each warp_id appears the same number of times. for(auto warp_id_no : warp_ids) { - ASSERT_EQ(warp_id_no, size/block_size); + ASSERT_EQ(warp_id_no, size / block_size); } + + HIP_CHECK(hipFree(device_output)); } -enum class TestStatus : uint8_t { +enum class TestStatus : uint8_t +{ Failed = 0, Passed = 1 }; -std::ostream& operator<<(std::ostream& lhs, TestStatus rhs) { - switch(rhs) { - case TestStatus::Failed: - return lhs << "F"; - case TestStatus::Passed: - return lhs << "P"; +std::ostream& operator<<(std::ostream& lhs, TestStatus rhs) +{ + switch(rhs) + { + case TestStatus::Failed: return lhs << "F"; + case TestStatus::Passed: return lhs << "P"; } return lhs << "Unknown(" << static_cast(rhs) << ")"; } -HIPCUB_DEVICE bool is_lane_in_mask(const uint64_t mask, const unsigned int lane) { +HIPCUB_DEVICE +bool is_lane_in_mask(const uint64_t mask, const unsigned int lane) +{ return (uint64_t(1) << lane) & mask; } -template +template HIPCUB_DEVICE std::enable_if_t<(HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize), TestStatus> -test_warp_mask_pow_two() { + test_warp_mask_pow_two() +{ const unsigned int logical_warp_id = hipcub::LaneId() / LogicalWarpSize; - const uint64_t mask = hipcub::WarpMask(logical_warp_id); + const uint64_t mask = hipcub::WarpMask(logical_warp_id); - const unsigned int warp_start = logical_warp_id * LogicalWarpSize; + const unsigned int warp_start = logical_warp_id * LogicalWarpSize; const unsigned int next_warp_start = (logical_warp_id + 1) * LogicalWarpSize; - for (unsigned int lane = 0; lane < warp_start; ++lane) { - if(is_lane_in_mask(mask, lane)) { + for(unsigned int lane = 0; lane < warp_start; ++lane) + { + if(is_lane_in_mask(mask, lane)) + { return TestStatus::Failed; } } - for (unsigned int lane = warp_start; lane < next_warp_start; ++lane) { - if(!is_lane_in_mask(mask, lane)) { + for(unsigned int lane = warp_start; lane < next_warp_start; ++lane) + { + if(!is_lane_in_mask(mask, lane)) + { return TestStatus::Failed; } } - for (unsigned int lane = next_warp_start; lane < 64; ++lane) { - if(is_lane_in_mask(mask, lane)) { + for(unsigned int lane = next_warp_start; lane < 64; ++lane) + { + if(is_lane_in_mask(mask, lane)) + { return TestStatus::Failed; } } return TestStatus::Passed; } -template +template HIPCUB_DEVICE std::enable_if_t= LogicalWarpSize), TestStatus> -test_warp_mask_pow_two() { + test_warp_mask_pow_two() +{ return TestStatus::Passed; } -template +template HIPCUB_DEVICE std::enable_if_t<(HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize), TestStatus> -test_warp_mask_non_pow_two() { + test_warp_mask_non_pow_two() +{ const unsigned int logical_warp_id = hipcub::LaneId() / LogicalWarpSize; - const uint64_t mask = hipcub::WarpMask(logical_warp_id); + const uint64_t mask = hipcub::WarpMask(logical_warp_id); - for (unsigned int lane = 0; lane < LogicalWarpSize; ++lane) { - if(!is_lane_in_mask(mask, lane)) { + for(unsigned int lane = 0; lane < LogicalWarpSize; ++lane) + { + if(!is_lane_in_mask(mask, lane)) + { return TestStatus::Failed; } } - for (unsigned int lane = LogicalWarpSize; lane < 64; ++lane) { - if(is_lane_in_mask(mask, lane)) { + for(unsigned int lane = LogicalWarpSize; lane < 64; ++lane) + { + if(is_lane_in_mask(mask, lane)) + { return TestStatus::Failed; } } return TestStatus::Passed; } -template +template HIPCUB_DEVICE std::enable_if_t= LogicalWarpSize), TestStatus> -test_warp_mask_non_pow_two() { + test_warp_mask_non_pow_two() +{ return TestStatus::Passed; } template -__global__ void device_test_warp_mask(TestStatus* statuses) { +__global__ +void device_test_warp_mask(TestStatus* statuses) +{ constexpr bool is_power_of_two = test_utils::is_power_of_two(LogicalWarpSize); - statuses[threadIdx.x] = is_power_of_two - ? test_warp_mask_pow_two() - : test_warp_mask_non_pow_two(); + statuses[threadIdx.x] = is_power_of_two ? test_warp_mask_pow_two() + : test_warp_mask_non_pow_two(); } -template -void test_warp_size(std::vector& statuses, TestStatus* d_statuses, const unsigned int device_warp_size) { - if(LogicalWarpSize > device_warp_size) { +template +void test_warp_size(std::vector& statuses, + TestStatus* d_statuses, + const unsigned int device_warp_size) +{ + if(LogicalWarpSize > device_warp_size) + { return; } statuses.clear(); statuses.insert(statuses.begin(), device_warp_size, TestStatus::Failed); - HIP_CHECK(hipMemcpy(d_statuses, statuses.data(), + HIP_CHECK(hipMemcpy(d_statuses, + statuses.data(), statuses.size() * sizeof(statuses[0]), hipMemcpyHostToDevice)); - hipLaunchKernelGGL(device_test_warp_mask, dim3(1), - dim3(device_warp_size), 0, 0, d_statuses); + hipLaunchKernelGGL(device_test_warp_mask, + dim3(1), + dim3(device_warp_size), + 0, + 0, + d_statuses); HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipMemcpy(statuses.data(), d_statuses, + HIP_CHECK(hipMemcpy(statuses.data(), + d_statuses, statuses.size() * sizeof(statuses[0]), hipMemcpyDeviceToHost)); SCOPED_TRACE(testing::Message() << "where LogicalWarpSize = " << LogicalWarpSize); - ASSERT_TRUE(std::all_of( - statuses.begin(), statuses.end(), - [](const TestStatus status) { return status == TestStatus::Passed; })); + ASSERT_TRUE(std::all_of(statuses.begin(), + statuses.end(), + [](const TestStatus status) { return status == TestStatus::Passed; })); } template -void test_all_warp_sizes(std::integer_sequence) { +void test_all_warp_sizes(std::integer_sequence) +{ const int device_warp_size = HIPCUB_HOST_WARP_THREADS; ASSERT_GT(device_warp_size, 0); SCOPED_TRACE(testing::Message() << "where device warp size = " << device_warp_size); TestStatus* d_statuses = nullptr; - HIP_CHECK(test_common_utils::hipMallocHelper( - &d_statuses, static_cast(device_warp_size))); + HIP_CHECK( + test_common_utils::hipMallocHelper(&d_statuses, static_cast(device_warp_size))); // Call the test with each logical warp size in the range [1, 64] - auto statuses = std::vector(); - const auto ignore = - {(test_warp_size(statuses, d_statuses, static_cast(device_warp_size)), 0)...}; + auto statuses = std::vector(); + const auto ignore + = {(test_warp_size(statuses, + d_statuses, + static_cast(device_warp_size)), + 0)...}; static_cast(ignore); HIP_CHECK(hipFree(d_statuses)); } -TEST(HipcubUtilPtxTests, WarpMask) { +TEST(HipcubUtilPtxTests, WarpMask) +{ using sequence = std::make_integer_sequence; test_all_warp_sizes(sequence{}); } diff --git a/projects/hipcub/test/hipcub/test_hipcub_vector.cpp b/projects/hipcub/test/hipcub/test_hipcub_vector.cpp index fdf6eb16521..9317be257cc 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_vector.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_vector.cpp @@ -130,6 +130,9 @@ void run_vector_test() expected_num)); } } + + HIP_CHECK(hipFree(device_input)); + HIP_CHECK(hipFree(device_output)); } TYPED_TEST(HipcubVector, Vector1) From d5d51131bb3d52b74daa369f00aa642ae0a1b1a4 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 6 Aug 2025 21:09:55 +0000 Subject: [PATCH 2/6] added back missing newlines --- projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp index 5e7c47188b2..cca727699d2 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_rank.cpp @@ -780,4 +780,4 @@ TYPED_TEST(HipcubBlockRadixRank, BlockRadixRankMemoizeWithPrefixSumOutput) TYPED_TEST(HipcubBlockRadixRank, BlockRadixRankMatchWithPrefixSumOutput) { test_radix_rank_with_prefix_sum_output(); -} \ No newline at end of file +} diff --git a/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp b/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp index 3992a082bb1..5a3e311b278 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_block_radix_sort.cpp @@ -665,4 +665,4 @@ TYPED_TEST(HipcubBlockRadixSort, SortKeysValues) HIP_CHECK(hipFree(device_keys_output)); HIP_CHECK(hipFree(device_values_output)); } -} \ No newline at end of file +} From 9a47b5e27b17bcb036ea8f9547b3bfbc5fbc83c9 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 6 Aug 2025 21:32:36 +0000 Subject: [PATCH 3/6] disabled test hipcub caching device when using valgrind --- .../test/hipcub/test_hipcub_caching_device_allocator.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/projects/hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp b/projects/hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp index 73b310a1d6c..1ce0490d16a 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp @@ -30,6 +30,7 @@ #include "common_test_header.hpp" #include "hipcub/util_allocator.hpp" +#include __global__ void EmptyKernel() { } @@ -37,6 +38,13 @@ __global__ void EmptyKernel() { } TEST(HipcubCachingDeviceAllocatorTests, Test1) { + // This test is very timing sensitive. Valgrind signigficantly slows down + // kernel execution and therefore messes up the test. If valgrind is being + // used we should disable this test + if (RUNNING_ON_VALGRIND) { + GTEST_SKIP() << "Skipping test under Valgrind"; + } + // Get number of GPUs and current GPU int num_gpus; int initial_gpu; From 44a35585aee1db5ea989128dfb5d571a5d0be391 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 6 Aug 2025 22:18:29 +0000 Subject: [PATCH 4/6] changed from using valgrind to cstdlib --- .../test/hipcub/test_hipcub_caching_device_allocator.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp b/projects/hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp index 1ce0490d16a..e228e68007a 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp @@ -30,7 +30,7 @@ #include "common_test_header.hpp" #include "hipcub/util_allocator.hpp" -#include +#include __global__ void EmptyKernel() { } @@ -41,7 +41,7 @@ TEST(HipcubCachingDeviceAllocatorTests, Test1) // This test is very timing sensitive. Valgrind signigficantly slows down // kernel execution and therefore messes up the test. If valgrind is being // used we should disable this test - if (RUNNING_ON_VALGRIND) { + if (std::getenv("UNDER_VALGRIND")) { GTEST_SKIP() << "Skipping test under Valgrind"; } From 045c5ba684a3acecaec63145bfdb415637ab9be1 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Thu, 7 Aug 2025 08:52:06 -0700 Subject: [PATCH 5/6] switched back to using valgrind header, but with a check for header now --- .../test_hipcub_caching_device_allocator.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp b/projects/hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp index e228e68007a..dc801d939cc 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp @@ -32,18 +32,28 @@ #include "hipcub/util_allocator.hpp" #include +#if __has_include() + #include + #define HAS_VALGRIND_H 1 +#else + #define HAS_VALGRIND_H 0 +#endif + __global__ void EmptyKernel() { } // Hipified test/test_allocator.cu TEST(HipcubCachingDeviceAllocatorTests, Test1) { - // This test is very timing sensitive. Valgrind signigficantly slows down - // kernel execution and therefore messes up the test. If valgrind is being - // used we should disable this test - if (std::getenv("UNDER_VALGRIND")) { + +#if HAS_VALGRIND_H + // This test is very timing sensitive. Valgrind significantly slows down + // kernel execution and therefore messes up the timing of the test. + // If valgrind is being used we should disable this test otherwise it will fail + if (RUNNING_ON_VALGRIND) { GTEST_SKIP() << "Skipping test under Valgrind"; } +#endif //HAS_VALGRIND_H // Get number of GPUs and current GPU int num_gpus; From a103d98656798e1930370e72ed980b249c60b824 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Thu, 28 Aug 2025 17:32:32 +0000 Subject: [PATCH 6/6] updated copyright --- .../hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp | 2 +- projects/hipcub/test/hipcub/test_hipcub_grid.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp b/projects/hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp index f6caca095f1..b09991fe19f 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_caching_device_allocator.cpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2025, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/projects/hipcub/test/hipcub/test_hipcub_grid.cpp b/projects/hipcub/test/hipcub/test_hipcub_grid.cpp index 244ae217dde..33552dfac97 100644 --- a/projects/hipcub/test/hipcub/test_hipcub_grid.cpp +++ b/projects/hipcub/test/hipcub/test_hipcub_grid.cpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2024, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2025, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: