diff --git a/CHANGELOG.md b/CHANGELOG.md index 7001c9ecb..592347b3f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,14 @@ Documentation for rocRAND is available at ### Added * gfx950 support +* Additional unit tests for `test_log_normal_distribution.cpp` +* Additional unit tests for `test_normal_distribution.cpp` +* Additional unit tests for `test_uniform_distribution.cpp` +* Additional unit tests for `test_rocrand_threefry2x32_20_prng.cpp` +* Additional unit tests for `test_rocrand_threefry2x64_20_prng.cpp` +* Additional unit tests for `test_rocrand_threefry4x32_20_prng.cpp` +* Additional unit tests for `test_rocrand_threefry4x64_20_prng.cpp` +* New unit tests for `include/rocrand/rocrand_discrete.h` in `test_discrete_distribution.cpp` ### Changed diff --git a/test/internal/test_discrete_distribution.cpp b/test/internal/test_discrete_distribution.cpp new file mode 100644 index 000000000..16ec00104 --- /dev/null +++ b/test/internal/test_discrete_distribution.cpp @@ -0,0 +1,678 @@ +// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include +#include + +#include + +#include +#include +#include + +#define HIP_CHECK(cmd) \ + do \ + { \ + auto error = (cmd); \ + if(error != hipSuccess) \ + { \ + std::cerr << "Encountered HIP error (" << hipGetErrorString(error) << ") at line " \ + << __LINE__ << " in file " << __FILE__ << "\n"; \ + exit(-1); \ + } \ + } while(0) \ + +#define ROCRAND_CHECK(cmd) \ + do \ + { \ + auto status = cmd; \ + if(status != 0){ \ + std::cerr << "Encountered ROCRAND error: " << status << "at line" \ + << __LINE__ << " in file " << __FILE__ << "\n"; \ + exit(-1); \ + } \ + } while(0) \ + + + +struct GlobalSizes { + static constexpr size_t items_per_thread = 256; + static constexpr size_t block_size = 32; + static constexpr size_t items_per_block = items_per_thread * block_size; + static constexpr size_t grid_size = 1234; + static constexpr size_t size = grid_size * items_per_block; +}; + +using DiscreteDataType = ::testing::Types; + +template +class InternalDiscreteDistributionTests : public ::testing::Test{ + public: + using T = DT; +}; + +TYPED_TEST_SUITE(InternalDiscreteDistributionTests, DiscreteDataType); + +template +__global__ void internal_discrete_kernel(T * device_input, unsigned int * device_output, rocrand_discrete_distribution_st &dis, const DiscreteFunc & f){ + const size_t items_per_block = GlobalSizes::items_per_thread * GlobalSizes::block_size; + const size_t offset = (items_per_block * blockIdx.x) + (GlobalSizes::items_per_thread * threadIdx.x); + + for(size_t i = 0; i < GlobalSizes::items_per_thread; i++){ + device_output[offset + i] = f(device_input[offset + i], dis); + } +} + +template +void run_internal_discrete_tests(const DiscreteFunc & f){ + std::vector> all_distributions = { + {10, 10, 10, 10}, + {1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1}, + {1234, 1677, 1519, 1032, 561, 254, 98, 33, 10, 2}, + {1, 2, 8, 4, 3, 2, 1} + }; + + std::random_device rd; + std::mt19937 gen(rd()); + + T * host_input = new T[GlobalSizes::size]; + unsigned int * host_output = new unsigned int[GlobalSizes::size]; + + // Check for different types of data input and generate the input data + if constexpr (std::is_same_v){ + std::uniform_real_distribution dis(0, 1); + for(size_t i = 0; i < GlobalSizes::size; i++) host_input[i] = dis(gen); + } + else if constexpr(std::is_same_v || std::is_same_v){ + std::uniform_int_distribution dis(0, std::numeric_limits::max()); + for(size_t i = 0; i < GlobalSizes::size; i++) host_input[i] = dis(gen); + } + else{ + std::uniform_int_distribution dis(0, std::numeric_limits::max()); + for(size_t i = 0; i < GlobalSizes::size; i++) host_input[i] = dis(gen); + } + + T * device_input; + unsigned int * device_output; + + HIP_CHECK(hipMalloc(&device_input, sizeof(T) * GlobalSizes::size)); + HIP_CHECK(hipMalloc(&device_output, sizeof(unsigned int) * GlobalSizes::size)); + + HIP_CHECK(hipMemcpy(device_input, host_input, sizeof(T) * GlobalSizes::size, hipMemcpyHostToDevice)); + + // Generate different discrete distributions and check them against expected + for(std::vector distribution : all_distributions){ + // Getting expected Results + double sum = std::accumulate(distribution.begin(), distribution.end(), 0); + std::vector expected_prob(distribution.size()); + for(size_t i = 0; i < distribution.size(); i++) + expected_prob[i] = distribution[i] / sum; + + // Creating the discrete distribution + rocrand_discrete_distribution discrete_distribution; + ROCRAND_CHECK(rocrand_create_discrete_distribution(expected_prob.data(), expected_prob.size(), 0, &discrete_distribution)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(internal_discrete_kernel), + dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0, + device_input, device_output, *discrete_distribution, f + ); + + HIP_CHECK(hipMemcpy(host_output, device_output, sizeof(unsigned int) * GlobalSizes::size, hipMemcpyDeviceToHost)); + + std::vector histogram(distribution.size()); + + // Calculating the actual results + for(size_t i = 0; i < GlobalSizes::size; i++) + histogram[host_output[i]]++; + + std::vector actual_prob(distribution.size()); + for(size_t i = 0; i < actual_prob.size(); i++) + actual_prob[i] = histogram[i] / static_cast(GlobalSizes::size); + + // If the original probability is bigger than 5% then expected should be within 1% difference. + // Otherwise it should be within 0.01 + for(size_t i = 0; i < expected_prob.size(); i++){ + double eps = expected_prob[i] > 0.05 ? expected_prob[i] * 0.01 : 0.01; + ASSERT_NEAR(expected_prob[i], actual_prob[i], eps); + } + + ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution)); + } + + delete [] host_input; + delete [] host_output; + + HIP_CHECK(hipFree(device_input)); + HIP_CHECK(hipFree(device_output)); +} + +TYPED_TEST(InternalDiscreteDistributionTests, InternalDiscreteAliasTest){ + using T = TestFixture::T; + run_internal_discrete_tests( + [=] __device__(T val, rocrand_discrete_distribution_st & dis) + {return rocrand_device::detail::discrete_alias(val, dis);} + ); +} + +TYPED_TEST(InternalDiscreteDistributionTests, InternalDiscreteCDFTest){ + using T = TestFixture::T; + run_internal_discrete_tests( + [=] __device__(T val, rocrand_discrete_distribution_st & dis) + {return rocrand_device::detail::discrete_cdf(val, dis);} + ); +} + +template +__global__ void block_wide_external_discrete_kernel( + RocRandPrngType * states, + unsigned int * device_output, + rocrand_discrete_distribution_st & dis, + size_t items_per_thread, + size_t block_size +){ + const size_t items_per_block = items_per_thread * block_size; + const size_t offset = (items_per_block * blockIdx.x) + (items_per_thread * threadIdx.x); + + for(size_t i = 0; i < items_per_thread; i++){ + __shared__ RocRandPrngType state; + if(threadIdx.x == 0) + state = states[blockIdx.x]; + __syncthreads(); + + device_output[offset + i] = rocrand_discrete(&state, &dis); + + if(threadIdx.x == 0) + states[blockIdx.x] = state; + } +} + +template +__global__ void external_discrete_kernel( + RocRandPrngType * states, + unsigned int * device_output, + rocrand_discrete_distribution_st & dis, + size_t items_per_thread, + size_t block_size +){ + const size_t items_per_block = items_per_thread * block_size; + const size_t offset = (items_per_block * blockIdx.x) + (items_per_thread * threadIdx.x); + + for(size_t i = 0; i < items_per_thread; i++){ + auto local_state = states[offset + i]; + device_output[offset + i] = rocrand_discrete(&local_state, &dis); + states[offset + i] = local_state; + } +} + +template +void run_external_discrete_tests( + PrngState & device_states, + size_t items_per_thread = GlobalSizes::items_per_thread, + size_t block_size = GlobalSizes::block_size, + size_t grid_size = GlobalSizes::grid_size, + size_t size = GlobalSizes::size +){ + + std::vector> all_distributions = { + {10, 10, 10, 10}, + {1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1}, + {1234, 1677, 1519, 1032, 561, 254, 98, 33, 10, 2}, + {1, 2, 8, 4, 3, 2, 1} + }; + + unsigned int * host_output = new unsigned int[size]; + unsigned int * device_output; + HIP_CHECK(hipMalloc(&device_output, sizeof(unsigned int) * size)); + + for(std::vector distribution : all_distributions){ + + // Getting expected Results + double sum = std::accumulate(distribution.begin(), distribution.end(), 0); + std::vector expected_prob(distribution.size()); + for(size_t i = 0; i < distribution.size(); i++) + expected_prob[i] = distribution[i] / sum; + + // Creating the discrete distribution + rocrand_discrete_distribution discrete_distribution; + ROCRAND_CHECK(rocrand_create_discrete_distribution(expected_prob.data(), expected_prob.size(), 0, &discrete_distribution)); + + if constexpr(block_wide){ + hipLaunchKernelGGL( + HIP_KERNEL_NAME(block_wide_external_discrete_kernel), + dim3(grid_size), dim3(block_size), 0, 0, + device_states, device_output, *discrete_distribution, items_per_thread, block_size + ); + } + else{ + hipLaunchKernelGGL( + HIP_KERNEL_NAME(external_discrete_kernel), + dim3(grid_size), dim3(block_size), 0, 0, + device_states, device_output, *discrete_distribution, items_per_thread, block_size + ); + } + + HIP_CHECK(hipMemcpy(host_output, device_output, sizeof(unsigned int) * size, hipMemcpyDeviceToHost)); + std::vector histogram(distribution.size()); + + // Calculating the actual results + for(size_t i = 0; i < size; i++) + histogram[host_output[i]]++; + + std::vector actual_prob(distribution.size()); + for(size_t i = 0; i < actual_prob.size(); i++) + actual_prob[i] = histogram[i] / static_cast(size); + + // If the original probability is bigger than 5% then expected should be within 1% difference. + // Otherwise it should be within 0.01 + for(size_t i = 0; i < expected_prob.size(); i++){ + double eps = expected_prob[i] > 0.05 ? expected_prob[i] * 0.01 : 0.01; + ASSERT_NEAR(expected_prob[i], actual_prob[i], eps); + } + + ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution)); + } + + delete [] host_output; + HIP_CHECK(hipFree(device_output)); +} + +template +__global__ void init_rocrand_states_kernel(RocRandPrngType * states, const InitFunc & f){ + constexpr size_t items_per_block = GlobalSizes::items_per_thread * GlobalSizes::block_size; + const size_t offset = (items_per_block * blockIdx.x) + (GlobalSizes::items_per_thread * threadIdx.x); + + for(size_t i = 0; i < GlobalSizes::items_per_thread; i++) + f(i, offset, &states[offset + i]); + // rocrand_init((123456 ^ i), offset + i, 0, &states[offset + i]); +} + +TEST(ExternalDiscreteDistributionTests, Philox4x32_10Test){ + // Initialize the prng state + rocrand_state_philox4x32_10 * device_states; + HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_philox4x32_10) * GlobalSizes::size)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(init_rocrand_states_kernel), + dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0, + device_states, + [] __device__ (size_t index, size_t offset, rocrand_state_philox4x32_10 * state){ + rocrand_init((123456 ^ index), offset + index, 0, state); + } + ); + + run_external_discrete_tests(device_states); + + HIP_CHECK(hipFree(device_states)); +} + +TEST(ExternalDiscreteDistributionTests, Mrg31k3pTest){ + // Initialize the prng state + rocrand_state_mrg31k3p * device_states; + HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_mrg31k3p) * GlobalSizes::size)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(init_rocrand_states_kernel), + dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0, + device_states, + [] __device__ (size_t index, size_t offset, rocrand_state_mrg31k3p * state){ + rocrand_init((123456 ^ index), offset + index, 0, state); + } + ); + + run_external_discrete_tests(device_states); + + HIP_CHECK(hipFree(device_states)); +} + +TEST(ExternalDiscreteDistributionTests, Mrg32k3aTest){ + // Initialize the prng state + rocrand_state_mrg32k3a * device_states; + HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_mrg32k3a) * GlobalSizes::size)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(init_rocrand_states_kernel), + dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0, + device_states, + [] __device__ (size_t index, size_t offset, rocrand_state_mrg32k3a * state){ + rocrand_init((123456 ^ index), offset + index, 0, state); + } + ); + + run_external_discrete_tests(device_states); + + HIP_CHECK(hipFree(device_states)); +} + +TEST(ExternalDiscreteDistributionTests, XorwowTest){ + // Initialize the prng state + rocrand_state_xorwow * device_states; + HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_xorwow) * GlobalSizes::size)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(init_rocrand_states_kernel), + dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0, + device_states, + [] __device__ (size_t index, size_t offset, rocrand_state_xorwow * state){ + rocrand_init((123456 ^ index), offset + index, 0, state); + } + ); + + run_external_discrete_tests(device_states); + + HIP_CHECK(hipFree(device_states)); +} + +TEST(ExternalDiscreteDistributionTests, Mtgp32Test){ + constexpr size_t items_per_thread = 1024; + constexpr size_t block_size = 256; + constexpr size_t grid_size = 12; + constexpr size_t items_per_block = block_size * items_per_thread; + + constexpr size_t test_size = items_per_block * grid_size; + rocrand_state_mtgp32 * states; + + HIP_CHECK(hipMalloc(&states, sizeof(rocrand_state_mtgp32) * grid_size)); + rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, grid_size, 123456); + HIP_CHECK(hipDeviceSynchronize()); + + run_external_discrete_tests( + states, + items_per_thread, + block_size, + grid_size, + test_size + ); + HIP_CHECK(hipFree(states)); +} + +TEST(ExternalDiscreteDistributionTests, Lfsr113Test){ + // Initialize the prng state + rocrand_state_lfsr113 * device_states; + HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_lfsr113) * GlobalSizes::size)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(init_rocrand_states_kernel), + dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0, + device_states, + [] __device__ (size_t index, size_t offset, rocrand_state_lfsr113 * state){ + rocrand_init( + { + (123456 ^ index), + (123456 ^ index) << 1, + (123456 ^ index) << 2, + (123456 ^ index) << 3 + }, + offset + index, + 0, + state + ); + } + ); + + run_external_discrete_tests(device_states); + + HIP_CHECK(hipFree(device_states)); +} + +TEST(ExternalDiscreteDistributionTests, Sobol32Test){ + // Initialize the prng state + rocrand_state_sobol32 * host_states = new rocrand_state_sobol32[GlobalSizes::size]; + const unsigned int* directions; + ROCRAND_CHECK(rocrand_get_direction_vectors32(&directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6)); + + // 640000 is the size of directions. This is to prevent overflow stuff + for(size_t i = 0; i < GlobalSizes::size; i++) + rocrand_init(directions, i % 640000, host_states + i); + + rocrand_state_sobol32 * device_states; + HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_sobol32) * GlobalSizes::size)); + HIP_CHECK(hipMemcpy(device_states, host_states, sizeof(rocrand_state_sobol32) * GlobalSizes::size, hipMemcpyHostToDevice)); + + + run_external_discrete_tests(device_states); + + delete [] host_states; + HIP_CHECK(hipFree(device_states)); +} + +TEST(ExternalDiscreteDistributionTests, ScrambledSobol32Test){ + // Initialize the prng state + rocrand_state_scrambled_sobol32 * host_states = new rocrand_state_scrambled_sobol32[GlobalSizes::size]; + const unsigned int* directions; + ROCRAND_CHECK(rocrand_get_direction_vectors32(&directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6)); + + // 640000 is the size of directions. This is to prevent overflow stuff + for(size_t i = 0; i < GlobalSizes::size; i++) + rocrand_init(directions, 123456 ^ i, i % 640000, host_states + i); + + rocrand_state_scrambled_sobol32 * device_states; + HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_scrambled_sobol32) * GlobalSizes::size)); + HIP_CHECK(hipMemcpy(device_states, host_states, sizeof(rocrand_state_scrambled_sobol32) * GlobalSizes::size, hipMemcpyHostToDevice)); + + + run_external_discrete_tests(device_states); + + delete [] host_states; + HIP_CHECK(hipFree(device_states)); +} + +TEST(ExternalDiscreteDistributionTests, Sobol64Test){ + // Initialize the prng state + rocrand_state_sobol64 * host_states = new rocrand_state_sobol64[GlobalSizes::size]; + const unsigned long long* directions; + ROCRAND_CHECK(rocrand_get_direction_vectors64(&directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6)); + + // 1280000 is the size of directions. This is to prevent overflow stuff + for(size_t i = 0; i < GlobalSizes::size; i++) + rocrand_init(directions, i % 1280000, host_states + i); + + rocrand_state_sobol64 * device_states; + HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_sobol64) * GlobalSizes::size)); + HIP_CHECK(hipMemcpy(device_states, host_states, sizeof(rocrand_state_sobol64) * GlobalSizes::size, hipMemcpyHostToDevice)); + + + run_external_discrete_tests(device_states); + + delete [] host_states; + HIP_CHECK(hipFree(device_states)); +} + +TEST(ExternalDiscreteDistributionTests, ScrambledSobol64Test){ + // Initialize the prng state + rocrand_state_scrambled_sobol64 * host_states = new rocrand_state_scrambled_sobol64[GlobalSizes::size]; + const unsigned long long* directions; + ROCRAND_CHECK(rocrand_get_direction_vectors64(&directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6)); + + // 1280000 is the size of directions. This is to prevent overflow stuff + for(size_t i = 0; i < GlobalSizes::size; i++) + rocrand_init(directions, 123456 ^ i, i % 1280000, host_states + i); + + rocrand_state_scrambled_sobol64 * device_states; + HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_scrambled_sobol64) * GlobalSizes::size)); + HIP_CHECK(hipMemcpy(device_states, host_states, sizeof(rocrand_state_scrambled_sobol64) * GlobalSizes::size, hipMemcpyHostToDevice)); + + + run_external_discrete_tests(device_states); + + delete [] host_states; + HIP_CHECK(hipFree(device_states)); +} + +TEST(ExternalDiscreteDistributionTests, Threefry2x32_20Test){ + // Initialize the prng state + rocrand_state_threefry2x32_20 * device_states; + HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_threefry2x32_20) * GlobalSizes::size)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(init_rocrand_states_kernel), + dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0, + device_states, + [] __device__ (size_t index, size_t offset, rocrand_state_threefry2x32_20 * state){ + rocrand_init((123456 ^ index), offset + index, 0, state); + } + ); + + run_external_discrete_tests(device_states); + + HIP_CHECK(hipFree(device_states)); +} + +TEST(ExternalDiscreteDistributionTests, Threefry2x64_20Test){ + // Initialize the prng state + rocrand_state_threefry2x64_20 * device_states; + HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_threefry2x64_20) * GlobalSizes::size)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(init_rocrand_states_kernel), + dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0, + device_states, + [] __device__ (size_t index, size_t offset, rocrand_state_threefry2x64_20 * state){ + rocrand_init((123456 ^ index), offset + index, 0, state); + } + ); + + run_external_discrete_tests(device_states); + + HIP_CHECK(hipFree(device_states)); +} + +TEST(ExternalDiscreteDistributionTests, Threefry4x32_20Test){ + // Initialize the prng state + rocrand_state_threefry4x32_20 * device_states; + HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_threefry4x32_20) * GlobalSizes::size)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(init_rocrand_states_kernel), + dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0, + device_states, + [] __device__ (size_t index, size_t offset, rocrand_state_threefry4x32_20 * state){ + rocrand_init((123456 ^ index), offset + index, 0, state); + } + ); + + run_external_discrete_tests(device_states); + + HIP_CHECK(hipFree(device_states)); +} + +TEST(ExternalDiscreteDistributionTests, Threefry4x64_20Test){ + // Initialize the prng state + rocrand_state_threefry4x64_20 * device_states; + HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_threefry4x64_20) * GlobalSizes::size)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(init_rocrand_states_kernel), + dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0, + device_states, + [] __device__ (size_t index, size_t offset, rocrand_state_threefry4x64_20 * state){ + rocrand_init((123456 ^ index), offset + index, 0, state); + } + ); + + run_external_discrete_tests(device_states); + + HIP_CHECK(hipFree(device_states)); +} + +template +__global__ void uint4_kernel(rocrand_state_philox4x32_10 * states, uint4 * device_output, rocrand_discrete_distribution_st & dis){ + const size_t items_per_block = items_per_thread * block_size; + const size_t offset = (items_per_block * blockIdx.x) + (items_per_thread * threadIdx.x); + + for(size_t i = 0; i < items_per_thread; i++){ + auto local_state = states[offset + i]; + device_output[offset + i] = rocrand_discrete4(&local_state, &dis); + states[offset + i] = local_state; + } +} + +TEST(ExternalDiscreteDistributionTests, Philox4x32_10WithUIN4OutputTest) +{ + // Initialize the prng state + rocrand_state_philox4x32_10 * device_states; + HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_philox4x32_10) * GlobalSizes::size)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(init_rocrand_states_kernel), + dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0, + device_states, + [] __device__ (size_t index, size_t offset, rocrand_state_philox4x32_10 * state){ + rocrand_init((123456 ^ index), offset + index, 0, state); + } + ); + + std::vector> all_distributions = { + {10, 10, 10, 10}, + {1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1}, + {1234, 1677, 1519, 1032, 561, 254, 98, 33, 10, 2}, + {1, 2, 8, 4, 3, 2, 1} + }; + + uint4 * host_output = new uint4[GlobalSizes::size]; + uint4 * device_output; + HIP_CHECK(hipMalloc(&device_output, sizeof(uint4) * GlobalSizes::size)); + + for(std::vector distribution : all_distributions){ + + // Getting expected Results + double sum = std::accumulate(distribution.begin(), distribution.end(), 0); + std::vector expected_prob(distribution.size()); + for(size_t i = 0; i < distribution.size(); i++) + expected_prob[i] = distribution[i] / sum; + + // Creating the discrete distribution + rocrand_discrete_distribution discrete_distribution; + ROCRAND_CHECK(rocrand_create_discrete_distribution(expected_prob.data(), expected_prob.size(), 0, &discrete_distribution)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(uint4_kernel), + dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0, + device_states, device_output, * discrete_distribution + ); + + HIP_CHECK(hipMemcpy(host_output, device_output, sizeof(uint4) * GlobalSizes::size, hipMemcpyDeviceToHost)); + std::vector histogram(distribution.size()); + + // Calculating the actual results + for(size_t i = 0; i < GlobalSizes::size; i++){ + histogram[host_output[i].w]++; + histogram[host_output[i].x]++; + histogram[host_output[i].y]++; + histogram[host_output[i].z]++; + } + + std::vector actual_prob(distribution.size()); + for(size_t i = 0; i < actual_prob.size(); i++) + actual_prob[i] = histogram[i] / static_cast(GlobalSizes::size * 4); + + // If the original probability is bigger than 5% then expected should be within 1% difference. + // Otherwise it should be within 0.01 + for(size_t i = 0; i < expected_prob.size(); i++){ + double eps = expected_prob[i] > 0.05 ? expected_prob[i] * 0.01 : 0.01; + ASSERT_NEAR(expected_prob[i], actual_prob[i], eps); + } + + ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution)); + } + + delete [] host_output; + HIP_CHECK(hipFree(device_output)); +} diff --git a/test/internal/test_log_normal_distribution.cpp b/test/internal/test_log_normal_distribution.cpp index efdf0f411..40a297422 100644 --- a/test/internal/test_log_normal_distribution.cpp +++ b/test/internal/test_log_normal_distribution.cpp @@ -24,6 +24,9 @@ #include #include +#include + +#define HIP_CHECK(state) ASSERT_EQ(state, hipSuccess) using namespace rocrand_impl::host; @@ -403,11 +406,874 @@ TYPED_TEST(sobol_log_normal_distribution_tests, half_test) std += std::pow(__half2float(val[i]) - mean, 2); } std = std::sqrt(std / size); - + float expected_mean = std::exp(0.2f + 0.5f * 0.5f / 2); float expected_std - = std::sqrt((std::exp(0.5f * 0.5f) - 1.0) * std::exp(2 * 0.2f + 0.5f * 0.5f)); - + = std::sqrt((std::exp(0.5f * 0.5f) - 1.0) * std::exp(2 * 0.2f + 0.5f * 0.5f)); + EXPECT_NEAR(expected_mean, mean, expected_mean * 0.1f); EXPECT_NEAR(expected_std, std, expected_std * 0.1f); } + + +template +struct StatesLND{ + template + void run_test(const FuncCall & f, size_t testSize = 4000000){ + double iMean = 0; + double iStd = 1; + + float * output = new float [testSize]; + OutType out; + + double mean = 0; + + for(size_t i = 0; i <= testSize; i += 4){ + f(out, iMean, iStd); + + output[i] = out.w; + output[i + 1] = out.x; + output[i + 2] = out.y; + output[i + 3] = out.z; + mean += out.w + out.x + out.y + out.z; + } + + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(output[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double eMean = std::exp(iMean + (iStd * iStd) / 2); + double eStd = std::sqrt(std::log(1 + (iStd * iStd)/(iMean * iMean))); + ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1; + ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1; + } +}; + +TEST(log_normal_distribution_tests, philox4x32_10_test){ + rocrand_state_philox4x32_10 states; + rocrand_init(123456, 654321, 0, &states); + + StatesLND testFloat; + StatesLND testDouble; + + #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE + testFloat.run_test()( + [&] __host__ __device__ (float4 & output, float mean, float std){ + output = { + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std), + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std) + }; + ) + testDouble.run_test()( + [&] __host__ __device__ (double4 & output, double mean, double std){ + output = { + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std), + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std) + }; + } + ) + #endif // ROCRAND_DETAIL_BM_NOT_IN_STATE + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + float2 o1 = rocrand_log_normal2(&states, mean, std); + float2 o2 = rocrand_log_normal2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + output = rocrand_log_normal4(&states, mean, std); + } + ); + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + double2 o1 = rocrand_log_normal_double2(&states, mean, std); + double2 o2 = rocrand_log_normal_double2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + output = rocrand_log_normal_double4(&states, mean, std); + } + ); +} + +TEST(log_normal_distribution_tests, mrg31k3p_test){ + rocrand_state_mrg31k3p states; + rocrand_init(123456, 654321, 0, &states); + + StatesLND testFloat; + StatesLND testDouble; + + #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE + testFloat.run_test()( + [&] __host__ __device__ (float4 & output, float mean, float std){ + output = { + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std), + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std) + }; + ) + testDouble.run_test()( + [&] __host__ __device__ (double4 & output, double mean, double std){ + output = { + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std), + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std) + }; + } + ) + #endif // ROCRAND_DETAIL_BM_NOT_IN_STATE + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + float2 o1 = rocrand_log_normal2(&states, mean, std); + float2 o2 = rocrand_log_normal2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + double2 o1 = rocrand_log_normal_double2(&states, mean, std); + double2 o2 = rocrand_log_normal_double2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); +} + +TEST(log_normal_distribution_tests, mrg32k3a_test){ + rocrand_state_mrg32k3a states; + rocrand_init(123456, 654321, 0, &states); + + StatesLND testFloat; + StatesLND testDouble; + + #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE + testFloat.run_test()( + [&] __host__ __device__ (float4 & output, float mean, float std){ + output = { + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std), + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std) + }; + ) + testDouble.run_test()( + [&] __host__ __device__ (double4 & output, double mean, double std){ + output = { + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std), + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std) + }; + } + ) + #endif // ROCRAND_DETAIL_BM_NOT_IN_STATE + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + float2 o1 = rocrand_log_normal2(&states, mean, std); + float2 o2 = rocrand_log_normal2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + double2 o1 = rocrand_log_normal_double2(&states, mean, std); + double2 o2 = rocrand_log_normal_double2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); +} + +TEST(log_normal_distribution_tests, xorwow_test){ + rocrand_state_xorwow states; + rocrand_init(123456, 654321, 0, &states); + + StatesLND testFloat; + StatesLND testDouble; + + #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE + testFloat.run_test()( + [&] __host__ __device__ (float4 & output, float mean, float std){ + output = { + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std), + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std) + }; + ) + testDouble.run_test()( + [&] __host__ __device__ (double4 & output, double mean, double std){ + output = { + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std), + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std) + }; + } + ) + #endif // ROCRAND_DETAIL_BM_NOT_IN_STATE + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + float2 o1 = rocrand_log_normal2(&states, mean, std); + float2 o2 = rocrand_log_normal2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + double2 o1 = rocrand_log_normal_double2(&states, mean, std); + double2 o2 = rocrand_log_normal_double2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); +} + +TEST(log_normal_distribution_tests, sobol32_test){ + rocrand_state_sobol32 states; + const unsigned int* directions; + HIP_CHECK(rocrand_get_direction_vectors32(&directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6)); + rocrand_init(directions, 0, &states); + + StatesLND testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + output = { + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std), + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std) + }; + } + ); + + StatesLND testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + output = { + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std), + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std) + }; + } + ); +} + +TEST(log_normal_distribution_tests, scrambled_sobol32_test){ + rocrand_state_scrambled_sobol32 states; + const unsigned int* directions; + HIP_CHECK(rocrand_get_direction_vectors32(&directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6)); + rocrand_init(directions, 123456, 0, &states); + + StatesLND testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + output = { + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std), + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std) + }; + } + ); + + StatesLND testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + output = { + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std), + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std) + }; + } + ); +} + +TEST(log_normal_distribution_tests, sobol64_test){ + rocrand_state_sobol64 states; + const unsigned long long* directions; + HIP_CHECK(rocrand_get_direction_vectors64(&directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6)); + rocrand_init(directions, 0, &states); + + StatesLND testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + output = { + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std), + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std) + }; + } + ); + + StatesLND testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + output = { + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std), + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std) + }; + } + ); +} + +TEST(log_normal_distribution_tests, scrambled_sobol64_test){ + rocrand_state_scrambled_sobol64 states; + const unsigned long long* directions; + HIP_CHECK(rocrand_get_direction_vectors64(&directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6)); + rocrand_init(directions, 123456, 0, &states); + + StatesLND testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + output = { + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std), + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std) + }; + } + ); + + StatesLND testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + output = { + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std), + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std) + }; + } + ); +} + +TEST(log_normal_distribution_tests, lfsr113_test){ + rocrand_state_lfsr113 states; + rocrand_init(static_cast(12), 0, &states); + + StatesLND testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + output = { + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std), + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std) + }; + } + ); + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + float2 o1 = rocrand_log_normal2(&states, mean, std); + float2 o2 = rocrand_log_normal2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); + + StatesLND testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + output = { + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std), + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std) + }; + } + ); + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + double2 o1 = rocrand_log_normal_double2(&states, mean, std); + double2 o2 = rocrand_log_normal_double2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); +} + +TEST(log_normal_distribution_tests, threefry2x32_20_test){ + rocrand_state_threefry2x32_20 states; + rocrand_init(123456, 654321, 0, &states); + + StatesLND testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + output = { + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std), + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std) + }; + } + ); + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + float2 o1 = rocrand_log_normal2(&states, mean, std); + float2 o2 = rocrand_log_normal2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); + + StatesLND testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + output = { + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std), + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std) + }; + } + ); + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + double2 o1 = rocrand_log_normal_double2(&states, mean, std); + double2 o2 = rocrand_log_normal_double2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); +} + +TEST(log_normal_distribution_tests, threefry2x64_20_test){ + rocrand_state_threefry2x64_20 states; + rocrand_init(123456, 654321, 0, &states); + + StatesLND testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + output = { + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std), + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std) + }; + } + ); + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + float2 o1 = rocrand_log_normal2(&states, mean, std); + float2 o2 = rocrand_log_normal2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); + + StatesLND testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + output = { + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std), + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std) + }; + } + ); + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + double2 o1 = rocrand_log_normal_double2(&states, mean, std); + double2 o2 = rocrand_log_normal_double2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); +} + +TEST(log_normal_distribution_tests, threefry4x32_20_test){ + rocrand_state_threefry4x32_20 states; + rocrand_init(123456, 654321, 0, &states); + + StatesLND testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + output = { + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std), + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std) + }; + } + ); + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + float2 o1 = rocrand_log_normal2(&states, mean, std); + float2 o2 = rocrand_log_normal2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); + + StatesLND testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + output = { + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std), + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std) + }; + } + ); + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + double2 o1 = rocrand_log_normal_double2(&states, mean, std); + double2 o2 = rocrand_log_normal_double2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); +} + +TEST(log_normal_distribution_tests, threefry4x64_20_test){ + rocrand_state_threefry4x64_20 states; + rocrand_init(123456, 654321, 0, &states); + + StatesLND testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + output = { + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std), + rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std) + }; + } + ); + + testFloat.run_test( + [&] __host__ __device__ (float4 & output, float mean, float std){ + float2 o1 = rocrand_log_normal2(&states, mean, std); + float2 o2 = rocrand_log_normal2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); + + StatesLND testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + output = { + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std), + rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std) + }; + } + ); + + testDouble.run_test( + [&] __host__ __device__ (double4 & output, double mean, double std){ + double2 o1 = rocrand_log_normal_double2(&states, mean, std); + double2 o2 = rocrand_log_normal_double2(&states, mean, std); + output = { + o1.x, o2.x, o1.y, o2.y + }; + } + ); +} + +template +__global__ void mtgp32_kernel (rocrand_state_mtgp32 * states, T * output, const size_t N, const pType mean, const pType std ,const LNDFunction & f){ + const unsigned int state_id = blockIdx.x; + const unsigned int thread_id = threadIdx.x; + unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; + + if(index >= N) + return; + + __shared__ rocrand_state_mtgp32 state; + if(thread_id == 0) + state = states[state_id]; + __syncthreads(); + + output[index] = f(&state, mean, std); + + if(thread_id == 0) + states[state_id] = state; +} + +TEST(log_normal_distribution_tests, float_mtgp32_test){ + size_t testSize = 40192; + size_t threads = 256; + size_t blocks = std::ceil(static_cast(testSize) / static_cast(threads)); + + float iMean = 0; + float iStd = 1; + + rocrand_state_mtgp32 * states; + size_t state_size = blocks, seed = 654321; + HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32))); + rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, state_size, seed); + + float * hOut = new float[testSize]; + float * dOut; + HIP_CHECK(hipMalloc(&dOut, sizeof(float) * testSize)); + HIP_CHECK(hipDeviceSynchronize()); + + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(mtgp32_kernel), + dim3(blocks), + dim3(threads), + 0, + 0, + states, + dOut, + testSize, + iMean, + iStd, + + [] __device__ (rocrand_state_mtgp32 * state, float mean, float std){ + return rocrand_log_normal(state, mean, std); + } + ); + + HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(float) * testSize, hipMemcpyDeviceToHost)); + + double mean = 0.0; + for(size_t i = 0; i < testSize; i++) + mean += hOut[i]; + + + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(hOut[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double eMean = std::exp(iMean + (iStd * iStd) / 2); + double eStd = std::sqrt(std::log(1 + (iStd * iStd)/(iMean * iMean))); + + ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1; + ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1; + + HIP_CHECK(hipFree(states)); + HIP_CHECK(hipFree(dOut)); + + delete [] hOut; +} + +TEST(log_normal_distribution_tests, float2_mtgp32_test){ + size_t testSize = 40192; + size_t threads = 256; + size_t blocks = std::ceil(static_cast(testSize) / static_cast(threads)); + + float iMean = 0; + float iStd = 1; + + rocrand_state_mtgp32 * states; + size_t state_size = blocks, seed = 654321; + HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32))); + rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, state_size, seed); + + float2 * hOut = new float2[testSize]; + float2 * dOut; + HIP_CHECK(hipMalloc(&dOut, sizeof(float2) * testSize)); + HIP_CHECK(hipDeviceSynchronize()); + + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(mtgp32_kernel), + dim3(blocks), + dim3(threads), + 0, + 0, + states, + dOut, + testSize, + iMean, + iStd, + [] __device__ (rocrand_state_mtgp32 * state, float mean, float std){ + return rocrand_log_normal2(state, mean, std); + } + ); + + HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(float2) * testSize, hipMemcpyDeviceToHost)); + + double mean = 0.0; + for(size_t i = 0; i < testSize; i++){ + mean += hOut[i].x; + mean += hOut[i].y; + } + + + mean /= (testSize * 2); + + double std = 0.0; + for(size_t i = 0; i < testSize; i++){ + std += std::pow(hOut[i].x - mean, 2); + std += std::pow(hOut[i].y - mean, 2); + } + + std = std::sqrt(std / (testSize * 2)); + + double eMean = std::exp(iMean + (iStd * iStd) / 2); + double eStd = std::sqrt(std::log(1 + (iStd * iStd)/(iMean * iMean))); + + ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1; + ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1; + + HIP_CHECK(hipFree(states)); + HIP_CHECK(hipFree(dOut)); + + delete [] hOut; +} + +TEST(log_normal_distribution_tests, double_mtgp32_test){ + size_t testSize = 40192; + size_t threads = 256; + size_t blocks = std::ceil(static_cast(testSize) / static_cast(threads)); + + double iMean = 0; + double iStd = 1; + + rocrand_state_mtgp32 * states; + size_t state_size = blocks, seed = 654321; + HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32))); + rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, state_size, seed); + + double * hOut = new double[testSize]; + double * dOut; + HIP_CHECK(hipMalloc(&dOut, sizeof(double) * testSize)); + HIP_CHECK(hipDeviceSynchronize()); + + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(mtgp32_kernel), + dim3(blocks), + dim3(threads), + 0, + 0, + states, + dOut, + testSize, + iMean, + iStd, + + [] __device__ (rocrand_state_mtgp32 * state, double mean, double std){ + return rocrand_log_normal_double(state, mean, std); + } + ); + + HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(double) * testSize, hipMemcpyDeviceToHost)); + + double mean = 0.0; + for(size_t i = 0; i < testSize; i++) + mean += hOut[i]; + + + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(hOut[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double eMean = std::exp(iMean + (iStd * iStd) / 2); + double eStd = std::sqrt(std::log(1 + (iStd * iStd)/(iMean * iMean))); + + ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1; + ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1; + + HIP_CHECK(hipFree(states)); + HIP_CHECK(hipFree(dOut)); + + delete [] hOut; +} + +TEST(log_normal_distribution_tests, double2_mtgp32_test){ + size_t testSize = 40192; + size_t threads = 256; + size_t blocks = std::ceil(static_cast(testSize) / static_cast(threads)); + + double iMean = 0; + double iStd = 1; + + rocrand_state_mtgp32 * states; + size_t state_size = blocks, seed = 654321; + HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32))); + rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, state_size, seed); + + double2 * hOut = new double2[testSize]; + double2 * dOut; + HIP_CHECK(hipMalloc(&dOut, sizeof(double2) * testSize)); + HIP_CHECK(hipDeviceSynchronize()); + + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(mtgp32_kernel), + dim3(blocks), + dim3(threads), + 0, + 0, + states, + dOut, + testSize, + iMean, + iStd, + [] __device__ (rocrand_state_mtgp32 * state, double mean, double std){ + return rocrand_log_normal_double2(state, mean, std); + } + ); + + HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(double2) * testSize, hipMemcpyDeviceToHost)); + + double mean = 0.0; + for(size_t i = 0; i < testSize; i++){ + mean += hOut[i].x; + mean += hOut[i].y; + } + + + mean /= (testSize * 2); + + double std = 0.0; + for(size_t i = 0; i < testSize; i++){ + std += std::pow(hOut[i].x - mean, 2); + std += std::pow(hOut[i].y - mean, 2); + } + + std = std::sqrt(std / (testSize * 2)); + + double eMean = std::exp(iMean + (iStd * iStd) / 2); + double eStd = std::sqrt(std::log(1 + (iStd * iStd)/(iMean * iMean))); + + ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1; + ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1; + + HIP_CHECK(hipFree(states)); + HIP_CHECK(hipFree(dOut)); + + delete [] hOut; +} diff --git a/test/internal/test_normal_distribution.cpp b/test/internal/test_normal_distribution.cpp index b1226fd64..f7fb8b46e 100644 --- a/test/internal/test_normal_distribution.cpp +++ b/test/internal/test_normal_distribution.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -24,7 +24,9 @@ #include #include +#include +#define HIP_CHECK(state) ASSERT_EQ(state, hipSuccess) using namespace rocrand_impl::host; TEST(normal_distribution_tests, float_test) @@ -103,6 +105,215 @@ TEST(normal_distribution_tests, double_test) EXPECT_NEAR(5.0, std, 1.0); // 20% } +TEST(normal_distribution_tests, float_out_uint2_in_test) +{ + + struct nd + { + const float mean; + const float stddev; + + nd(float mean, float stddev) : mean(mean), stddev(stddev) {} + + __forceinline__ __host__ __device__ + void operator()(const uint2 &input, float (&output)[2]) const + { + float2 v = rocrand_device::detail::normal_distribution2(input); + output[0] = mean + v.x * stddev; + output[1] = mean + v.y * stddev; + } + }; + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis; + + const size_t size = 4000; + float val[size]; + nd u(2.0, 5.0); + + // Calculate mean + double mean = 0.0; + for(size_t i = 0; i < size; i += 2) + { + uint2 input; + float output[2]; + input.x = dis(gen); + input.y = dis(gen); + u(input, output); + val[i] = output[0]; + val[i + 1] = output[1]; + mean += (output[0] + output[1]); + } + mean /= size; + + // Calculate stddev + double std = 0.0; + for(size_t i = 0; i < size; i++) + { + std += std::pow(val[i] - mean, 2); + } + std = std::sqrt(std / size); + + EXPECT_NEAR(2.0, mean, 0.4); // 20% + EXPECT_NEAR(5.0, std, 1.0); // 20% +} + + +TEST(normal_distribution_tests, float2_out_longlong_in_test) +{ + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis; + + const size_t size = 4000; + float val[size]; + normal_distribution u(2, 5); + + // Calculate mean + double mean = 0.0; + for(size_t i = 0; i < size; i += 2) + { + unsigned long long input[1]; + float output[2]; + unsigned long long l = static_cast(dis(gen)); + unsigned long long r = static_cast(dis(gen)); + input[0] = (l << 32) | r; + u(input, output); + val[i] = output[0]; + val[i + 1] = output[1]; + mean += (output[0] + output[1]); + } + mean /= size; + + // Calculate stddev + double std = 0.0; + for(size_t i = 0; i < size; i++) + { + std += std::pow(val[i] - mean, 2) / size; + } + std = std::sqrt(std); + + EXPECT_NEAR(2, mean, 0.4) << "Mean: " << mean << " Expected: " << 2; // 20% + EXPECT_NEAR(5, std, 1.0) << "Stddev: " << std << " Expected: " << 5; // 20% +} + +TEST(normal_distribution_tests, float4_outputs){ + struct normal_distribution_float_4_out{ + const float mean; + const float stddev; + + normal_distribution_float_4_out(float mean, float stddev) : mean(mean), stddev(stddev) {} + + __forceinline__ __host__ __device__ + void uint4_in(const uint4 &input, float (&output)[4]) const + { + float4 v = rocrand_device::detail::normal_distribution4(input); + output[0] = mean + v.w * stddev; + output[1] = mean + v.x * stddev; + output[2] = mean + v.y * stddev; + output[3] = mean + v.z * stddev; + } + + __forceinline__ __host__ __device__ + void longlong2_in(const longlong2 &input, float (&output)[4]) const + { + float4 v = rocrand_device::detail::normal_distribution4(input); + output[0] = mean + v.w * stddev; + output[1] = mean + v.x * stddev; + output[2] = mean + v.y * stddev; + output[3] = mean + v.z * stddev; + } + + __forceinline__ __host__ __device__ + void ull_2_in(const unsigned long long (&input)[2], float (&output)[4]) const + { + float4 v = rocrand_device::detail::normal_distribution4(input[0], input[1]); + output[0] = mean + v.w * stddev; + output[1] = mean + v.x * stddev; + output[2] = mean + v.y * stddev; + output[3] = mean + v.z * stddev; + } + }; + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis; + + const size_t size = 4000; + float vull2[size], vll2[size], vui4[size]; + normal_distribution_float_4_out u(2, 5); + + // Calculate mean + double mui4 = 0, mll2 = 0, mull2 = 0; + for(size_t i = 0; i < size; i += 4) + { + unsigned long long ull2[2]; + longlong2 ll2; + uint4 ui4; + float output[4]; + unsigned long long l = static_cast(dis(gen)); + unsigned long long r = static_cast(dis(gen)); + + unsigned long long in1 = (l << 32) | r; + + ull2[0] = in1; + ll2.x = in1; + ui4.w = l; ui4.x = r; + + l = static_cast(dis(gen)); + r = static_cast(dis(gen)); + unsigned long long in2 = (l << 32) | r; + + ull2[1] = in2; + ll2.y = in2; + ui4.y = l; ui4.z = r; + + u.uint4_in(ui4, output); + vui4[i] = output[0]; + vui4[i + 1] = output[1]; + vui4[i + 2] = output[2]; + vui4[i + 3] = output[3]; + mui4 += (output[0] + output[1] + output[2] + output[3]) / size; + + u.longlong2_in(ll2, output); + vll2[i] = output[0]; + vll2[i + 1] = output[1]; + vll2[i + 2] = output[2]; + vll2[i + 3] = output[3]; + mll2 += (output[0] + output[1] + output[2] + output[3]) / size; + + u.ull_2_in(ull2, output); + vull2[i] = output[0]; + vull2[i + 1] = output[1]; + vull2[i + 2] = output[2]; + vull2[i + 3] = output[3]; + mull2 += (output[0] + output[1] + output[2] + output[3]) / size; + } + + double sui4 = 0, sll2 = 0, sull2 = 0; + for(size_t i = 0; i < size; i++) + { + sui4 += std::pow(vui4[i] - mui4, 2); + sll2 += std::pow(vll2[i] - mll2, 2); + sull2 += std::pow(vull2[i] - mull2, 2); + } + + sui4 = sqrt(sui4 / size); + sll2 = sqrt(sll2 / size); + sull2 = sqrt(sull2 / size); + + EXPECT_NEAR(2, mui4, 0.4) << "Mean: " << mui4 << " Expected: " << 2; // 20% + EXPECT_NEAR(5, sui4, 1.0) << "Stddev: " << sui4 << " Expected: " << 5; // 20% + + EXPECT_NEAR(2, mll2, 0.4) << "Mean: " << mll2 << " Expected: " << 2; // 20% + EXPECT_NEAR(5, sll2, 1.0) << "Stddev: " << sll2 << " Expected: " << 5; // 20% + + EXPECT_NEAR(2, mull2, 0.4) << "Mean: " << mull2 << " Expected: " << 2; // 20% + EXPECT_NEAR(5, sull2, 1.0) << "Stddev: " << sull2 << " Expected: " << 5; // 20% +} + TEST(normal_distribution_tests, half_test) { std::random_device rd; @@ -375,3 +586,552 @@ TEST(sobol_normal_distribution_tests, half_test) EXPECT_NEAR(2.0f, mean, 0.4f); // 20% EXPECT_NEAR(5.0f, std, 1.0f); // 20% } + +template +void run_normal_dist_with_state_out1(State * states){ + double expected_mean = 2.0f, expected_std = 5.0f; + + struct single_out{ + double mean, std; + State * states; + + single_out(double mean, double std, State * states){ + this->mean = mean; + this->std = std; + this->states = states; + } + + float __forceinline__ __device__ __host__ fOp(){ + return mean + rocrand_normal(states) * std; + } + + double __forceinline__ __device__ __host__ dOp(){ + return mean + rocrand_normal_double(states) * std; + } + }; + + const size_t size = 4000; + + float fOut[size]; + double dOut[size]; + + single_out s(expected_mean, expected_std, states); + + float fMean = 0, fStd = 0; + double dMean = 0, dStd = 0; + + for(size_t i = 0; i < size; i++){ + fOut[i] = s.fOp(); + dOut[i] = s.dOp(); + + fMean += fOut[i] / size; + dMean += dOut[i] / size; + } + + for(size_t i = 0; i < size; i++){ + fStd += std::pow(fOut[i] - fMean, 2) / size; + dStd += std::pow(dOut[i] - dMean, 2) / size; + } + + fStd = std::sqrt(fStd); + dStd = std::sqrt(dStd); + + EXPECT_NEAR(expected_mean, fMean, (expected_mean * 0.2) + 1e-1) << "Mean: " << fMean << " Expected: " << expected_mean; // 20% + EXPECT_NEAR(expected_std, fStd, (expected_std * 0.2) + 1e-1) << "Stddev: " << fStd << " Expected: " << expected_std; // 20% + + EXPECT_NEAR(expected_mean, dMean, (expected_mean * 0.2) + 1e-1) << "Mean: " << dMean << " Expected: " << expected_mean; // 20% + EXPECT_NEAR(expected_std, dStd, (expected_std * 0.2) + 1e-1) << "Stddev: " << dStd << " Expected: " << expected_std; // 20% +} + +template +void run_normal_dist_with_state_out2(State * states){ + double expected_mean = 2.0f, expected_std = 5.0f; + struct nd{ + double mean, std; + State * states; + nd(double mean, double stddev, State * states) { + this->mean = mean; + this->std = stddev; + this->states = states; + } + __forceinline__ __host__ __device__ void operator()(float(&output)[2]){ + float2 v = rocrand_normal2(states); + output[0] = static_cast(mean) + v.x * static_cast(std); + output[1] = static_cast(mean) + v.y * static_cast(std); + } + + __forceinline__ __host__ __device__ void operator()(double(&output)[2]){ + double2 v = rocrand_normal_double2(states); + output[0] = mean + v.x * std; + output[1] = mean + v.y * std; + } + }; + const size_t size = 4000; + float valF[size]; + double valD[size]; + nd u(expected_mean, expected_std, states); + + double meanF = 0, meanD = 0; + for(size_t i = 0; i < size; i += 2) + { + float fOut[2]; + double dOut[2]; + + u(fOut); + valF[i] = fOut[0]; + valF[i + 1] = fOut[1]; + meanF += (fOut[0] + fOut[1]) / size; + + u(dOut); + valD[i] = dOut[0]; + valD[i + 1] = dOut[1]; + meanD += (dOut[0] + dOut[1]) / size; + } + + // Calculate stddev + double stdF = 0, stdD = 0; + for(size_t i = 0; i < size; i++){ + stdF += std::pow(valF[i] - meanF, 2) / size; + stdD += std::pow(valD[i] - meanD, 2) / size; + } + stdF = std::sqrt(stdF); + stdD = std::sqrt(stdD); + + EXPECT_NEAR(expected_mean, meanF, (expected_mean * 0.2) + 1e-1) << "Mean: " << meanF << " Expected: " << expected_mean; // 20% + EXPECT_NEAR(expected_std, stdF, (expected_std * 0.2) + 1e-1) << "Stddev: " << stdF << " Expected: " << expected_std; // 20% + + EXPECT_NEAR(expected_mean, meanD, (expected_mean * 0.2) + 1e-1) << "Mean: " << meanD << " Expected: " << expected_mean; // 20% + EXPECT_NEAR(expected_std, stdD, (expected_std * 0.2) + 1e-1) << "Stddev: " << stdD << " Expected: " << expected_std; // 20% +} + +template +void run_normal_dist_with_state_out4(State * states){ + double expected_mean = 2.0f, expected_std = 5.0f; + + struct nd{ + double mean, std; + State * states; + nd(double mean, double stddev, State * states) { + this->mean = mean; + this->std = stddev; + this->states = states; + } + __forceinline__ __host__ __device__ void operator()(float(&output)[4]){ + float4 v = rocrand_normal4(states); + output[0] = static_cast(mean) + v.w * static_cast(std); + output[1] = static_cast(mean) + v.x * static_cast(std); + output[2] = static_cast(mean) + v.y * static_cast(std); + output[3] = static_cast(mean) + v.z * static_cast(std); + } + + __forceinline__ __host__ __device__ void operator()(double(&output)[4]){ + double4 v = rocrand_normal_double4(states); + output[0] = mean + v.w * std; + output[1] = mean + v.x * std; + output[2] = mean + v.y * std; + output[3] = mean + v.z * std; + } + }; + const size_t size = 4000; + float valF[size]; + double valD[size]; + + nd u(expected_mean, expected_std, states); + + double meanF = 0, meanD = 0; + for(size_t i = 0; i < size; i += 4) + { + float fOut[4]; + double dOut[4]; + + u(fOut); + valF[i] = fOut[0]; + valF[i + 1] = fOut[1]; + valF[i + 2] = fOut[2]; + valF[i + 3] = fOut[3]; + meanF += (fOut[0] + fOut[1] + fOut[2] + fOut[3]) / size; + + u(dOut); + valD[i] = dOut[0]; + valD[i + 1] = dOut[1]; + valD[i + 2] = dOut[2]; + valD[i + 3] = dOut[3]; + meanD += (dOut[0] + dOut[1] + dOut[2] + dOut[3]) / size; + } + + // Calculate stddev + double stdF = 0, stdD = 0; + for(size_t i = 0; i < size; i++){ + stdF += std::pow(valF[i] - meanF, 2) / size; + stdD += std::pow(valD[i] - meanD, 2) / size; + } + stdF = std::sqrt(stdF); + stdD = std::sqrt(stdD); + + EXPECT_NEAR(expected_mean, meanF, (expected_mean * 0.2) + 1e-2) << "Mean: " << meanF << " Expected: " << expected_mean; // 20% + EXPECT_NEAR(expected_std, stdF, (expected_std * 0.2) + 1e-2) << "Stddev: " << stdF << " Expected: " << expected_std; // 20% + + EXPECT_NEAR(expected_mean, meanD, (expected_mean * 0.2) + 1e-2) << "Mean: " << meanD << " Expected: " << expected_mean; // 20% + EXPECT_NEAR(expected_std, stdD, (expected_std * 0.2) + 1e-2) << "Stddev: " << stdD << " Expected: " << expected_std; // 20% +} + +TEST(normal_distribution_with_states, philox4x32_10){ + rocrand_state_philox4x32_10 states; + rocrand_init(123456, 654321, 0, &states); + run_normal_dist_with_state_out2(&states); + run_normal_dist_with_state_out4(&states); +} + +TEST(normal_distribution_with_states, mrg31k3p){ + rocrand_state_mrg31k3p states; + rocrand_init(123456, 654321, 0, &states); + run_normal_dist_with_state_out2(&states); +} + +TEST(normal_distribution_with_states, mrg32k3a){ + rocrand_state_mrg32k3a states; + rocrand_init(123456, 654321, 0, &states); + run_normal_dist_with_state_out2(&states); +} + +TEST(normal_distribution_with_states, xorwow){ + rocrand_state_xorwow states; + rocrand_init(123456, 654321, 0, &states); + run_normal_dist_with_state_out2(&states); +} + +TEST(normal_distribution_with_states, sobol32){ + rocrand_state_sobol32 states; + const unsigned int* directions; + HIP_CHECK(rocrand_get_direction_vectors32(&directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6)); + + rocrand_init(directions, 0, &states); + + run_normal_dist_with_state_out1(&states); + +} + +TEST(normal_distribution_with_states, scarambled_sobol32){ + rocrand_state_scrambled_sobol32 states; + const unsigned int* directions; + HIP_CHECK(rocrand_get_direction_vectors32(&directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6)); + + rocrand_init(directions, 123456, 0, &states); + + run_normal_dist_with_state_out1(&states); +} + +TEST(normal_distribution_with_states, sobol64){ + rocrand_state_sobol64 states; + const unsigned long long* directions; + HIP_CHECK(rocrand_get_direction_vectors64(&directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6)); + + rocrand_init(directions, 0, &states); + + run_normal_dist_with_state_out1(&states); + +} + +TEST(normal_distribution_with_states, scarambled_sobol64){ + rocrand_state_scrambled_sobol64 states; + const unsigned long long* directions; + HIP_CHECK(rocrand_get_direction_vectors64(&directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6)); + + rocrand_init(directions, 123456, 0, &states); + + run_normal_dist_with_state_out1(&states); +} + + +TEST(normal_distribution_with_states, lfsr113){ + rocrand_state_lfsr113 states; + rocrand_init(static_cast(12), 0, &states); + + run_normal_dist_with_state_out1(&states); + run_normal_dist_with_state_out2(&states); +} + +TEST(normal_distribution_with_states, threefry2x32_20){ + rocrand_state_threefry2x32_20 states; + rocrand_init(123456, 654321, 0, & states); + + run_normal_dist_with_state_out1(&states); + run_normal_dist_with_state_out2(&states); +} + +TEST(normal_distribution_with_states, threefry2x64_20){ + rocrand_state_threefry2x64_20 states; + rocrand_init(123456, 654321, 0, & states); + + run_normal_dist_with_state_out1(&states); + run_normal_dist_with_state_out2(&states); +} + +TEST(normal_distribution_with_states, rocrand_state_threefry4x32_20){ + rocrand_state_threefry4x32_20 states; + rocrand_init(123456, 654321, 0, & states); + + run_normal_dist_with_state_out1(&states); + run_normal_dist_with_state_out2(&states); +} + +TEST(normal_distribution_with_states, rocrand_state_threefry4x64_20){ + rocrand_state_threefry4x64_20 states; + rocrand_init(123456, 654321, 0, & states); + + run_normal_dist_with_state_out1(&states); + run_normal_dist_with_state_out2(&states); +} + +template +__global__ void mtgp32_kernel (rocrand_state_mtgp32 * states, T * output, const size_t N, const UDFunction & f){ + const unsigned int state_id = blockIdx.x; + const unsigned int thread_id = threadIdx.x; + unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; + + if(index >= N) + return; + + __shared__ rocrand_state_mtgp32 state; + if(thread_id == 0) + state = states[state_id]; + __syncthreads(); + + // output[index] = rocrand_uniform(&state); + output[index] = f(&state); + + if(thread_id == 0) + states[state_id] = state; +} + +TEST(normal_distribution_with_states, float_mtgp32){ + + size_t testSize = 40192; + size_t threads = 256; + size_t blocks = std::ceil(static_cast(testSize) / static_cast(threads)); + + rocrand_state_mtgp32 * states; + size_t state_size = blocks, seed = 654321; + HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32))); + rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, state_size, seed); + + float * hOut = new float[testSize]; + float * dOut; + HIP_CHECK(hipMalloc(&dOut, sizeof(float) * testSize)); + HIP_CHECK(hipDeviceSynchronize()); + + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(mtgp32_kernel), + dim3(blocks), + dim3(threads), + 0, + 0, + states, + dOut, + testSize, + [] __device__ (rocrand_state_mtgp32 * state){ + return rocrand_normal(state); + } + ); + + HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(float) * testSize, hipMemcpyDeviceToHost)); + + double mean = 0.0; + for(size_t i = 0; i < testSize; i++) + mean += hOut[i]; + + + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(hOut[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double eMean = 0; + double eStd = 1; + + ASSERT_NEAR(mean, eMean, 0.01) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << 0.01; + ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1; + + HIP_CHECK(hipFree(states)); + HIP_CHECK(hipFree(dOut)); + + delete [] hOut; +} + +TEST(normal_distribution_with_states, float2_mtgp32){ + + size_t testSize = 20224; + size_t threads = 256; + size_t blocks = std::ceil(static_cast(testSize) / static_cast(threads)); + + rocrand_state_mtgp32 * states; + size_t state_size = blocks, seed = 654321; + HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32))); + rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, state_size, seed); + + float2 * hOut = new float2[testSize]; + float2 * dOut; + HIP_CHECK(hipMalloc(&dOut, sizeof(float2) * testSize)); + HIP_CHECK(hipDeviceSynchronize()); + + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(mtgp32_kernel), + dim3(blocks), + dim3(threads), + 0, + 0, + states, + dOut, + testSize, + [] __device__ (rocrand_state_mtgp32 * state){ + return rocrand_normal2(state); + } + ); + + HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(float2) * testSize, hipMemcpyDeviceToHost)); + + double mean = 0.0; + for(size_t i = 0; i < testSize; i++) + mean += hOut[i].x + hOut[i].y; + + + mean /= (testSize * 2); + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(hOut[i].x - mean, 2) + std::pow(hOut[i].y - mean, 2); + + std = std::sqrt(std / (testSize * 2)); + + double eMean = 0; + double eStd = 1; + + ASSERT_NEAR(mean, eMean, 0.01) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << 0.01; + ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1; + + HIP_CHECK(hipFree(states)); + HIP_CHECK(hipFree(dOut)); + + delete [] hOut; +} + +TEST(normal_distribution_with_states, double_mtgp32){ + + size_t testSize = 40192; + size_t threads = 256; + size_t blocks = std::ceil(static_cast(testSize) / static_cast(threads)); + + rocrand_state_mtgp32 * states; + size_t state_size = blocks, seed = 654321; + HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32))); + rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, state_size, seed); + + double * hOut = new double[testSize]; + double * dOut; + HIP_CHECK(hipMalloc(&dOut, sizeof(double) * testSize)); + HIP_CHECK(hipDeviceSynchronize()); + + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(mtgp32_kernel), + dim3(blocks), + dim3(threads), + 0, + 0, + states, + dOut, + testSize, + [] __device__ (rocrand_state_mtgp32 * state){ + return rocrand_normal_double(state); + } + ); + + HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(double) * testSize, hipMemcpyDeviceToHost)); + + double mean = 0.0; + for(size_t i = 0; i < testSize; i++) + mean += hOut[i]; + + + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(hOut[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double eMean = 0; + double eStd = 1; + + ASSERT_NEAR(mean, eMean, 0.01) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << 0.01; + ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1; + + HIP_CHECK(hipFree(states)); + HIP_CHECK(hipFree(dOut)); + + delete [] hOut; +} + +TEST(normal_distribution_with_states, double2_mtgp32){ + + size_t testSize = 20224; + size_t threads = 256; + size_t blocks = std::ceil(static_cast(testSize) / static_cast(threads)); + + rocrand_state_mtgp32 * states; + size_t state_size = blocks, seed = 654321; + HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32))); + rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, state_size, seed); + + double2 * hOut = new double2[testSize]; + double2 * dOut; + HIP_CHECK(hipMalloc(&dOut, sizeof(double2) * testSize)); + HIP_CHECK(hipDeviceSynchronize()); + + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(mtgp32_kernel), + dim3(blocks), + dim3(threads), + 0, + 0, + states, + dOut, + testSize, + [] __device__ (rocrand_state_mtgp32 * state){ + return rocrand_normal_double2(state); + } + ); + + HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(double2) * testSize, hipMemcpyDeviceToHost)); + + double mean = 0.0; + for(size_t i = 0; i < testSize; i++) + mean += hOut[i].x + hOut[i].y; + + + mean /= (testSize * 2); + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(hOut[i].x - mean, 2) + std::pow(hOut[i].y - mean, 2); + + std = std::sqrt(std / (testSize * 2)); + + double eMean = 0; + double eStd = 1; + + ASSERT_NEAR(mean, eMean, 0.01) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << 0.01; + ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1; + + HIP_CHECK(hipFree(states)); + HIP_CHECK(hipFree(dOut)); + + delete [] hOut; +} diff --git a/test/internal/test_rocrand_threefry2x32_20_prng.cpp b/test/internal/test_rocrand_threefry2x32_20_prng.cpp index fa6667e41..478df90bc 100644 --- a/test/internal/test_rocrand_threefry2x32_20_prng.cpp +++ b/test/internal/test_rocrand_threefry2x32_20_prng.cpp @@ -161,3 +161,93 @@ TEST(threefry_prng_state_tests, discard_sequence_test) EXPECT_EQ(state.counter.y, 457U); EXPECT_EQ(state.substate, 0U); } + +TEST(threefry_additional_tests, rocrand_init_test) +{ + // making sure the outputs are the same when initialized with same parameters + rocrand_state_threefry2x32_20 state1, state2; + + using ull = unsigned long long; + + ull seeds[] = {0, 123, 321, 123456, 654321}; + ull subsequences[] = {0xf, 0xff, 0x1f, 0x1ff, 0x1f1}; + ull offsets[] = {0, 1, 2, 3, 4}; + + for(int i = 0; i < 5; i++){ + rocrand_init(seeds[i], subsequences[i], offsets[i], &state1); + rocrand_init(seeds[i], subsequences[i], offsets[i], &state2); + + for(int j = 0; j < 5000; j++) + ASSERT_EQ(rocrand(&state1), rocrand(&state2)); + } +} + +TEST(threefry_additional_tests, rocrand_test) +{ + // making sure the outputs are uniformly distributed! + rocrand_state_threefry2x32_20 state; + + rocrand_init(0, 0, 0, &state); + size_t testSize = 40000; + + unsigned int * output = new unsigned int[testSize]; + + double mean = 0; + for(size_t i = 0; i < testSize; i++){ + output[i] = rocrand(&state); + mean += static_cast(output[i]); + } + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(output[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double maxi = (double) std::numeric_limits::max(); + double eMean = 0.5 * (maxi); // 0.5(a + b) + double eStd = (maxi) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5) + + ASSERT_NEAR(mean, eMean, eMean * 0.1); + ASSERT_NEAR(std, eStd, eStd * 0.1); + + delete [] output; +} + +TEST(threefry_additional_tests, rocrand2_test) +{ + // making sure the outputs are uniformly distributed! + rocrand_state_threefry2x32_20 state; + + rocrand_init(0, 0, 0, &state); + size_t testSize = 40000; + + unsigned int * output = new unsigned int[testSize]; + + double mean = 0; + for(size_t i = 0; i < testSize; i += 2){ + uint2 t = rocrand2(&state); + output[i] = t.x; + output[i + 1] = t.y; + mean += static_cast(output[i]); + mean += static_cast(output[i + 1]); + } + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(output[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double maxi = (double) std::numeric_limits::max(); + // min val is 0 + double eMean = 0.5 * (maxi); // 0.5(a + b) + double eStd = (maxi) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5) + + ASSERT_NEAR(mean, eMean, eMean * 0.1); + ASSERT_NEAR(std, eStd, eStd * 0.1);\ + + delete [] output; +} diff --git a/test/internal/test_rocrand_threefry2x64_20_prng.cpp b/test/internal/test_rocrand_threefry2x64_20_prng.cpp index cd13ea3bb..4a1d776fd 100644 --- a/test/internal/test_rocrand_threefry2x64_20_prng.cpp +++ b/test/internal/test_rocrand_threefry2x64_20_prng.cpp @@ -169,3 +169,73 @@ TEST(threefry_prng_state_tests, discard_sequence_test) EXPECT_EQ(state.counter.y, 457ULL); EXPECT_EQ(state.substate, 0U); } + +TEST(threefry_additional_tests, rocrand_test) +{ + // making sure the outputs are uniformly distributed! + rocrand_state_threefry2x64_20 state; + + rocrand_init(0, 0, 0, &state); + size_t testSize = 40000; + + unsigned long long * output = new unsigned long long[testSize]; + + double mean = 0; + for(size_t i = 0; i < testSize; i++){ + output[i] = rocrand(&state); + mean += static_cast(output[i]); + } + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(output[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double maxi = (double) std::numeric_limits::max(); + double eMean = 0.5 * (maxi); // 0.5(a + b) + double eStd = (maxi) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5) + + ASSERT_NEAR(mean, eMean, eMean * 0.1); + ASSERT_NEAR(std, eStd, eStd * 0.1); + + delete [] output; +} + +TEST(threefry_additional_tests, rocrand2_test) +{ + // making sure the outputs are uniformly distributed! + rocrand_state_threefry2x64_20 state; + + rocrand_init(0, 0, 0, &state); + size_t testSize = 40000; + + unsigned long long * output = new unsigned long long[testSize]; + + double mean = 0; + for(size_t i = 0; i < testSize; i += 2){ + ulonglong2 t = rocrand2(&state); + output[i] = t.x; + output[i + 1] = t.y; + mean += static_cast(output[i]); + mean += static_cast(output[i + 1]); + } + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(output[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double maxi = (double) std::numeric_limits::max(); + // min val is 0 + double eMean = 0.5 * (maxi); // 0.5(a + b) + double eStd = (maxi) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5) + + ASSERT_NEAR(mean, eMean, eMean * 0.1); + ASSERT_NEAR(std, eStd, eStd * 0.1);\ + + delete [] output; +} diff --git a/test/internal/test_rocrand_threefry4x32_20_prng.cpp b/test/internal/test_rocrand_threefry4x32_20_prng.cpp index 3ce1be60e..c69df6fbf 100644 --- a/test/internal/test_rocrand_threefry4x32_20_prng.cpp +++ b/test/internal/test_rocrand_threefry4x32_20_prng.cpp @@ -245,3 +245,97 @@ TEST(threefry_prng_state_tests, discard_sequence_test) EXPECT_EQ(state.counter.w, 6U); EXPECT_EQ(state.substate, 0U); } + +TEST(threefry_additional_tests, rocrand_init_test) +{ + // making sure the outputs are the same when initialized with same parameters + rocrand_state_threefry4x32_20 state1, state2; + + using ull = unsigned long long; + + ull seeds[] = {0, 123, 321, 123456, 654321}; + ull subsequences[] = {0xf, 0xff, 0x1f, 0x1ff, 0x1f1}; + ull offsets[] = {0, 1, 2, 3, 4}; + + for(int i = 0; i < 5; i++){ + rocrand_init(seeds[i], subsequences[i], offsets[i], &state1); + rocrand_init(seeds[i], subsequences[i], offsets[i], &state2); + + for(int j = 0; j < 5000; j++) + ASSERT_EQ(rocrand(&state1), rocrand(&state2)); + } +} + +TEST(threefry_additional_tests, rocrand_test) +{ + // making sure the outputs are uniformly distributed! + rocrand_state_threefry4x32_20 state; + + rocrand_init(0, 0, 0, &state); + size_t testSize = 40000; + + unsigned int * output = new unsigned int[testSize]; + + double mean = 0; + for(size_t i = 0; i < testSize; i++){ + output[i] = rocrand(&state); + mean += static_cast(output[i]); + } + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(output[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double maxi = (double) std::numeric_limits::max(); + double eMean = 0.5 * (maxi); // 0.5(a + b) + double eStd = (maxi) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5) + + ASSERT_NEAR(mean, eMean, eMean * 0.1); + ASSERT_NEAR(std, eStd, eStd * 0.1); + + delete [] output; +} + +TEST(threefry_additional_tests, rocrand4_test) +{ + // making sure the outputs are uniformly distributed! + rocrand_state_threefry4x32_20 state; + + rocrand_init(0, 0, 0, &state); + size_t testSize = 40000; + + unsigned int * output = new unsigned int[testSize]; + + double mean = 0; + for(size_t i = 0; i < testSize; i += 4){ + uint4 t = rocrand4(&state); + output[i] = t.w; + output[i + 1] = t.x; + output[i + 2] = t.y; + output[i + 3] = t.z; + mean += static_cast(output[i]); + mean += static_cast(output[i + 1]); + mean += static_cast(output[i + 2]); + mean += static_cast(output[i + 3]); + } + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(output[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double maxi = (double) std::numeric_limits::max(); + // min val is 0 + double eMean = 0.5 * (maxi); // 0.5(a + b) + double eStd = (maxi) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5) + + ASSERT_NEAR(mean, eMean, eMean * 0.1); + ASSERT_NEAR(std, eStd, eStd * 0.1); + + delete [] output; +} diff --git a/test/internal/test_rocrand_threefry4x64_20_prng.cpp b/test/internal/test_rocrand_threefry4x64_20_prng.cpp index 280c333d2..cff39bacb 100644 --- a/test/internal/test_rocrand_threefry4x64_20_prng.cpp +++ b/test/internal/test_rocrand_threefry4x64_20_prng.cpp @@ -261,3 +261,97 @@ TEST(threefry_prng_state_tests, discard_sequence_test) EXPECT_EQ(state.counter.w, 6ULL); EXPECT_EQ(state.substate, 0U); } + +TEST(threefry_additional_tests, rocrand_init_test) +{ + // making sure the outputs are the same when initialized with same parameters + rocrand_state_threefry4x64_20 state1, state2; + + using ull = unsigned long long; + + ull seeds[] = {0, 123, 321, 123456, 654321}; + ull subsequences[] = {0xf, 0xff, 0x1f, 0x1ff, 0x1f1}; + ull offsets[] = {0, 1, 2, 3, 4}; + + for(int i = 0; i < 5; i++){ + rocrand_init(seeds[i], subsequences[i], offsets[i], &state1); + rocrand_init(seeds[i], subsequences[i], offsets[i], &state2); + + for(int j = 0; j < 5000; j++) + ASSERT_EQ(rocrand(&state1), rocrand(&state2)); + } +} + +TEST(threefry_additional_tests, rocrand_test) +{ + // making sure the outputs are uniformly distributed! + rocrand_state_threefry4x64_20 state; + + rocrand_init(0, 0, 0, &state); + size_t testSize = 40000; + + unsigned long long * output = new unsigned long long[testSize]; + + double mean = 0; + for(size_t i = 0; i < testSize; i++){ + output[i] = rocrand(&state); + mean += static_cast(output[i]); + } + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(output[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double maxi = (double) std::numeric_limits::max(); + double eMean = 0.5 * (maxi); // 0.5(a + b) + double eStd = (maxi) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5) + + ASSERT_NEAR(mean, eMean, eMean * 0.1); + ASSERT_NEAR(std, eStd, eStd * 0.1); + + delete [] output; +} + +TEST(threefry_additional_tests, rocrand4_test) +{ + // making sure the outputs are uniformly distributed! + rocrand_state_threefry4x64_20 state; + + rocrand_init(0, 0, 0, &state); + size_t testSize = 40000; + + unsigned long long * output = new unsigned long long[testSize]; + + double mean = 0; + for(size_t i = 0; i < testSize; i += 4){ + ulonglong4 t = rocrand4(&state); + output[i] = t.w; + output[i + 1] = t.x; + output[i + 2] = t.y; + output[i + 3] = t.z; + mean += static_cast(output[i]); + mean += static_cast(output[i + 1]); + mean += static_cast(output[i + 2]); + mean += static_cast(output[i + 3]); + } + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(output[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double maxi = (double) std::numeric_limits::max(); + // min val is 0 + double eMean = 0.5 * (maxi); // 0.5(a + b) + double eStd = (maxi) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5) + + ASSERT_NEAR(mean, eMean, eMean * 0.1); + ASSERT_NEAR(std, eStd, eStd * 0.1); + + delete [] output; +} diff --git a/test/internal/test_uniform_distribution.cpp b/test/internal/test_uniform_distribution.cpp index e897e0144..37453c6db 100644 --- a/test/internal/test_uniform_distribution.cpp +++ b/test/internal/test_uniform_distribution.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -24,6 +24,9 @@ #include #include +#include + +#define HIP_CHECK(state) ASSERT_EQ(state, hipSuccess) using namespace rocrand_impl::host; @@ -358,3 +361,764 @@ TEST(sobol_uniform_distribution_tests, half_test) EXPECT_GT(__half2float(output[0]), 0.0f); EXPECT_LT(__half2float(output[0]), 1e-4f); } + +template +struct NumericUD{ + + template + void run_test(UD & dis, const FuncCall & f){ + std::random_device rd; + std::mt19937 gen(rd()); + + const size_t testSize = 4000000; + + float * output = new float [testSize]; + + InType input; + OutType out; + + double mean = 0; + + for(size_t i = 0; i <= testSize; i += 4){ + input = {dis(gen), dis(gen), dis(gen), dis(gen)}; + + f(input, out); + + output[i] = out.w; + output[i + 1] = out.x; + output[i + 2] = out.y; + output[i + 3] = out.z; + + ASSERT_GT(out.w, 0); + ASSERT_GT(out.x, 0); + ASSERT_GT(out.y, 0); + ASSERT_GT(out.z, 0); + + ASSERT_LE(out.w, 1); + ASSERT_LE(out.x, 1); + ASSERT_LE(out.y, 1); + ASSERT_LE(out.z, 1); + + mean += out.w + out.x + out.y + out.z; + } + + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(output[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double eMean = 0.5 * (0 + 1); // 0.5(a + b) + double eStd = (1 - 0) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5) + + ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1; + ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1; + } + +}; + +TEST(uniform_distribution_tests, float4_uint4_in_test){ + unsigned int mini = 0; + unsigned int maxi = std::numeric_limits::max(); + std::uniform_int_distribution dis(mini, maxi); + + NumericUD> test; + test.run_test( + dis, + [] __host__ __device__ (uint4 & input, float4 & output){ + output = rocrand_device::detail::uniform_distribution4(input); + } + ); +} + +TEST(uniform_distribution_tests, float4_ulonglong4_test){ + unsigned long long mini = 0; + unsigned long long maxi = std::numeric_limits::max(); + std::uniform_int_distribution dis(mini, maxi); + + NumericUD> test; + test.run_test( + dis, + [] __host__ __device__ (ulonglong4 & input, float4 & output){ + output = rocrand_device::detail::uniform_distribution4(input); + } + ); +} + +TEST(uniform_distribution_tests, double4_uint4_test){ + unsigned int mini = 0; + unsigned int maxi = std::numeric_limits::max(); + std::uniform_int_distribution dis(mini, maxi); + + std::random_device rd; + std::mt19937 gen(rd()); + + NumericUD> test; + test.run_test( + dis, + [&] __host__ __device__ (uint4 & input, double4 & output){ + output = rocrand_device::detail::uniform_distribution_double4(input, input); + } + ); +} + +TEST(uniform_distribution_tests, double4_ulonglong4_test){ + unsigned long long mini = 0; + unsigned long long maxi = std::numeric_limits::max(); + std::uniform_int_distribution dis(mini, maxi); + + NumericUD> test; + test.run_test( + dis, + [] __host__ __device__ (ulonglong4 & input, double4 & output){ + output = rocrand_device::detail::uniform_distribution_double4(input); + } + ); +} + +TEST(uniform_distribution_tests, double2_uint4_in_test){ + unsigned int mini = 0; + unsigned int maxi = std::numeric_limits::max(); + std::uniform_int_distribution dis(mini, maxi); + + std::random_device rd; + std::mt19937 gen(rd()); + + NumericUD> test; + test.run_test( + dis, + [&] __host__ __device__ (uint4 & input, double4 & output){ + + uint4 secondInput = {dis(gen), dis(gen), dis(gen), dis(gen)}; + double2 o1 = rocrand_device::detail::uniform_distribution_double2(input); + double2 o2 = rocrand_device::detail::uniform_distribution_double2(secondInput); + + output.w = o1.x; output.x = o1.y; + output.y = o2.x; output.z = o2.y; + } + ); +} + +TEST(uniform_distribution_tests, double2_ulonglong2_in_test){ + unsigned long long mini = 0; + unsigned long long maxi = std::numeric_limits::max(); + std::uniform_int_distribution dis(mini, maxi); + + NumericUD> test; + test.run_test( + dis, + [&] __host__ __device__ (ulonglong4 & input, double4 & output){ + + ulonglong2 i1 = {input.w, input.x}; + ulonglong2 i2 = {input.y, input.z}; + + double2 o1 = rocrand_device::detail::uniform_distribution_double2(i1); + double2 o2 = rocrand_device::detail::uniform_distribution_double2(i2); + + output.w = o1.x; output.x = o1.y; + output.y = o2.x; output.z = o2.y; + } + ); +} + +TEST(uniform_distribution_tests, double2_ulonglong4_in_test){ + unsigned long long mini = 0; + unsigned long long maxi = std::numeric_limits::max(); + std::uniform_int_distribution dis(mini, maxi); + + std::random_device rd; + std::mt19937 gen(rd()); + + NumericUD> test; + test.run_test( + dis, + [&] __host__ __device__ (ulonglong4 & input, double4 & output){ + ulonglong4 secondInput = {dis(gen), dis(gen), dis(gen), dis(gen)}; + double2 o1 = rocrand_device::detail::uniform_distribution_double2(input); + double2 o2 = rocrand_device::detail::uniform_distribution_double2(secondInput); + + output.w = o1.x; output.x = o1.y; + output.y = o2.x; output.z = o2.y; + } + ); +} + +template +struct StatesUD{ + template + void run_test(const FuncCall & f, size_t testSize = 4000000){ + float * output = new float [testSize]; + OutType out; + + double mean = 0; + + for(size_t i = 0; i <= testSize; i += 4){ + f(out); + + output[i] = out.w; + output[i + 1] = out.x; + output[i + 2] = out.y; + output[i + 3] = out.z; + + ASSERT_GT(out.w, 0); + ASSERT_GT(out.x, 0); + ASSERT_GT(out.y, 0); + ASSERT_GT(out.z, 0); + + ASSERT_LE(out.w, 1); + ASSERT_LE(out.x, 1); + ASSERT_LE(out.y, 1); + ASSERT_LE(out.z, 1); + + mean += out.w + out.x + out.y + out.z; + } + + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(output[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double eMean = 0.5 * (0 + 1); // 0.5(a + b) + double eStd = (1 - 0) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5) + + ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1; + ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1; + } +}; + +TEST(uniform_distribution_tests, philox4x32_10_test){ + rocrand_state_philox4x32_10 states; + rocrand_init(123456, 654321, 0, &states); + + StatesUD testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output){ + output = { + rocrand_uniform(&states), rocrand_uniform(&states), + rocrand_uniform(&states), rocrand_uniform(&states) + }; + } + ); + + testFloat.run_test( + [&] __host__ __device__ (float4 & output){ + float2 o1 = rocrand_uniform2(&states); + float2 o2 = rocrand_uniform2(&states); + + output.w = o1.x; output.x = o1.y; + output.y = o2.x; output.z = o2.y; + } + ); + + testFloat.run_test( + [&] __host__ __device__ (float4 & output){ + output = rocrand_uniform4(&states); + } + ); + + StatesUD testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output){ + output = { + rocrand_uniform_double(&states), rocrand_uniform_double(&states), + rocrand_uniform_double(&states), rocrand_uniform_double(&states) + }; + } + ); + + testDouble.run_test( + [&] __host__ __device__ (double4 & output){ + double2 o1 = rocrand_uniform_double2(&states); + double2 o2 = rocrand_uniform_double2(&states); + + output.w = o1.x; output.x = o1.y; + output.y = o2.x; output.z = o2.y; + } + ); + + testDouble.run_test( + [&] __host__ __device__ (double4 & output){ + output = rocrand_uniform_double4(&states); + } + ); +} + +TEST(uniform_distribution_tests, mrg31k3p_test){ + rocrand_state_mrg31k3p states; + rocrand_init(123456, 654321, 0, &states); + + StatesUD testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output){ + output = { + rocrand_uniform(&states), rocrand_uniform(&states), + rocrand_uniform(&states), rocrand_uniform(&states) + }; + } + ); + + StatesUD testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output){ + output = { + rocrand_uniform_double(&states), rocrand_uniform_double(&states), + rocrand_uniform_double(&states), rocrand_uniform_double(&states) + }; + } + ); +} + +TEST(uniform_distribution_tests, mrg32k3a_test){ + rocrand_state_mrg32k3a states; + rocrand_init(123456, 654321, 0, &states); + + StatesUD testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output){ + output = { + rocrand_uniform(&states), rocrand_uniform(&states), + rocrand_uniform(&states), rocrand_uniform(&states) + }; + } + ); + + StatesUD testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output){ + output = { + rocrand_uniform_double(&states), rocrand_uniform_double(&states), + rocrand_uniform_double(&states), rocrand_uniform_double(&states) + }; + } + ); +} + +TEST(uniform_distribution_tests, xorwow_test){ + rocrand_state_xorwow states; + rocrand_init(123456, 654321, 0, &states); + + StatesUD testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output){ + output = { + rocrand_uniform(&states), rocrand_uniform(&states), + rocrand_uniform(&states), rocrand_uniform(&states) + }; + } + ); + + StatesUD testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output){ + output = { + rocrand_uniform_double(&states), rocrand_uniform_double(&states), + rocrand_uniform_double(&states), rocrand_uniform_double(&states) + }; + } + ); +} + +TEST(uniform_distribution_tests, sobol32_test){ + rocrand_state_sobol32 states; + const unsigned int* directions; + HIP_CHECK(rocrand_get_direction_vectors32(&directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6)); + rocrand_init(directions, 0, &states); + + StatesUD testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output){ + output = { + rocrand_uniform(&states), rocrand_uniform(&states), + rocrand_uniform(&states), rocrand_uniform(&states) + }; + } + ); + + StatesUD testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output){ + output = { + rocrand_uniform_double(&states), rocrand_uniform_double(&states), + rocrand_uniform_double(&states), rocrand_uniform_double(&states) + }; + } + ); +} + +TEST(uniform_distribution_tests, scrambled_sobol32_test){ + rocrand_state_scrambled_sobol32 states; + const unsigned int* directions; + HIP_CHECK(rocrand_get_direction_vectors32(&directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6)); + rocrand_init(directions, 123456, 0, &states); + + StatesUD testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output){ + output = { + rocrand_uniform(&states), rocrand_uniform(&states), + rocrand_uniform(&states), rocrand_uniform(&states) + }; + } + ); + + StatesUD testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output){ + output = { + rocrand_uniform_double(&states), rocrand_uniform_double(&states), + rocrand_uniform_double(&states), rocrand_uniform_double(&states) + }; + } + ); +} + +TEST(uniform_distribution_tests, sobol64_test){ + rocrand_state_sobol64 states; + const unsigned long long* directions; + HIP_CHECK(rocrand_get_direction_vectors64(&directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6)); + rocrand_init(directions, 0, &states); + + StatesUD testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output){ + output = { + rocrand_uniform(&states), rocrand_uniform(&states), + rocrand_uniform(&states), rocrand_uniform(&states) + }; + } + ); + + StatesUD testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output){ + output = { + rocrand_uniform_double(&states), rocrand_uniform_double(&states), + rocrand_uniform_double(&states), rocrand_uniform_double(&states) + }; + } + ); +} + +TEST(uniform_distribution_tests, scrambled_sobol64_test){ + rocrand_state_scrambled_sobol64 states; + const unsigned long long* directions; + HIP_CHECK(rocrand_get_direction_vectors64(&directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6)); + rocrand_init(directions, 123456, 0, &states); + + StatesUD testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output){ + output = { + rocrand_uniform(&states), rocrand_uniform(&states), + rocrand_uniform(&states), rocrand_uniform(&states) + }; + } + ); + + StatesUD testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output){ + output = { + rocrand_uniform_double(&states), rocrand_uniform_double(&states), + rocrand_uniform_double(&states), rocrand_uniform_double(&states) + }; + } + ); +} + +TEST(uniform_distribution_tests, lfsr113_test){ + rocrand_state_lfsr113 states; + rocrand_init(static_cast(12), 0, &states); + + StatesUD testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output){ + output = { + rocrand_uniform(&states), rocrand_uniform(&states), + rocrand_uniform(&states), rocrand_uniform(&states) + }; + } + ); + + StatesUD testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output){ + output = { + rocrand_uniform_double(&states), rocrand_uniform_double(&states), + rocrand_uniform_double(&states), rocrand_uniform_double(&states) + }; + } + ); +} + +TEST(uniform_distribution_tests, threefry2x32_20_test){ + rocrand_state_threefry2x32_20 states; + rocrand_init(123456, 654321, 0, & states); + + StatesUD testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output){ + output = { + rocrand_uniform(&states), rocrand_uniform(&states), + rocrand_uniform(&states), rocrand_uniform(&states) + }; + } + ); + + StatesUD testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output){ + output = { + rocrand_uniform_double(&states), rocrand_uniform_double(&states), + rocrand_uniform_double(&states), rocrand_uniform_double(&states) + }; + } + ); +} + +TEST(uniform_distribution_tests, threefry2x64_20_test){ + rocrand_state_threefry2x64_20 states; + rocrand_init(123456, 654321, 0, & states); + + StatesUD testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output){ + output = { + rocrand_uniform(&states), rocrand_uniform(&states), + rocrand_uniform(&states), rocrand_uniform(&states) + }; + } + ); + + StatesUD testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output){ + output = { + rocrand_uniform_double(&states), rocrand_uniform_double(&states), + rocrand_uniform_double(&states), rocrand_uniform_double(&states) + }; + } + ); +} + +TEST(uniform_distribution_tests, threefry4x32_20_test){ + rocrand_state_threefry4x32_20 states; + rocrand_init(123456, 654321, 0, & states); + + StatesUD testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output){ + output = { + rocrand_uniform(&states), rocrand_uniform(&states), + rocrand_uniform(&states), rocrand_uniform(&states) + }; + } + ); + + StatesUD testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output){ + output = { + rocrand_uniform_double(&states), rocrand_uniform_double(&states), + rocrand_uniform_double(&states), rocrand_uniform_double(&states) + }; + } + ); +} + +TEST(uniform_distribution_tests, threefry4x64_20_test){ + rocrand_state_threefry4x64_20 states; + rocrand_init(123456, 654321, 0, & states); + + StatesUD testFloat; + + testFloat.run_test( + [&] __host__ __device__ (float4 & output){ + output = { + rocrand_uniform(&states), rocrand_uniform(&states), + rocrand_uniform(&states), rocrand_uniform(&states) + }; + } + ); + + StatesUD testDouble; + + testDouble.run_test( + [&] __host__ __device__ (double4 & output){ + output = { + rocrand_uniform_double(&states), rocrand_uniform_double(&states), + rocrand_uniform_double(&states), rocrand_uniform_double(&states) + }; + } + ); +} + +template +__global__ void mtgp32_kernel (rocrand_state_mtgp32 * states, T * output, const size_t N, const UDFunction & f){ + const unsigned int state_id = blockIdx.x; + const unsigned int thread_id = threadIdx.x; + unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; + + if(index >= N) + return; + + __shared__ rocrand_state_mtgp32 state; + if(thread_id == 0) + state = states[state_id]; + __syncthreads(); + + // output[index] = rocrand_uniform(&state); + output[index] = f(&state); + + if(thread_id == 0) + states[state_id] = state; +} + +TEST(uniform_distribution_tests, float_mtgp32_test){ + size_t testSize = 40192; + size_t threads = 256; + size_t blocks = std::ceil(static_cast(testSize) / static_cast(threads)); + + rocrand_state_mtgp32 * states; + size_t state_size = blocks, seed = 654321; + HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32))); + rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, state_size, seed); + + float * hOut = new float[testSize]; + float * dOut; + HIP_CHECK(hipMalloc(&dOut, sizeof(float) * testSize)); + HIP_CHECK(hipDeviceSynchronize()); + + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(mtgp32_kernel), + dim3(blocks), + dim3(threads), + 0, + 0, + states, + dOut, + testSize, + [] __device__ (rocrand_state_mtgp32 * state){ + return rocrand_uniform(state); + } + ); + + HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(float) * testSize, hipMemcpyDeviceToHost)); + + double mean = 0.0; + for(size_t i = 0; i < testSize; i++){ + ASSERT_GT(hOut[i], 0.0); + ASSERT_LE(hOut[i], 1.0); + + mean += hOut[i]; + } + + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(hOut[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double eMean = 0.5 * (0 + 1); // 0.5(a + b) + double eStd = (1 - 0) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5) + + ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1; + ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1; + + HIP_CHECK(hipFree(states)); + HIP_CHECK(hipFree(dOut)); + + delete [] hOut; +} + +TEST(uniform_distribution_tests, double_mtgp32_test){ + size_t testSize = 40192; + size_t threads = 256; + size_t blocks = std::ceil(static_cast(testSize) / static_cast(threads)); + + rocrand_state_mtgp32 * states; + size_t state_size = blocks, seed = 654321; + HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32))); + rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, state_size, seed); + + double * hOut = new double[testSize]; + double * dOut; + HIP_CHECK(hipMalloc(&dOut, sizeof(double) * testSize)); + HIP_CHECK(hipDeviceSynchronize()); + + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(mtgp32_kernel), + dim3(blocks), + dim3(threads), + 0, + 0, + states, + dOut, + testSize, + [] __device__ (rocrand_state_mtgp32 * state){ + return rocrand_uniform_double(state); + } + ); + + HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(double) * testSize, hipMemcpyDeviceToHost)); + + double mean = 0.0; + for(size_t i = 0; i < testSize; i++){ + ASSERT_GT(hOut[i], 0.0); + ASSERT_LE(hOut[i], 1.0); + + mean += hOut[i]; + } + + mean /= testSize; + + double std = 0.0; + for(size_t i = 0; i < testSize; i++) + std += std::pow(hOut[i] - mean, 2); + + std = std::sqrt(std / testSize); + + double eMean = 0.5 * (0 + 1); // 0.5(a + b) + double eStd = (1 - 0) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5) + + ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1; + ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1; + + HIP_CHECK(hipFree(states)); + HIP_CHECK(hipFree(dOut)); + + delete [] hOut; +}