diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7001c9ecb..592347b3f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,14 @@ Documentation for rocRAND is available at
 ### Added
 
 * gfx950 support
+* Additional unit tests for `test_log_normal_distribution.cpp`
+* Additional unit tests for `test_normal_distribution.cpp`
+* Additional unit tests for `test_uniform_distribution.cpp`
+* Additional unit tests for `test_rocrand_threefry2x32_20_prng.cpp`
+* Additional unit tests for `test_rocrand_threefry2x64_20_prng.cpp`
+* Additional unit tests for `test_rocrand_threefry4x32_20_prng.cpp`
+* Additional unit tests for `test_rocrand_threefry4x64_20_prng.cpp`
+* New unit tests for `include/rocrand/rocrand_discrete.h` in `test_discrete_distribution.cpp`
 
 ### Changed
 
diff --git a/test/internal/test_discrete_distribution.cpp b/test/internal/test_discrete_distribution.cpp
new file mode 100644
index 000000000..16ec00104
--- /dev/null
+++ b/test/internal/test_discrete_distribution.cpp
@@ -0,0 +1,678 @@
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include <gtest/gtest.h>
+#include <stdio.h>
+
+#include <random>
+
+#include <vector>
+#include <rocrand/rocrand_discrete.h>
+#include <rocrand/rocrand_mtgp32_11213.h>
+
+#define HIP_CHECK(cmd)                                                                          \
+    do                                                                                          \
+    {                                                                                           \
+        auto error = (cmd);                                                                     \
+        if(error != hipSuccess)                                                                 \
+        {                                                                                       \
+            std::cerr << "Encountered HIP error (" << hipGetErrorString(error) << ") at line "  \
+                      << __LINE__ << " in file " << __FILE__ << "\n";                           \
+            exit(-1);                                                                           \
+        }                                                                                       \
+    } while(0)                                                                                  \
+
+#define ROCRAND_CHECK(cmd)                                                                      \
+    do                                                                                          \
+    {                                                                                           \
+        auto status = cmd;                                                                      \
+        if(status != 0){                                                                        \
+            std::cerr << "Encountered ROCRAND error: " << status << "at line"                   \
+            << __LINE__ << " in file " << __FILE__ << "\n";                                     \
+            exit(-1);                                                                           \
+        }                                                                                       \
+    } while(0)                                                                                  \
+
+
+
+struct GlobalSizes {
+    static constexpr size_t items_per_thread = 256;
+    static constexpr size_t block_size = 32;
+    static constexpr size_t items_per_block = items_per_thread * block_size;
+    static constexpr size_t grid_size = 1234;
+    static constexpr size_t size = grid_size * items_per_block;
+};
+
+using DiscreteDataType = ::testing::Types<double, unsigned int, unsigned long, unsigned long long int>;
+
+template <typename DT>
+class InternalDiscreteDistributionTests : public ::testing::Test{
+    public:
+        using T = DT;
+};
+
+TYPED_TEST_SUITE(InternalDiscreteDistributionTests, DiscreteDataType);
+
+template<typename T, class DiscreteFunc>
+__global__ void internal_discrete_kernel(T * device_input, unsigned int * device_output, rocrand_discrete_distribution_st &dis, const DiscreteFunc & f){
+    const size_t items_per_block = GlobalSizes::items_per_thread * GlobalSizes::block_size;
+    const size_t offset = (items_per_block * blockIdx.x) + (GlobalSizes::items_per_thread * threadIdx.x);
+
+    for(size_t i = 0; i < GlobalSizes::items_per_thread; i++){
+        device_output[offset + i] = f(device_input[offset + i], dis);
+    }
+}
+
+template <typename T, class DiscreteFunc>
+void run_internal_discrete_tests(const DiscreteFunc & f){
+    std::vector<std::vector<double>> all_distributions = {
+        {10, 10, 10, 10},
+        {1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1},
+        {1234, 1677, 1519, 1032, 561, 254, 98, 33, 10, 2},
+        {1, 2, 8, 4, 3, 2, 1}
+    };
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    
+    T * host_input = new T[GlobalSizes::size];
+    unsigned int * host_output = new unsigned int[GlobalSizes::size];
+
+    // Check for different types of data input and generate the input data
+    if constexpr (std::is_same_v<T, double>){
+        std::uniform_real_distribution<double> dis(0, 1);
+        for(size_t i = 0; i < GlobalSizes::size; i++) host_input[i] = dis(gen);
+    }
+    else if constexpr(std::is_same_v<T, unsigned long> || std::is_same_v<T, unsigned int>){
+        std::uniform_int_distribution<T> dis(0, std::numeric_limits<unsigned int>::max());
+        for(size_t i = 0; i < GlobalSizes::size; i++) host_input[i] = dis(gen);
+    }
+    else{
+        std::uniform_int_distribution<T> dis(0, std::numeric_limits<T>::max());
+        for(size_t i = 0; i < GlobalSizes::size; i++) host_input[i] = dis(gen);
+    }
+
+    T * device_input;
+    unsigned int * device_output;
+
+    HIP_CHECK(hipMalloc(&device_input, sizeof(T) * GlobalSizes::size));
+    HIP_CHECK(hipMalloc(&device_output, sizeof(unsigned int) * GlobalSizes::size));
+
+    HIP_CHECK(hipMemcpy(device_input, host_input, sizeof(T) * GlobalSizes::size, hipMemcpyHostToDevice));
+
+    // Generate different discrete distributions and check them against expected 
+    for(std::vector<double> distribution : all_distributions){
+        // Getting expected Results
+        double sum = std::accumulate(distribution.begin(), distribution.end(), 0);
+        std::vector<double> expected_prob(distribution.size());
+        for(size_t i = 0; i < distribution.size(); i++)
+            expected_prob[i] = distribution[i] / sum;
+        
+        // Creating the discrete distribution
+        rocrand_discrete_distribution discrete_distribution;
+        ROCRAND_CHECK(rocrand_create_discrete_distribution(expected_prob.data(), expected_prob.size(), 0, &discrete_distribution));
+
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(internal_discrete_kernel<T>),
+            dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0,
+            device_input, device_output, *discrete_distribution, f
+        );
+
+        HIP_CHECK(hipMemcpy(host_output, device_output, sizeof(unsigned int) * GlobalSizes::size, hipMemcpyDeviceToHost));
+
+        std::vector<double> histogram(distribution.size());
+
+        // Calculating the actual results
+        for(size_t i = 0; i < GlobalSizes::size; i++)
+            histogram[host_output[i]]++;
+
+        std::vector<double> actual_prob(distribution.size());
+        for(size_t i = 0; i < actual_prob.size(); i++)
+            actual_prob[i] = histogram[i] / static_cast<double>(GlobalSizes::size);
+        
+        // If the original probability is bigger than 5% then expected should be within 1% difference.
+        // Otherwise it should be within 0.01
+        for(size_t i = 0; i < expected_prob.size(); i++){
+            double eps = expected_prob[i] > 0.05 ? expected_prob[i] * 0.01 : 0.01;
+            ASSERT_NEAR(expected_prob[i], actual_prob[i], eps); 
+        }
+
+        ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution));
+    }
+
+    delete [] host_input;
+    delete [] host_output;
+
+    HIP_CHECK(hipFree(device_input));
+    HIP_CHECK(hipFree(device_output));
+}
+
+TYPED_TEST(InternalDiscreteDistributionTests, InternalDiscreteAliasTest){
+    using T = TestFixture::T;
+    run_internal_discrete_tests<T>(
+        [=] __device__(T val, rocrand_discrete_distribution_st & dis)
+            {return rocrand_device::detail::discrete_alias(val, dis);}
+    );
+}
+
+TYPED_TEST(InternalDiscreteDistributionTests, InternalDiscreteCDFTest){
+    using T = TestFixture::T;
+    run_internal_discrete_tests<T>(
+        [=] __device__(T val, rocrand_discrete_distribution_st & dis)
+            {return rocrand_device::detail::discrete_cdf(val, dis);}
+    );
+}
+
+template<class RocRandPrngType>
+__global__ void block_wide_external_discrete_kernel(
+    RocRandPrngType * states, 
+    unsigned int * device_output, 
+    rocrand_discrete_distribution_st & dis, 
+    size_t items_per_thread, 
+    size_t block_size
+){
+    const size_t items_per_block = items_per_thread * block_size;
+    const size_t offset = (items_per_block * blockIdx.x) + (items_per_thread * threadIdx.x);
+    
+    for(size_t i = 0; i < items_per_thread; i++){
+        __shared__ RocRandPrngType state;
+        if(threadIdx.x == 0)
+            state = states[blockIdx.x];
+        __syncthreads();
+
+        device_output[offset + i] = rocrand_discrete(&state, &dis);
+
+        if(threadIdx.x == 0)
+            states[blockIdx.x] = state;
+    }
+}
+
+template<class RocRandPrngType>
+__global__ void external_discrete_kernel(
+    RocRandPrngType * states, 
+    unsigned int * device_output, 
+    rocrand_discrete_distribution_st & dis, 
+    size_t items_per_thread, 
+    size_t block_size
+){
+    const size_t items_per_block = items_per_thread * block_size;
+    const size_t offset = (items_per_block * blockIdx.x) + (items_per_thread * threadIdx.x);
+    
+    for(size_t i = 0; i < items_per_thread; i++){
+        auto local_state = states[offset + i];
+        device_output[offset + i] = rocrand_discrete(&local_state, &dis);
+        states[offset + i] = local_state;
+    }
+}
+
+template<bool block_wide, class PrngState>
+void run_external_discrete_tests(
+    PrngState & device_states, 
+    size_t items_per_thread = GlobalSizes::items_per_thread, 
+    size_t block_size = GlobalSizes::block_size, 
+    size_t grid_size = GlobalSizes::grid_size, 
+    size_t size = GlobalSizes::size
+){
+
+    std::vector<std::vector<double>> all_distributions = {
+        {10, 10, 10, 10},
+        {1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1},
+        {1234, 1677, 1519, 1032, 561, 254, 98, 33, 10, 2},
+        {1, 2, 8, 4, 3, 2, 1}
+    };
+
+    unsigned int * host_output = new unsigned int[size];
+    unsigned int * device_output;
+    HIP_CHECK(hipMalloc(&device_output, sizeof(unsigned int) * size));
+
+    for(std::vector<double> distribution : all_distributions){
+        
+        // Getting expected Results
+        double sum = std::accumulate(distribution.begin(), distribution.end(), 0);
+        std::vector<double> expected_prob(distribution.size());
+        for(size_t i = 0; i < distribution.size(); i++)
+            expected_prob[i] = distribution[i] / sum;
+
+        // Creating the discrete distribution
+        rocrand_discrete_distribution discrete_distribution;
+        ROCRAND_CHECK(rocrand_create_discrete_distribution(expected_prob.data(), expected_prob.size(), 0, &discrete_distribution));
+
+        if constexpr(block_wide){
+            hipLaunchKernelGGL(
+                HIP_KERNEL_NAME(block_wide_external_discrete_kernel),
+                dim3(grid_size), dim3(block_size), 0, 0,
+                device_states, device_output, *discrete_distribution, items_per_thread, block_size
+            );
+        }
+        else{
+            hipLaunchKernelGGL(
+                HIP_KERNEL_NAME(external_discrete_kernel),
+                dim3(grid_size), dim3(block_size), 0, 0,
+                device_states, device_output, *discrete_distribution, items_per_thread, block_size
+            );
+        }
+
+        HIP_CHECK(hipMemcpy(host_output, device_output, sizeof(unsigned int) * size, hipMemcpyDeviceToHost));
+        std::vector<double> histogram(distribution.size());
+
+        // Calculating the actual results
+        for(size_t i = 0; i < size; i++)
+            histogram[host_output[i]]++;
+
+        std::vector<double> actual_prob(distribution.size());
+        for(size_t i = 0; i < actual_prob.size(); i++)
+            actual_prob[i] = histogram[i] / static_cast<double>(size);
+        
+        // If the original probability is bigger than 5% then expected should be within 1% difference.
+        // Otherwise it should be within 0.01
+        for(size_t i = 0; i < expected_prob.size(); i++){
+            double eps = expected_prob[i] > 0.05 ? expected_prob[i] * 0.01 : 0.01;
+            ASSERT_NEAR(expected_prob[i], actual_prob[i], eps); 
+        }
+
+        ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution));
+    }
+
+    delete [] host_output;
+    HIP_CHECK(hipFree(device_output));
+}
+
+template<class RocRandPrngType, class InitFunc>
+__global__ void init_rocrand_states_kernel(RocRandPrngType * states, const InitFunc & f){
+    constexpr size_t items_per_block = GlobalSizes::items_per_thread * GlobalSizes::block_size;
+    const size_t offset = (items_per_block * blockIdx.x) + (GlobalSizes::items_per_thread * threadIdx.x);
+
+    for(size_t i = 0; i < GlobalSizes::items_per_thread; i++)
+        f(i, offset, &states[offset + i]);
+        // rocrand_init((123456 ^ i), offset + i, 0, &states[offset + i]);
+}
+
+TEST(ExternalDiscreteDistributionTests, Philox4x32_10Test){
+    // Initialize the prng state
+    rocrand_state_philox4x32_10 * device_states;
+    HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_philox4x32_10) * GlobalSizes::size));
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(init_rocrand_states_kernel),
+        dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0,
+        device_states, 
+        [] __device__ (size_t index, size_t offset, rocrand_state_philox4x32_10 * state){
+            rocrand_init((123456 ^ index), offset + index, 0, state);
+        }
+    );
+
+    run_external_discrete_tests<false>(device_states);
+
+    HIP_CHECK(hipFree(device_states));
+}
+
+TEST(ExternalDiscreteDistributionTests, Mrg31k3pTest){
+    // Initialize the prng state
+    rocrand_state_mrg31k3p * device_states;
+    HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_mrg31k3p) * GlobalSizes::size));
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(init_rocrand_states_kernel),
+        dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0,
+        device_states,
+        [] __device__ (size_t index, size_t offset, rocrand_state_mrg31k3p * state){
+            rocrand_init((123456 ^ index), offset + index, 0, state);
+        }
+    );
+
+    run_external_discrete_tests<false>(device_states);
+
+    HIP_CHECK(hipFree(device_states));
+}
+
+TEST(ExternalDiscreteDistributionTests, Mrg32k3aTest){
+    // Initialize the prng state
+    rocrand_state_mrg32k3a * device_states;
+    HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_mrg32k3a) * GlobalSizes::size));
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(init_rocrand_states_kernel),
+        dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0,
+        device_states,
+        [] __device__ (size_t index, size_t offset, rocrand_state_mrg32k3a * state){
+            rocrand_init((123456 ^ index), offset + index, 0, state);
+        }
+    );
+
+    run_external_discrete_tests<false>(device_states);
+
+    HIP_CHECK(hipFree(device_states));
+}
+
+TEST(ExternalDiscreteDistributionTests, XorwowTest){
+    // Initialize the prng state
+    rocrand_state_xorwow * device_states;
+    HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_xorwow) * GlobalSizes::size));
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(init_rocrand_states_kernel),
+        dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0,
+        device_states,
+        [] __device__ (size_t index, size_t offset, rocrand_state_xorwow * state){
+            rocrand_init((123456 ^ index), offset + index, 0, state);
+        }
+    );
+
+    run_external_discrete_tests<false>(device_states);
+
+    HIP_CHECK(hipFree(device_states));
+}
+
+TEST(ExternalDiscreteDistributionTests, Mtgp32Test){
+    constexpr size_t items_per_thread = 1024;
+    constexpr size_t block_size = 256;
+    constexpr size_t grid_size = 12;
+    constexpr size_t items_per_block = block_size * items_per_thread;
+
+    constexpr size_t test_size = items_per_block * grid_size;
+    rocrand_state_mtgp32 * states;
+    
+    HIP_CHECK(hipMalloc(&states, sizeof(rocrand_state_mtgp32) * grid_size));
+    rocrand_make_state_mtgp32(states,  mtgp32dc_params_fast_11213, grid_size, 123456);
+    HIP_CHECK(hipDeviceSynchronize());
+
+    run_external_discrete_tests<true>(
+        states, 
+        items_per_thread, 
+        block_size, 
+        grid_size, 
+        test_size
+    );
+    HIP_CHECK(hipFree(states));
+}
+
+TEST(ExternalDiscreteDistributionTests, Lfsr113Test){
+    // Initialize the prng state
+    rocrand_state_lfsr113 * device_states;
+    HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_lfsr113) * GlobalSizes::size));
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(init_rocrand_states_kernel),
+        dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0,
+        device_states,
+        [] __device__ (size_t index, size_t offset, rocrand_state_lfsr113 * state){
+            rocrand_init(
+                {
+                    (123456 ^ index),
+                    (123456 ^ index) << 1, 
+                    (123456 ^ index) << 2, 
+                    (123456 ^ index) << 3
+                }, 
+                offset + index, 
+                0, 
+                state
+            );
+        }
+    );
+
+    run_external_discrete_tests<false>(device_states);
+
+    HIP_CHECK(hipFree(device_states));
+}
+
+TEST(ExternalDiscreteDistributionTests, Sobol32Test){
+    // Initialize the prng state
+    rocrand_state_sobol32 * host_states = new rocrand_state_sobol32[GlobalSizes::size];
+    const unsigned int* directions;
+    ROCRAND_CHECK(rocrand_get_direction_vectors32(&directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6));
+
+    // 640000 is the size of directions. This is to prevent overflow stuff
+    for(size_t i = 0; i < GlobalSizes::size; i++)
+        rocrand_init(directions, i % 640000, host_states + i);
+
+    rocrand_state_sobol32 * device_states;
+    HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_sobol32) * GlobalSizes::size));
+    HIP_CHECK(hipMemcpy(device_states, host_states, sizeof(rocrand_state_sobol32) * GlobalSizes::size, hipMemcpyHostToDevice));
+
+
+    run_external_discrete_tests<false>(device_states);
+
+    delete [] host_states;
+    HIP_CHECK(hipFree(device_states));
+}
+
+TEST(ExternalDiscreteDistributionTests, ScrambledSobol32Test){
+    // Initialize the prng state
+    rocrand_state_scrambled_sobol32 * host_states = new rocrand_state_scrambled_sobol32[GlobalSizes::size];
+    const unsigned int* directions;
+    ROCRAND_CHECK(rocrand_get_direction_vectors32(&directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6));
+
+    // 640000 is the size of directions. This is to prevent overflow stuff
+    for(size_t i = 0; i < GlobalSizes::size; i++)
+        rocrand_init(directions, 123456 ^ i, i % 640000, host_states + i);
+
+    rocrand_state_scrambled_sobol32 * device_states;
+    HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_scrambled_sobol32) * GlobalSizes::size));
+    HIP_CHECK(hipMemcpy(device_states, host_states, sizeof(rocrand_state_scrambled_sobol32) * GlobalSizes::size, hipMemcpyHostToDevice));
+
+
+    run_external_discrete_tests<false>(device_states);
+
+    delete [] host_states;
+    HIP_CHECK(hipFree(device_states));
+}
+
+TEST(ExternalDiscreteDistributionTests, Sobol64Test){
+    // Initialize the prng state
+    rocrand_state_sobol64 * host_states = new rocrand_state_sobol64[GlobalSizes::size];
+    const unsigned long long* directions;
+    ROCRAND_CHECK(rocrand_get_direction_vectors64(&directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6));
+
+    // 1280000 is the size of directions. This is to prevent overflow stuff
+    for(size_t i = 0; i < GlobalSizes::size; i++)
+        rocrand_init(directions, i % 1280000, host_states + i);
+
+    rocrand_state_sobol64 * device_states;
+    HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_sobol64) * GlobalSizes::size));
+    HIP_CHECK(hipMemcpy(device_states, host_states, sizeof(rocrand_state_sobol64) * GlobalSizes::size, hipMemcpyHostToDevice));
+
+
+    run_external_discrete_tests<false>(device_states);
+
+    delete [] host_states;
+    HIP_CHECK(hipFree(device_states));
+}
+
+TEST(ExternalDiscreteDistributionTests, ScrambledSobol64Test){
+    // Initialize the prng state
+    rocrand_state_scrambled_sobol64 * host_states = new rocrand_state_scrambled_sobol64[GlobalSizes::size];
+    const unsigned long long* directions;
+    ROCRAND_CHECK(rocrand_get_direction_vectors64(&directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6));
+
+    // 1280000 is the size of directions. This is to prevent overflow stuff
+    for(size_t i = 0; i < GlobalSizes::size; i++)
+        rocrand_init(directions, 123456 ^ i, i % 1280000, host_states + i);
+
+    rocrand_state_scrambled_sobol64 * device_states;
+    HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_scrambled_sobol64) * GlobalSizes::size));
+    HIP_CHECK(hipMemcpy(device_states, host_states, sizeof(rocrand_state_scrambled_sobol64) * GlobalSizes::size, hipMemcpyHostToDevice));
+
+
+    run_external_discrete_tests<false>(device_states);
+
+    delete [] host_states;
+    HIP_CHECK(hipFree(device_states));
+}
+
+TEST(ExternalDiscreteDistributionTests, Threefry2x32_20Test){
+    // Initialize the prng state
+    rocrand_state_threefry2x32_20 * device_states;
+    HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_threefry2x32_20) * GlobalSizes::size));
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(init_rocrand_states_kernel),
+        dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0,
+        device_states,
+        [] __device__ (size_t index, size_t offset, rocrand_state_threefry2x32_20 * state){
+            rocrand_init((123456 ^ index), offset + index, 0, state);
+        }
+    );
+
+    run_external_discrete_tests<false>(device_states);
+
+    HIP_CHECK(hipFree(device_states));
+}
+
+TEST(ExternalDiscreteDistributionTests, Threefry2x64_20Test){
+    // Initialize the prng state
+    rocrand_state_threefry2x64_20 * device_states;
+    HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_threefry2x64_20) * GlobalSizes::size));
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(init_rocrand_states_kernel),
+        dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0,
+        device_states,
+        [] __device__ (size_t index, size_t offset, rocrand_state_threefry2x64_20 * state){
+            rocrand_init((123456 ^ index), offset + index, 0, state);
+        }
+    );
+
+    run_external_discrete_tests<false>(device_states);
+
+    HIP_CHECK(hipFree(device_states));
+}
+
+TEST(ExternalDiscreteDistributionTests, Threefry4x32_20Test){
+    // Initialize the prng state
+    rocrand_state_threefry4x32_20 * device_states;
+    HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_threefry4x32_20) * GlobalSizes::size));
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(init_rocrand_states_kernel),
+        dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0,
+        device_states,
+        [] __device__ (size_t index, size_t offset, rocrand_state_threefry4x32_20 * state){
+            rocrand_init((123456 ^ index), offset + index, 0, state);
+        }
+    );
+
+    run_external_discrete_tests<false>(device_states);
+
+    HIP_CHECK(hipFree(device_states));
+}
+
+TEST(ExternalDiscreteDistributionTests, Threefry4x64_20Test){
+    // Initialize the prng state
+    rocrand_state_threefry4x64_20 * device_states;
+    HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_threefry4x64_20) * GlobalSizes::size));
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(init_rocrand_states_kernel),
+        dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0,
+        device_states,
+        [] __device__ (size_t index, size_t offset, rocrand_state_threefry4x64_20 * state){
+            rocrand_init((123456 ^ index), offset + index, 0, state);
+        }
+    );
+
+    run_external_discrete_tests<false>(device_states);
+
+    HIP_CHECK(hipFree(device_states));
+}
+
+template<size_t items_per_thread, size_t block_size>
+__global__ void uint4_kernel(rocrand_state_philox4x32_10 * states, uint4 * device_output, rocrand_discrete_distribution_st & dis){
+    const size_t items_per_block = items_per_thread * block_size;
+    const size_t offset = (items_per_block * blockIdx.x) + (items_per_thread * threadIdx.x);
+    
+    for(size_t i = 0; i < items_per_thread; i++){
+        auto local_state = states[offset + i];
+        device_output[offset + i] = rocrand_discrete4(&local_state, &dis);
+        states[offset + i] = local_state;
+    }
+}
+
+TEST(ExternalDiscreteDistributionTests, Philox4x32_10WithUIN4OutputTest)
+{
+    // Initialize the prng state
+    rocrand_state_philox4x32_10 * device_states;
+    HIP_CHECK(hipMalloc(&device_states, sizeof(rocrand_state_philox4x32_10) * GlobalSizes::size));
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(init_rocrand_states_kernel),
+        dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0,
+        device_states, 
+        [] __device__ (size_t index, size_t offset, rocrand_state_philox4x32_10 * state){
+            rocrand_init((123456 ^ index), offset + index, 0, state);
+        }
+    );
+
+    std::vector<std::vector<double>> all_distributions = {
+        {10, 10, 10, 10},
+        {1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1},
+        {1234, 1677, 1519, 1032, 561, 254, 98, 33, 10, 2},
+        {1, 2, 8, 4, 3, 2, 1}
+    };
+
+    uint4 * host_output = new uint4[GlobalSizes::size];
+    uint4 * device_output;
+    HIP_CHECK(hipMalloc(&device_output, sizeof(uint4) * GlobalSizes::size));
+
+    for(std::vector<double> distribution : all_distributions){
+        
+        // Getting expected Results
+        double sum = std::accumulate(distribution.begin(), distribution.end(), 0);
+        std::vector<double> expected_prob(distribution.size());
+        for(size_t i = 0; i < distribution.size(); i++)
+            expected_prob[i] = distribution[i] / sum;
+
+        // Creating the discrete distribution
+        rocrand_discrete_distribution discrete_distribution;
+        ROCRAND_CHECK(rocrand_create_discrete_distribution(expected_prob.data(), expected_prob.size(), 0, &discrete_distribution));
+
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(uint4_kernel<GlobalSizes::items_per_thread, GlobalSizes::block_size>),
+            dim3(GlobalSizes::grid_size), dim3(GlobalSizes::block_size), 0, 0,
+            device_states, device_output, * discrete_distribution
+        );
+
+        HIP_CHECK(hipMemcpy(host_output, device_output, sizeof(uint4) * GlobalSizes::size, hipMemcpyDeviceToHost));
+        std::vector<double> histogram(distribution.size());
+
+        // Calculating the actual results
+        for(size_t i = 0; i < GlobalSizes::size; i++){
+            histogram[host_output[i].w]++;
+            histogram[host_output[i].x]++;
+            histogram[host_output[i].y]++;
+            histogram[host_output[i].z]++;
+        }
+
+        std::vector<double> actual_prob(distribution.size());
+        for(size_t i = 0; i < actual_prob.size(); i++)
+            actual_prob[i] = histogram[i] / static_cast<double>(GlobalSizes::size * 4);
+        
+        // If the original probability is bigger than 5% then expected should be within 1% difference.
+        // Otherwise it should be within 0.01
+        for(size_t i = 0; i < expected_prob.size(); i++){
+            double eps = expected_prob[i] > 0.05 ? expected_prob[i] * 0.01 : 0.01;
+            ASSERT_NEAR(expected_prob[i], actual_prob[i], eps); 
+        }
+
+        ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution));
+    }
+
+    delete [] host_output;
+    HIP_CHECK(hipFree(device_output));
+}
diff --git a/test/internal/test_log_normal_distribution.cpp b/test/internal/test_log_normal_distribution.cpp
index efdf0f411..40a297422 100644
--- a/test/internal/test_log_normal_distribution.cpp
+++ b/test/internal/test_log_normal_distribution.cpp
@@ -24,6 +24,9 @@
 #include <random>
 
 #include <rng/distribution/log_normal.hpp>
+#include <rocrand/rocrand_mtgp32_11213.h>
+
+#define HIP_CHECK(state) ASSERT_EQ(state, hipSuccess)
 
 using namespace rocrand_impl::host;
 
@@ -403,11 +406,874 @@ TYPED_TEST(sobol_log_normal_distribution_tests, half_test)
         std += std::pow(__half2float(val[i]) - mean, 2);
     }
     std = std::sqrt(std / size);
-
+    
     float expected_mean = std::exp(0.2f + 0.5f * 0.5f / 2);
     float expected_std
-        = std::sqrt((std::exp(0.5f * 0.5f) - 1.0) * std::exp(2 * 0.2f + 0.5f * 0.5f));
-
+    = std::sqrt((std::exp(0.5f * 0.5f) - 1.0) * std::exp(2 * 0.2f + 0.5f * 0.5f));
+    
     EXPECT_NEAR(expected_mean, mean, expected_mean * 0.1f);
     EXPECT_NEAR(expected_std, std, expected_std * 0.1f);
 }
+
+
+template <typename OutType>
+struct StatesLND{
+    template <typename FuncCall>
+    void run_test(const FuncCall & f, size_t testSize = 4000000){
+        double iMean = 0;
+        double iStd = 1;
+
+        float * output = new float [testSize];
+        OutType out;
+    
+        double mean = 0;
+    
+        for(size_t i = 0; i <= testSize; i += 4){
+            f(out, iMean, iStd);
+    
+            output[i] = out.w;
+            output[i + 1] = out.x;
+            output[i + 2] = out.y;
+            output[i + 3] = out.z;
+            mean += out.w + out.x + out.y + out.z;
+        }
+    
+        mean /= testSize;
+    
+        double std = 0.0;
+        for(size_t i = 0; i < testSize; i++)
+            std += std::pow(output[i] - mean, 2);
+    
+        std = std::sqrt(std / testSize);
+    
+        double eMean = std::exp(iMean + (iStd * iStd) / 2);
+        double eStd = std::sqrt(std::log(1 + (iStd * iStd)/(iMean * iMean)));
+        ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1;
+        ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1;
+    }
+};
+
+TEST(log_normal_distribution_tests, philox4x32_10_test){
+    rocrand_state_philox4x32_10 states;
+    rocrand_init(123456, 654321, 0, &states);
+
+    StatesLND<float4> testFloat;
+    StatesLND<double4> testDouble;
+
+    #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
+        testFloat.run_test()(
+            [&] __host__ __device__ (float4 & output, float mean, float std){
+                output = {
+                    rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std),
+                    rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std)
+                };
+        )
+        testDouble.run_test()(
+            [&] __host__ __device__ (double4 & output, double mean, double std){
+                output = {
+                    rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std),
+                    rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std)
+                };
+            } 
+        )
+    #endif // ROCRAND_DETAIL_BM_NOT_IN_STATE
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            float2 o1 = rocrand_log_normal2(&states, mean, std);
+            float2 o2 = rocrand_log_normal2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            output = rocrand_log_normal4(&states, mean, std);
+        }
+    );
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            double2 o1 = rocrand_log_normal_double2(&states, mean, std);
+            double2 o2 = rocrand_log_normal_double2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            output = rocrand_log_normal_double4(&states, mean, std);
+        }
+    );
+}
+
+TEST(log_normal_distribution_tests, mrg31k3p_test){
+    rocrand_state_mrg31k3p states;
+    rocrand_init(123456, 654321, 0, &states);
+
+    StatesLND<float4> testFloat;
+    StatesLND<double4> testDouble;
+
+    #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
+        testFloat.run_test()(
+            [&] __host__ __device__ (float4 & output, float mean, float std){
+                output = {
+                    rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std),
+                    rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std)
+                };
+        )
+        testDouble.run_test()(
+            [&] __host__ __device__ (double4 & output, double mean, double std){
+                output = {
+                    rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std),
+                    rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std)
+                };
+            } 
+        )
+    #endif // ROCRAND_DETAIL_BM_NOT_IN_STATE
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            float2 o1 = rocrand_log_normal2(&states, mean, std);
+            float2 o2 = rocrand_log_normal2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            double2 o1 = rocrand_log_normal_double2(&states, mean, std);
+            double2 o2 = rocrand_log_normal_double2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+}
+
+TEST(log_normal_distribution_tests, mrg32k3a_test){
+    rocrand_state_mrg32k3a states;
+    rocrand_init(123456, 654321, 0, &states);
+
+    StatesLND<float4> testFloat;
+    StatesLND<double4> testDouble;
+
+    #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
+        testFloat.run_test()(
+            [&] __host__ __device__ (float4 & output, float mean, float std){
+                output = {
+                    rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std),
+                    rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std)
+                };
+        )
+        testDouble.run_test()(
+            [&] __host__ __device__ (double4 & output, double mean, double std){
+                output = {
+                    rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std),
+                    rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std)
+                };
+            } 
+        )
+    #endif // ROCRAND_DETAIL_BM_NOT_IN_STATE
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            float2 o1 = rocrand_log_normal2(&states, mean, std);
+            float2 o2 = rocrand_log_normal2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            double2 o1 = rocrand_log_normal_double2(&states, mean, std);
+            double2 o2 = rocrand_log_normal_double2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+}
+
+TEST(log_normal_distribution_tests, xorwow_test){
+    rocrand_state_xorwow states;
+    rocrand_init(123456, 654321, 0, &states);
+
+    StatesLND<float4> testFloat;
+    StatesLND<double4> testDouble;
+
+    #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
+        testFloat.run_test()(
+            [&] __host__ __device__ (float4 & output, float mean, float std){
+                output = {
+                    rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std),
+                    rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std)
+                };
+        )
+        testDouble.run_test()(
+            [&] __host__ __device__ (double4 & output, double mean, double std){
+                output = {
+                    rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std),
+                    rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std)
+                };
+            } 
+        )
+    #endif // ROCRAND_DETAIL_BM_NOT_IN_STATE
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            float2 o1 = rocrand_log_normal2(&states, mean, std);
+            float2 o2 = rocrand_log_normal2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            double2 o1 = rocrand_log_normal_double2(&states, mean, std);
+            double2 o2 = rocrand_log_normal_double2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+}
+
+TEST(log_normal_distribution_tests, sobol32_test){
+    rocrand_state_sobol32 states;
+    const unsigned int* directions;
+    HIP_CHECK(rocrand_get_direction_vectors32(&directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6));
+    rocrand_init(directions, 0, &states);
+
+    StatesLND<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            output = {
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std),
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std)
+            };
+        }
+    );
+
+    StatesLND<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            output = {
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std),
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std)
+            };
+        }
+    );
+}
+
+TEST(log_normal_distribution_tests, scrambled_sobol32_test){
+    rocrand_state_scrambled_sobol32 states;
+    const unsigned int* directions;
+    HIP_CHECK(rocrand_get_direction_vectors32(&directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6));
+    rocrand_init(directions, 123456, 0, &states);
+
+    StatesLND<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            output = {
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std),
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std)
+            };
+        }
+    );
+
+    StatesLND<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            output = {
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std),
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std)
+            };
+        }
+    );
+}
+
+TEST(log_normal_distribution_tests, sobol64_test){
+    rocrand_state_sobol64 states;
+    const unsigned long long* directions;
+    HIP_CHECK(rocrand_get_direction_vectors64(&directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6));
+    rocrand_init(directions, 0, &states);
+
+    StatesLND<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            output = {
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std),
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std)
+            };
+        }
+    );
+
+    StatesLND<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            output = {
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std),
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std)
+            };
+        }
+    );
+}
+
+TEST(log_normal_distribution_tests, scrambled_sobol64_test){
+    rocrand_state_scrambled_sobol64 states;
+    const unsigned long long* directions;
+    HIP_CHECK(rocrand_get_direction_vectors64(&directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6));
+    rocrand_init(directions, 123456, 0, &states);
+
+    StatesLND<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            output = {
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std),
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std)
+            };
+        }
+    );
+
+    StatesLND<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            output = {
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std),
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std)
+            };
+        }
+    );
+}
+
+TEST(log_normal_distribution_tests, lfsr113_test){
+    rocrand_state_lfsr113 states;
+    rocrand_init(static_cast<uint4>(12), 0, &states);
+
+    StatesLND<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            output = {
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std),
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std)
+            };
+        }
+    );
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            float2 o1 = rocrand_log_normal2(&states, mean, std);
+            float2 o2 = rocrand_log_normal2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+
+    StatesLND<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            output = {
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std),
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std)
+            };
+        }
+    );
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            double2 o1 = rocrand_log_normal_double2(&states, mean, std);
+            double2 o2 = rocrand_log_normal_double2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+}
+
+TEST(log_normal_distribution_tests, threefry2x32_20_test){
+    rocrand_state_threefry2x32_20 states;
+    rocrand_init(123456, 654321, 0, &states);
+
+    StatesLND<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            output = {
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std),
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std)
+            };
+        }
+    );
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            float2 o1 = rocrand_log_normal2(&states, mean, std);
+            float2 o2 = rocrand_log_normal2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+
+    StatesLND<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            output = {
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std),
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std)
+            };
+        }
+    );
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            double2 o1 = rocrand_log_normal_double2(&states, mean, std);
+            double2 o2 = rocrand_log_normal_double2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+}
+
+TEST(log_normal_distribution_tests, threefry2x64_20_test){
+    rocrand_state_threefry2x64_20 states;
+    rocrand_init(123456, 654321, 0, &states);
+
+    StatesLND<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            output = {
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std),
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std)
+            };
+        }
+    );
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            float2 o1 = rocrand_log_normal2(&states, mean, std);
+            float2 o2 = rocrand_log_normal2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+
+    StatesLND<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            output = {
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std),
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std)
+            };
+        }
+    );
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            double2 o1 = rocrand_log_normal_double2(&states, mean, std);
+            double2 o2 = rocrand_log_normal_double2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+}
+
+TEST(log_normal_distribution_tests, threefry4x32_20_test){
+    rocrand_state_threefry4x32_20 states;
+    rocrand_init(123456, 654321, 0, &states);
+
+    StatesLND<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            output = {
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std),
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std)
+            };
+        }
+    );
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            float2 o1 = rocrand_log_normal2(&states, mean, std);
+            float2 o2 = rocrand_log_normal2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+
+    StatesLND<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            output = {
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std),
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std)
+            };
+        }
+    );
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            double2 o1 = rocrand_log_normal_double2(&states, mean, std);
+            double2 o2 = rocrand_log_normal_double2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+}
+
+TEST(log_normal_distribution_tests, threefry4x64_20_test){
+    rocrand_state_threefry4x64_20 states;
+    rocrand_init(123456, 654321, 0, &states);
+
+    StatesLND<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            output = {
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std),
+                rocrand_log_normal(&states, mean, std), rocrand_log_normal(&states, mean, std)
+            };
+        }
+    );
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output, float mean, float std){
+            float2 o1 = rocrand_log_normal2(&states, mean, std);
+            float2 o2 = rocrand_log_normal2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+
+    StatesLND<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            output = {
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std),
+                rocrand_log_normal_double(&states, mean, std), rocrand_log_normal_double(&states, mean, std)
+            };
+        }
+    );
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output, double mean, double std){
+            double2 o1 = rocrand_log_normal_double2(&states, mean, std);
+            double2 o2 = rocrand_log_normal_double2(&states, mean, std);
+            output = {
+                o1.x, o2.x, o1.y, o2.y
+            };
+        }
+    );
+}
+
+template <typename T, typename pType, typename LNDFunction>
+__global__ void mtgp32_kernel (rocrand_state_mtgp32 * states, T * output, const size_t N, const pType mean, const pType std ,const LNDFunction & f){
+    const unsigned int state_id  = blockIdx.x;
+    const unsigned int thread_id = threadIdx.x;
+    unsigned int       index     = blockIdx.x * blockDim.x + threadIdx.x;
+    
+    if(index >= N)
+        return;
+
+    __shared__ rocrand_state_mtgp32 state;
+    if(thread_id == 0)
+        state = states[state_id];
+    __syncthreads();
+
+    output[index] = f(&state, mean, std);
+
+    if(thread_id == 0)
+        states[state_id] = state; 
+}
+
+TEST(log_normal_distribution_tests, float_mtgp32_test){
+    size_t testSize = 40192;
+    size_t threads = 256;
+    size_t blocks = std::ceil(static_cast<double>(testSize) / static_cast<double>(threads));
+
+    float iMean = 0;
+    float iStd = 1;
+
+    rocrand_state_mtgp32 * states;
+    size_t state_size = blocks, seed = 654321;
+    HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32)));
+    rocrand_make_state_mtgp32(states,  mtgp32dc_params_fast_11213, state_size, seed);
+
+    float * hOut = new float[testSize];
+    float * dOut;
+    HIP_CHECK(hipMalloc(&dOut, sizeof(float) * testSize));
+    HIP_CHECK(hipDeviceSynchronize());
+
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(mtgp32_kernel<float, float>),
+        dim3(blocks),
+        dim3(threads),
+        0,
+        0,
+        states,
+        dOut,
+        testSize,
+        iMean,
+        iStd,
+
+        [] __device__ (rocrand_state_mtgp32 * state, float mean, float std){
+            return rocrand_log_normal(state, mean, std);
+        }
+    );
+
+    HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(float) * testSize, hipMemcpyDeviceToHost));
+
+    double mean = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        mean += hOut[i];
+    
+
+    mean /= testSize;
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        std += std::pow(hOut[i] - mean, 2);
+
+    std = std::sqrt(std / testSize);
+
+    double eMean = std::exp(iMean + (iStd * iStd) / 2);
+    double eStd = std::sqrt(std::log(1 + (iStd * iStd)/(iMean * iMean)));
+
+    ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1;
+    ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1;
+
+    HIP_CHECK(hipFree(states));
+    HIP_CHECK(hipFree(dOut));
+
+    delete [] hOut;
+}
+
+TEST(log_normal_distribution_tests, float2_mtgp32_test){
+    size_t testSize = 40192;
+    size_t threads = 256;
+    size_t blocks = std::ceil(static_cast<double>(testSize) / static_cast<double>(threads));
+
+    float iMean = 0;
+    float iStd = 1;
+
+    rocrand_state_mtgp32 * states;
+    size_t state_size = blocks, seed = 654321;
+    HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32)));
+    rocrand_make_state_mtgp32(states,  mtgp32dc_params_fast_11213, state_size, seed);
+
+    float2 * hOut = new float2[testSize];
+    float2 * dOut;
+    HIP_CHECK(hipMalloc(&dOut, sizeof(float2) * testSize));
+    HIP_CHECK(hipDeviceSynchronize());
+
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(mtgp32_kernel<float2, float>),
+        dim3(blocks),
+        dim3(threads),
+        0,
+        0,
+        states,
+        dOut,
+        testSize,
+        iMean,
+        iStd,
+        [] __device__ (rocrand_state_mtgp32 * state, float mean, float std){
+            return rocrand_log_normal2(state, mean, std);
+        }
+    );
+
+    HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(float2) * testSize, hipMemcpyDeviceToHost));
+
+    double mean = 0.0;
+    for(size_t i = 0; i < testSize; i++){
+        mean += hOut[i].x;
+        mean += hOut[i].y;
+    }
+    
+
+    mean /= (testSize * 2);
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++){
+        std += std::pow(hOut[i].x - mean, 2);
+        std += std::pow(hOut[i].y - mean, 2);
+    }
+
+    std = std::sqrt(std / (testSize * 2));
+
+    double eMean = std::exp(iMean + (iStd * iStd) / 2);
+    double eStd = std::sqrt(std::log(1 + (iStd * iStd)/(iMean * iMean)));
+
+    ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1;
+    ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1;
+
+    HIP_CHECK(hipFree(states));
+    HIP_CHECK(hipFree(dOut));
+
+    delete [] hOut;
+}
+
+TEST(log_normal_distribution_tests, double_mtgp32_test){
+    size_t testSize = 40192;
+    size_t threads = 256;
+    size_t blocks = std::ceil(static_cast<double>(testSize) / static_cast<double>(threads));
+
+    double iMean = 0;
+    double iStd = 1;
+
+    rocrand_state_mtgp32 * states;
+    size_t state_size = blocks, seed = 654321;
+    HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32)));
+    rocrand_make_state_mtgp32(states,  mtgp32dc_params_fast_11213, state_size, seed);
+
+    double * hOut = new double[testSize];
+    double * dOut;
+    HIP_CHECK(hipMalloc(&dOut, sizeof(double) * testSize));
+    HIP_CHECK(hipDeviceSynchronize());
+
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(mtgp32_kernel<double, double>),
+        dim3(blocks),
+        dim3(threads),
+        0,
+        0,
+        states,
+        dOut,
+        testSize,
+        iMean,
+        iStd,
+
+        [] __device__ (rocrand_state_mtgp32 * state, double mean, double std){
+            return rocrand_log_normal_double(state, mean, std);
+        }
+    );
+
+    HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(double) * testSize, hipMemcpyDeviceToHost));
+
+    double mean = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        mean += hOut[i];
+    
+
+    mean /= testSize;
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        std += std::pow(hOut[i] - mean, 2);
+
+    std = std::sqrt(std / testSize);
+
+    double eMean = std::exp(iMean + (iStd * iStd) / 2);
+    double eStd = std::sqrt(std::log(1 + (iStd * iStd)/(iMean * iMean)));
+
+    ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1;
+    ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1;
+
+    HIP_CHECK(hipFree(states));
+    HIP_CHECK(hipFree(dOut));
+
+    delete [] hOut;
+}
+
+TEST(log_normal_distribution_tests, double2_mtgp32_test){
+    size_t testSize = 40192;
+    size_t threads = 256;
+    size_t blocks = std::ceil(static_cast<double>(testSize) / static_cast<double>(threads));
+
+    double iMean = 0;
+    double iStd = 1;
+
+    rocrand_state_mtgp32 * states;
+    size_t state_size = blocks, seed = 654321;
+    HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32)));
+    rocrand_make_state_mtgp32(states,  mtgp32dc_params_fast_11213, state_size, seed);
+
+    double2 * hOut = new double2[testSize];
+    double2 * dOut;
+    HIP_CHECK(hipMalloc(&dOut, sizeof(double2) * testSize));
+    HIP_CHECK(hipDeviceSynchronize());
+
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(mtgp32_kernel<double2, double>),
+        dim3(blocks),
+        dim3(threads),
+        0,
+        0,
+        states,
+        dOut,
+        testSize,
+        iMean,
+        iStd,
+        [] __device__ (rocrand_state_mtgp32 * state, double mean, double std){
+            return rocrand_log_normal_double2(state, mean, std);
+        }
+    );
+
+    HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(double2) * testSize, hipMemcpyDeviceToHost));
+
+    double mean = 0.0;
+    for(size_t i = 0; i < testSize; i++){
+        mean += hOut[i].x;
+        mean += hOut[i].y;
+    }
+    
+
+    mean /= (testSize * 2);
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++){
+        std += std::pow(hOut[i].x - mean, 2);
+        std += std::pow(hOut[i].y - mean, 2);
+    }
+
+    std = std::sqrt(std / (testSize * 2));
+
+    double eMean = std::exp(iMean + (iStd * iStd) / 2);
+    double eStd = std::sqrt(std::log(1 + (iStd * iStd)/(iMean * iMean)));
+
+    ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1;
+    ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1;
+
+    HIP_CHECK(hipFree(states));
+    HIP_CHECK(hipFree(dOut));
+
+    delete [] hOut;
+}
diff --git a/test/internal/test_normal_distribution.cpp b/test/internal/test_normal_distribution.cpp
index b1226fd64..f7fb8b46e 100644
--- a/test/internal/test_normal_distribution.cpp
+++ b/test/internal/test_normal_distribution.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -24,7 +24,9 @@
 #include <random>
 
 #include <rng/distribution/normal.hpp>
+#include <rocrand/rocrand_mtgp32_11213.h>
 
+#define HIP_CHECK(state) ASSERT_EQ(state, hipSuccess)
 using namespace rocrand_impl::host;
 
 TEST(normal_distribution_tests, float_test)
@@ -103,6 +105,215 @@ TEST(normal_distribution_tests, double_test)
     EXPECT_NEAR(5.0, std, 1.0); // 20%
 }
 
+TEST(normal_distribution_tests, float_out_uint2_in_test)
+{
+
+    struct nd
+    {
+        const float mean;
+        const float stddev;
+
+        nd(float mean, float stddev) : mean(mean), stddev(stddev) {}
+
+        __forceinline__ __host__ __device__
+        void operator()(const uint2 &input, float (&output)[2]) const
+        {
+            float2 v  = rocrand_device::detail::normal_distribution2(input);
+            output[0] = mean + v.x * stddev;
+            output[1] = mean + v.y * stddev;
+        }
+    };
+
+    std::random_device                          rd;
+    std::mt19937                                gen(rd());
+    std::uniform_int_distribution<unsigned int> dis;
+
+    const size_t size = 4000;
+    float        val[size];
+    nd           u(2.0, 5.0);
+
+    // Calculate mean
+    double mean = 0.0;
+    for(size_t i = 0; i < size; i += 2)
+    {
+        uint2     input;
+        float     output[2];
+        input.x = dis(gen);
+        input.y = dis(gen);
+        u(input, output);
+        val[i]     = output[0];
+        val[i + 1] = output[1];
+        mean += (output[0] + output[1]);
+    }
+    mean /= size;
+
+    // Calculate stddev
+    double std = 0.0;
+    for(size_t i = 0; i < size; i++)
+    {
+        std += std::pow(val[i] - mean, 2);
+    }
+    std = std::sqrt(std / size);
+
+    EXPECT_NEAR(2.0, mean, 0.4); // 20%
+    EXPECT_NEAR(5.0, std, 1.0); // 20%
+}
+
+
+TEST(normal_distribution_tests, float2_out_longlong_in_test)
+{
+    
+    std::random_device                          rd;
+    std::mt19937                                gen(rd());
+    std::uniform_int_distribution<unsigned int> dis;
+
+    const size_t size = 4000;
+    float        val[size];
+    normal_distribution<float, unsigned long long> u(2, 5);
+    
+    // Calculate mean
+    double mean = 0.0;
+    for(size_t i = 0; i < size; i += 2)
+    {
+        unsigned long long input[1];
+        float    output[2];
+        unsigned long long l = static_cast<unsigned long long>(dis(gen));
+        unsigned long long r = static_cast<unsigned long long>(dis(gen));
+        input[0] = (l << 32) | r;
+        u(input, output);
+        val[i]     = output[0];
+        val[i + 1] = output[1];
+        mean += (output[0] + output[1]);
+    }
+    mean /= size;
+    
+    // Calculate stddev
+    double std = 0.0;
+    for(size_t i = 0; i < size; i++)
+    {
+        std += std::pow(val[i] - mean, 2) / size;
+    }
+    std = std::sqrt(std);
+    
+    EXPECT_NEAR(2, mean, 0.4) << "Mean: " << mean << " Expected: " << 2; // 20%
+    EXPECT_NEAR(5, std, 1.0) <<  "Stddev: " << std << " Expected: " << 5; // 20%
+}
+
+TEST(normal_distribution_tests, float4_outputs){
+    struct normal_distribution_float_4_out{
+        const float mean;
+        const float stddev;
+
+        normal_distribution_float_4_out(float mean, float stddev) : mean(mean), stddev(stddev) {}
+
+        __forceinline__ __host__ __device__
+        void uint4_in(const uint4 &input, float (&output)[4]) const
+        {
+            float4 v  = rocrand_device::detail::normal_distribution4(input);
+            output[0] = mean + v.w * stddev; 
+            output[1] = mean + v.x * stddev;
+            output[2] = mean + v.y * stddev;
+            output[3] = mean + v.z * stddev;
+        }
+
+        __forceinline__ __host__ __device__
+        void longlong2_in(const longlong2 &input, float (&output)[4]) const
+        {
+            float4 v  = rocrand_device::detail::normal_distribution4(input);
+            output[0] = mean + v.w * stddev; 
+            output[1] = mean + v.x * stddev;
+            output[2] = mean + v.y * stddev;
+            output[3] = mean + v.z * stddev;
+        }
+
+        __forceinline__ __host__ __device__
+        void ull_2_in(const unsigned long long (&input)[2], float (&output)[4]) const
+        {
+            float4 v  = rocrand_device::detail::normal_distribution4(input[0], input[1]);
+            output[0] = mean + v.w * stddev; 
+            output[1] = mean + v.x * stddev;
+            output[2] = mean + v.y * stddev;
+            output[3] = mean + v.z * stddev;
+        }
+    };
+    
+    std::random_device                          rd;
+    std::mt19937                                gen(rd());
+    std::uniform_int_distribution<unsigned int> dis;
+
+    const size_t size = 4000;
+    float        vull2[size], vll2[size], vui4[size];
+    normal_distribution_float_4_out u(2, 5);
+
+    // Calculate mean
+    double mui4 = 0, mll2 = 0, mull2 = 0;
+    for(size_t i = 0; i < size; i += 4)
+    {
+        unsigned long long ull2[2];
+        longlong2          ll2;
+        uint4              ui4;
+        float     output[4];
+        unsigned long long l = static_cast<unsigned long long>(dis(gen));
+        unsigned long long r = static_cast<unsigned long long>(dis(gen));
+        
+        unsigned long long in1 = (l << 32) | r;
+    
+        ull2[0] = in1;
+        ll2.x = in1;
+        ui4.w = l; ui4.x = r;
+
+        l = static_cast<unsigned long long>(dis(gen));
+        r = static_cast<unsigned long long>(dis(gen));
+        unsigned long long in2 = (l << 32) | r;
+
+        ull2[1] = in2;
+        ll2.y = in2;
+        ui4.y = l; ui4.z = r;
+
+        u.uint4_in(ui4, output);
+        vui4[i] = output[0]; 
+        vui4[i + 1] = output[1]; 
+        vui4[i + 2] = output[2]; 
+        vui4[i + 3] = output[3];
+        mui4 += (output[0] + output[1] + output[2] + output[3]) / size;
+
+        u.longlong2_in(ll2, output);
+        vll2[i] = output[0]; 
+        vll2[i + 1] = output[1]; 
+        vll2[i + 2] = output[2]; 
+        vll2[i + 3] = output[3];
+        mll2 += (output[0] + output[1] + output[2] + output[3]) / size;
+
+        u.ull_2_in(ull2, output);
+        vull2[i] = output[0]; 
+        vull2[i + 1] = output[1]; 
+        vull2[i + 2] = output[2]; 
+        vull2[i + 3] = output[3];
+        mull2 += (output[0] + output[1] + output[2] + output[3]) / size;
+    }
+
+    double sui4 = 0, sll2 = 0, sull2 = 0;
+    for(size_t i = 0; i < size; i++)
+    {
+        sui4 += std::pow(vui4[i] - mui4, 2);
+        sll2 += std::pow(vll2[i] - mll2, 2);
+        sull2 += std::pow(vull2[i] - mull2, 2);
+    }
+
+    sui4 = sqrt(sui4 / size);
+    sll2 = sqrt(sll2 / size);
+    sull2 = sqrt(sull2 / size);
+
+    EXPECT_NEAR(2, mui4, 0.4) << "Mean: " << mui4 << " Expected: " << 2; // 20%
+    EXPECT_NEAR(5, sui4, 1.0) <<  "Stddev: " << sui4 << " Expected: " << 5; // 20%
+
+    EXPECT_NEAR(2, mll2, 0.4) << "Mean: " << mll2 << " Expected: " << 2; // 20%
+    EXPECT_NEAR(5, sll2, 1.0) <<  "Stddev: " << sll2 << " Expected: " << 5; // 20%
+
+    EXPECT_NEAR(2, mull2, 0.4) << "Mean: " << mull2 << " Expected: " << 2; // 20%
+    EXPECT_NEAR(5, sull2, 1.0) <<  "Stddev: " << sull2 << " Expected: " << 5; // 20%
+}
+
 TEST(normal_distribution_tests, half_test)
 {
     std::random_device                          rd;
@@ -375,3 +586,552 @@ TEST(sobol_normal_distribution_tests, half_test)
     EXPECT_NEAR(2.0f, mean, 0.4f); // 20%
     EXPECT_NEAR(5.0f, std, 1.0f); // 20%
 }
+
+template <typename State>
+void run_normal_dist_with_state_out1(State * states){
+    double expected_mean = 2.0f, expected_std = 5.0f;
+
+    struct single_out{
+        double mean, std;
+        State * states;
+        
+        single_out(double mean, double std, State * states){
+            this->mean = mean;
+            this->std = std;
+            this->states = states;
+        }
+
+        float __forceinline__ __device__ __host__ fOp(){
+            return mean + rocrand_normal(states) * std;
+        }
+
+        double __forceinline__ __device__ __host__ dOp(){
+            return mean + rocrand_normal_double(states) * std;
+        }
+    };
+
+    const size_t size = 4000;
+
+    float fOut[size];
+    double dOut[size];
+
+    single_out s(expected_mean, expected_std, states);
+
+    float fMean = 0, fStd = 0;
+    double dMean = 0, dStd = 0;
+
+    for(size_t i = 0; i < size; i++){
+        fOut[i] = s.fOp();
+        dOut[i] = s.dOp();
+
+        fMean += fOut[i] / size;
+        dMean += dOut[i] / size;
+    }
+    
+    for(size_t i = 0; i < size; i++){
+        fStd += std::pow(fOut[i] - fMean, 2) / size;
+        dStd += std::pow(dOut[i] - dMean, 2) / size;
+    }
+
+    fStd = std::sqrt(fStd);
+    dStd = std::sqrt(dStd);
+
+    EXPECT_NEAR(expected_mean, fMean, (expected_mean * 0.2) + 1e-1) << "Mean: " << fMean << " Expected: " << expected_mean; // 20%
+    EXPECT_NEAR(expected_std, fStd, (expected_std * 0.2) + 1e-1) <<  "Stddev: " << fStd << " Expected: " << expected_std; // 20%
+
+    EXPECT_NEAR(expected_mean, dMean, (expected_mean * 0.2) + 1e-1) << "Mean: " << dMean << " Expected: " << expected_mean; // 20%
+    EXPECT_NEAR(expected_std, dStd, (expected_std * 0.2) + 1e-1) <<  "Stddev: " << dStd << " Expected: " << expected_std; // 20%
+}
+
+template <typename State>
+void run_normal_dist_with_state_out2(State * states){
+    double expected_mean = 2.0f, expected_std = 5.0f;
+    struct nd{
+        double mean, std;
+        State * states;
+        nd(double  mean, double stddev, State * states) {
+            this->mean = mean;
+            this->std = stddev;
+            this->states = states;
+        }
+        __forceinline__ __host__ __device__ void operator()(float(&output)[2]){
+            float2 v = rocrand_normal2(states);
+            output[0] = static_cast<float>(mean) + v.x * static_cast<float>(std);
+            output[1] = static_cast<float>(mean) + v.y * static_cast<float>(std);
+        }
+        
+        __forceinline__ __host__ __device__ void operator()(double(&output)[2]){
+            double2 v = rocrand_normal_double2(states);
+            output[0] = mean + v.x * std;
+            output[1] = mean + v.y * std;
+        }
+    };
+    const size_t size = 4000;
+    float valF[size];
+    double valD[size];
+    nd u(expected_mean, expected_std, states);
+
+    double meanF = 0, meanD = 0;
+    for(size_t i = 0; i < size; i += 2)
+    {
+        float fOut[2];
+        double dOut[2];
+        
+        u(fOut);
+        valF[i]     = fOut[0];
+        valF[i + 1] = fOut[1];
+        meanF += (fOut[0] + fOut[1]) / size;
+
+        u(dOut);
+        valD[i]     = dOut[0];
+        valD[i + 1] = dOut[1];
+        meanD += (dOut[0] + dOut[1]) / size;
+    }
+
+    // Calculate stddev
+    double stdF = 0, stdD = 0;
+    for(size_t i = 0; i < size; i++){
+        stdF += std::pow(valF[i] - meanF, 2) / size;
+        stdD += std::pow(valD[i] - meanD, 2) / size;
+    }
+    stdF = std::sqrt(stdF);
+    stdD = std::sqrt(stdD);
+
+    EXPECT_NEAR(expected_mean, meanF, (expected_mean * 0.2) + 1e-1) << "Mean: " << meanF << " Expected: " << expected_mean; // 20%
+    EXPECT_NEAR(expected_std, stdF, (expected_std * 0.2) + 1e-1) <<  "Stddev: " << stdF << " Expected: " << expected_std; // 20%
+
+    EXPECT_NEAR(expected_mean, meanD, (expected_mean * 0.2) + 1e-1) << "Mean: " << meanD << " Expected: " << expected_mean; // 20%
+    EXPECT_NEAR(expected_std, stdD, (expected_std * 0.2) + 1e-1) <<  "Stddev: " << stdD << " Expected: " << expected_std; // 20%
+}
+
+template <typename State>
+void run_normal_dist_with_state_out4(State * states){
+    double expected_mean = 2.0f, expected_std = 5.0f;
+    
+    struct nd{
+        double mean, std;
+        State * states;
+        nd(double  mean, double stddev, State * states) {
+            this->mean = mean;
+            this->std = stddev;
+            this->states = states;
+        }
+        __forceinline__ __host__ __device__ void operator()(float(&output)[4]){
+            float4 v = rocrand_normal4(states);
+            output[0] = static_cast<float>(mean) + v.w * static_cast<float>(std);
+            output[1] = static_cast<float>(mean) + v.x * static_cast<float>(std);
+            output[2] = static_cast<float>(mean) + v.y * static_cast<float>(std);
+            output[3] = static_cast<float>(mean) + v.z * static_cast<float>(std);
+        }
+        
+        __forceinline__ __host__ __device__ void operator()(double(&output)[4]){
+            double4 v = rocrand_normal_double4(states);
+            output[0] = mean + v.w * std;
+            output[1] = mean + v.x * std;
+            output[2] = mean + v.y * std;
+            output[3] = mean + v.z * std;
+        }
+    };
+    const size_t size = 4000;
+    float valF[size];
+    double valD[size];
+
+    nd u(expected_mean, expected_std, states);
+
+    double meanF = 0, meanD = 0;
+    for(size_t i = 0; i < size; i += 4)
+    {
+        float fOut[4];
+        double dOut[4];
+        
+        u(fOut);
+        valF[i]     = fOut[0];
+        valF[i + 1] = fOut[1];
+        valF[i + 2] = fOut[2];
+        valF[i + 3] = fOut[3];
+        meanF += (fOut[0] + fOut[1] + fOut[2] + fOut[3]) / size;
+
+        u(dOut);
+        valD[i]     = dOut[0];
+        valD[i + 1] = dOut[1];
+        valD[i + 2] = dOut[2];
+        valD[i + 3] = dOut[3];
+        meanD += (dOut[0] + dOut[1] + dOut[2] + dOut[3]) / size;
+    }
+
+    // Calculate stddev
+    double stdF = 0, stdD = 0;
+    for(size_t i = 0; i < size; i++){
+        stdF += std::pow(valF[i] - meanF, 2) / size;
+        stdD += std::pow(valD[i] - meanD, 2) / size;
+    }
+    stdF = std::sqrt(stdF);
+    stdD = std::sqrt(stdD);
+
+    EXPECT_NEAR(expected_mean, meanF, (expected_mean * 0.2) + 1e-2) << "Mean: " << meanF << " Expected: " << expected_mean; // 20%
+    EXPECT_NEAR(expected_std, stdF, (expected_std * 0.2) + 1e-2) <<  "Stddev: " << stdF << " Expected: " << expected_std; // 20%
+
+    EXPECT_NEAR(expected_mean, meanD, (expected_mean * 0.2) + 1e-2) << "Mean: " << meanD << " Expected: " << expected_mean; // 20%
+    EXPECT_NEAR(expected_std, stdD, (expected_std * 0.2) + 1e-2) <<  "Stddev: " << stdD << " Expected: " << expected_std; // 20%
+}
+    
+TEST(normal_distribution_with_states, philox4x32_10){
+    rocrand_state_philox4x32_10 states;
+    rocrand_init(123456, 654321, 0, &states);
+    run_normal_dist_with_state_out2<rocrand_state_philox4x32_10>(&states);
+    run_normal_dist_with_state_out4<rocrand_state_philox4x32_10>(&states);
+}
+
+TEST(normal_distribution_with_states, mrg31k3p){
+    rocrand_state_mrg31k3p states;
+    rocrand_init(123456, 654321, 0, &states);
+    run_normal_dist_with_state_out2<rocrand_state_mrg31k3p>(&states);
+}
+
+TEST(normal_distribution_with_states, mrg32k3a){
+    rocrand_state_mrg32k3a states;
+    rocrand_init(123456, 654321, 0, &states);
+    run_normal_dist_with_state_out2<rocrand_state_mrg32k3a>(&states);
+}
+
+TEST(normal_distribution_with_states, xorwow){
+    rocrand_state_xorwow states;
+    rocrand_init(123456, 654321, 0, &states);
+    run_normal_dist_with_state_out2<rocrand_state_xorwow>(&states);
+}
+
+TEST(normal_distribution_with_states, sobol32){
+    rocrand_state_sobol32 states;
+    const unsigned int* directions;
+    HIP_CHECK(rocrand_get_direction_vectors32(&directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6));
+
+    rocrand_init(directions, 0, &states);
+
+    run_normal_dist_with_state_out1<rocrand_state_sobol32>(&states);
+
+}
+
+TEST(normal_distribution_with_states, scarambled_sobol32){
+    rocrand_state_scrambled_sobol32 states;
+    const unsigned int* directions;
+    HIP_CHECK(rocrand_get_direction_vectors32(&directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6));
+
+    rocrand_init(directions, 123456, 0, &states);
+
+    run_normal_dist_with_state_out1<rocrand_state_scrambled_sobol32>(&states);
+}
+
+TEST(normal_distribution_with_states, sobol64){
+    rocrand_state_sobol64 states;
+    const unsigned long long* directions;
+    HIP_CHECK(rocrand_get_direction_vectors64(&directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6));
+
+    rocrand_init(directions, 0, &states);
+
+    run_normal_dist_with_state_out1<rocrand_state_sobol64>(&states);
+
+}
+
+TEST(normal_distribution_with_states, scarambled_sobol64){
+    rocrand_state_scrambled_sobol64 states;
+    const unsigned long long* directions;
+    HIP_CHECK(rocrand_get_direction_vectors64(&directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6));
+
+    rocrand_init(directions, 123456, 0, &states);
+
+    run_normal_dist_with_state_out1<rocrand_state_scrambled_sobol64>(&states);
+}
+
+
+TEST(normal_distribution_with_states, lfsr113){
+    rocrand_state_lfsr113 states;
+    rocrand_init(static_cast<uint4>(12), 0, &states);
+
+    run_normal_dist_with_state_out1<rocrand_state_lfsr113>(&states);
+    run_normal_dist_with_state_out2<rocrand_state_lfsr113>(&states);
+}
+
+TEST(normal_distribution_with_states, threefry2x32_20){
+    rocrand_state_threefry2x32_20 states;
+    rocrand_init(123456, 654321, 0, & states);
+
+    run_normal_dist_with_state_out1<rocrand_state_threefry2x32_20>(&states);
+    run_normal_dist_with_state_out2<rocrand_state_threefry2x32_20>(&states);
+}
+
+TEST(normal_distribution_with_states, threefry2x64_20){
+    rocrand_state_threefry2x64_20 states;
+    rocrand_init(123456, 654321, 0, & states);
+
+    run_normal_dist_with_state_out1<rocrand_state_threefry2x64_20>(&states);
+    run_normal_dist_with_state_out2<rocrand_state_threefry2x64_20>(&states);
+}
+
+TEST(normal_distribution_with_states, rocrand_state_threefry4x32_20){
+    rocrand_state_threefry4x32_20 states;
+    rocrand_init(123456, 654321, 0, & states);
+
+    run_normal_dist_with_state_out1<rocrand_state_threefry4x32_20>(&states);
+    run_normal_dist_with_state_out2<rocrand_state_threefry4x32_20>(&states);
+}
+
+TEST(normal_distribution_with_states, rocrand_state_threefry4x64_20){
+    rocrand_state_threefry4x64_20 states;
+    rocrand_init(123456, 654321, 0, & states);
+
+    run_normal_dist_with_state_out1<rocrand_state_threefry4x64_20>(&states);
+    run_normal_dist_with_state_out2<rocrand_state_threefry4x64_20>(&states);
+}
+
+template <typename T, typename UDFunction>
+__global__ void mtgp32_kernel (rocrand_state_mtgp32 * states, T * output, const size_t N, const UDFunction & f){
+    const unsigned int state_id  = blockIdx.x;
+    const unsigned int thread_id = threadIdx.x;
+    unsigned int       index     = blockIdx.x * blockDim.x + threadIdx.x;
+    
+    if(index >= N)
+        return;
+
+    __shared__ rocrand_state_mtgp32 state;
+    if(thread_id == 0)
+        state = states[state_id];
+    __syncthreads();
+
+    // output[index] = rocrand_uniform(&state);
+    output[index] = f(&state);
+
+    if(thread_id == 0)
+        states[state_id] = state; 
+}
+
+TEST(normal_distribution_with_states, float_mtgp32){
+
+    size_t testSize = 40192;
+    size_t threads = 256;
+    size_t blocks = std::ceil(static_cast<double>(testSize) / static_cast<double>(threads));
+
+    rocrand_state_mtgp32 * states;
+    size_t state_size = blocks, seed = 654321;
+    HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32)));
+    rocrand_make_state_mtgp32(states,  mtgp32dc_params_fast_11213, state_size, seed);
+
+    float * hOut = new float[testSize];
+    float * dOut;
+    HIP_CHECK(hipMalloc(&dOut, sizeof(float) * testSize));
+    HIP_CHECK(hipDeviceSynchronize());
+
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(mtgp32_kernel<float>),
+        dim3(blocks),
+        dim3(threads),
+        0,
+        0,
+        states,
+        dOut,
+        testSize,
+        [] __device__ (rocrand_state_mtgp32 * state){
+            return rocrand_normal(state);
+        }
+    );
+
+    HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(float) * testSize, hipMemcpyDeviceToHost));
+
+    double mean = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        mean += hOut[i];
+    
+
+    mean /= testSize;
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        std += std::pow(hOut[i] - mean, 2);
+
+    std = std::sqrt(std / testSize);
+
+    double eMean = 0;
+    double eStd = 1;
+
+    ASSERT_NEAR(mean, eMean, 0.01) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << 0.01;
+    ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1;
+
+    HIP_CHECK(hipFree(states));
+    HIP_CHECK(hipFree(dOut));
+
+    delete [] hOut;
+}
+
+TEST(normal_distribution_with_states, float2_mtgp32){
+
+    size_t testSize = 20224;
+    size_t threads = 256;
+    size_t blocks = std::ceil(static_cast<double>(testSize) / static_cast<double>(threads));
+
+    rocrand_state_mtgp32 * states;
+    size_t state_size = blocks, seed = 654321;
+    HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32)));
+    rocrand_make_state_mtgp32(states,  mtgp32dc_params_fast_11213, state_size, seed);
+
+    float2 * hOut = new float2[testSize];
+    float2 * dOut;
+    HIP_CHECK(hipMalloc(&dOut, sizeof(float2) * testSize));
+    HIP_CHECK(hipDeviceSynchronize());
+
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(mtgp32_kernel<float2>),
+        dim3(blocks),
+        dim3(threads),
+        0,
+        0,
+        states,
+        dOut,
+        testSize,
+        [] __device__ (rocrand_state_mtgp32 * state){
+            return rocrand_normal2(state);
+        }
+    );
+
+    HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(float2) * testSize, hipMemcpyDeviceToHost));
+
+    double mean = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        mean += hOut[i].x + hOut[i].y;
+    
+
+    mean /= (testSize * 2);
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        std += std::pow(hOut[i].x - mean, 2) + std::pow(hOut[i].y - mean, 2);
+
+    std = std::sqrt(std / (testSize * 2));
+
+    double eMean = 0;
+    double eStd = 1;
+
+    ASSERT_NEAR(mean, eMean, 0.01) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << 0.01;
+    ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1;
+
+    HIP_CHECK(hipFree(states));
+    HIP_CHECK(hipFree(dOut));
+
+    delete [] hOut;
+}
+
+TEST(normal_distribution_with_states, double_mtgp32){
+
+    size_t testSize = 40192;
+    size_t threads = 256;
+    size_t blocks = std::ceil(static_cast<double>(testSize) / static_cast<double>(threads));
+
+    rocrand_state_mtgp32 * states;
+    size_t state_size = blocks, seed = 654321;
+    HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32)));
+    rocrand_make_state_mtgp32(states,  mtgp32dc_params_fast_11213, state_size, seed);
+
+    double * hOut = new double[testSize];
+    double * dOut;
+    HIP_CHECK(hipMalloc(&dOut, sizeof(double) * testSize));
+    HIP_CHECK(hipDeviceSynchronize());
+
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(mtgp32_kernel<double>),
+        dim3(blocks),
+        dim3(threads),
+        0,
+        0,
+        states,
+        dOut,
+        testSize,
+        [] __device__ (rocrand_state_mtgp32 * state){
+            return rocrand_normal_double(state);
+        }
+    );
+
+    HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(double) * testSize, hipMemcpyDeviceToHost));
+
+    double mean = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        mean += hOut[i];
+    
+
+    mean /= testSize;
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        std += std::pow(hOut[i] - mean, 2);
+
+    std = std::sqrt(std / testSize);
+
+    double eMean = 0;
+    double eStd = 1;
+
+    ASSERT_NEAR(mean, eMean, 0.01) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << 0.01;
+    ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1;
+
+    HIP_CHECK(hipFree(states));
+    HIP_CHECK(hipFree(dOut));
+
+    delete [] hOut;
+}
+
+TEST(normal_distribution_with_states, double2_mtgp32){
+
+    size_t testSize = 20224;
+    size_t threads = 256;
+    size_t blocks = std::ceil(static_cast<double>(testSize) / static_cast<double>(threads));
+
+    rocrand_state_mtgp32 * states;
+    size_t state_size = blocks, seed = 654321;
+    HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32)));
+    rocrand_make_state_mtgp32(states,  mtgp32dc_params_fast_11213, state_size, seed);
+
+    double2 * hOut = new double2[testSize];
+    double2 * dOut;
+    HIP_CHECK(hipMalloc(&dOut, sizeof(double2) * testSize));
+    HIP_CHECK(hipDeviceSynchronize());
+
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(mtgp32_kernel<double2>),
+        dim3(blocks),
+        dim3(threads),
+        0,
+        0,
+        states,
+        dOut,
+        testSize,
+        [] __device__ (rocrand_state_mtgp32 * state){
+            return rocrand_normal_double2(state);
+        }
+    );
+
+    HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(double2) * testSize, hipMemcpyDeviceToHost));
+
+    double mean = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        mean += hOut[i].x + hOut[i].y;
+    
+
+    mean /= (testSize * 2);
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        std += std::pow(hOut[i].x - mean, 2) + std::pow(hOut[i].y - mean, 2);
+
+    std = std::sqrt(std / (testSize * 2));
+
+    double eMean = 0;
+    double eStd = 1;
+
+    ASSERT_NEAR(mean, eMean, 0.01) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << 0.01;
+    ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1;
+
+    HIP_CHECK(hipFree(states));
+    HIP_CHECK(hipFree(dOut));
+
+    delete [] hOut;
+}
diff --git a/test/internal/test_rocrand_threefry2x32_20_prng.cpp b/test/internal/test_rocrand_threefry2x32_20_prng.cpp
index fa6667e41..478df90bc 100644
--- a/test/internal/test_rocrand_threefry2x32_20_prng.cpp
+++ b/test/internal/test_rocrand_threefry2x32_20_prng.cpp
@@ -161,3 +161,93 @@ TEST(threefry_prng_state_tests, discard_sequence_test)
     EXPECT_EQ(state.counter.y, 457U);
     EXPECT_EQ(state.substate, 0U);
 }
+
+TEST(threefry_additional_tests, rocrand_init_test)
+{
+    // making sure the outputs are the same when initialized with same parameters
+    rocrand_state_threefry2x32_20 state1, state2;
+
+    using ull = unsigned long long;
+
+    ull seeds[] = {0, 123, 321, 123456, 654321};
+    ull subsequences[] = {0xf, 0xff, 0x1f, 0x1ff, 0x1f1};
+    ull offsets[] = {0, 1, 2, 3, 4};
+
+    for(int i = 0; i < 5; i++){
+        rocrand_init(seeds[i], subsequences[i], offsets[i], &state1);
+        rocrand_init(seeds[i], subsequences[i], offsets[i], &state2);
+
+        for(int j = 0; j < 5000; j++)
+            ASSERT_EQ(rocrand(&state1), rocrand(&state2));
+    }
+}
+
+TEST(threefry_additional_tests, rocrand_test)
+{
+    // making sure the outputs are uniformly distributed!
+    rocrand_state_threefry2x32_20 state;
+
+    rocrand_init(0, 0, 0, &state);
+    size_t testSize = 40000;
+
+    unsigned int * output = new unsigned int[testSize];
+
+    double mean = 0;
+    for(size_t i = 0; i < testSize; i++){
+        output[i] = rocrand(&state);
+        mean += static_cast<double>(output[i]);
+    }
+    mean /= testSize;
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        std += std::pow(output[i] - mean, 2);
+
+    std = std::sqrt(std / testSize);
+
+    double maxi = (double) std::numeric_limits<unsigned int>::max();
+    double eMean = 0.5 * (maxi); // 0.5(a + b)
+    double eStd = (maxi) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5)
+
+    ASSERT_NEAR(mean, eMean, eMean * 0.1);
+    ASSERT_NEAR(std, eStd, eStd * 0.1);
+
+    delete [] output;
+}
+
+TEST(threefry_additional_tests, rocrand2_test)
+{
+    // making sure the outputs are uniformly distributed!
+    rocrand_state_threefry2x32_20 state;
+
+    rocrand_init(0, 0, 0, &state);
+    size_t testSize = 40000;
+
+    unsigned int * output = new unsigned int[testSize];
+
+    double mean = 0;
+    for(size_t i = 0; i < testSize; i += 2){
+        uint2 t = rocrand2(&state);
+        output[i] = t.x;
+        output[i + 1] = t.y;
+        mean += static_cast<double>(output[i]); 
+        mean += static_cast<double>(output[i + 1]);
+    }
+    mean /= testSize;
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        std += std::pow(output[i] - mean, 2);
+
+    std = std::sqrt(std / testSize);
+
+    double maxi = (double) std::numeric_limits<unsigned int>::max();
+    // min val is 0
+    double eMean = 0.5 * (maxi); // 0.5(a + b)
+    double eStd = (maxi) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5)
+
+    ASSERT_NEAR(mean, eMean, eMean * 0.1);
+    ASSERT_NEAR(std, eStd, eStd * 0.1);\
+
+    delete [] output;
+}
diff --git a/test/internal/test_rocrand_threefry2x64_20_prng.cpp b/test/internal/test_rocrand_threefry2x64_20_prng.cpp
index cd13ea3bb..4a1d776fd 100644
--- a/test/internal/test_rocrand_threefry2x64_20_prng.cpp
+++ b/test/internal/test_rocrand_threefry2x64_20_prng.cpp
@@ -169,3 +169,73 @@ TEST(threefry_prng_state_tests, discard_sequence_test)
     EXPECT_EQ(state.counter.y, 457ULL);
     EXPECT_EQ(state.substate, 0U);
 }
+
+TEST(threefry_additional_tests, rocrand_test)
+{
+    // making sure the outputs are uniformly distributed!
+    rocrand_state_threefry2x64_20 state;
+
+    rocrand_init(0, 0, 0, &state);
+    size_t testSize = 40000;
+
+    unsigned long long * output = new unsigned long long[testSize];
+
+    double mean = 0;
+    for(size_t i = 0; i < testSize; i++){
+        output[i] = rocrand(&state);
+        mean += static_cast<double>(output[i]);
+    }
+    mean /= testSize;
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        std += std::pow(output[i] - mean, 2);
+
+    std = std::sqrt(std / testSize);
+
+    double maxi = (double) std::numeric_limits<unsigned long long>::max();
+    double eMean = 0.5 * (maxi); // 0.5(a + b)
+    double eStd = (maxi) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5)
+
+    ASSERT_NEAR(mean, eMean, eMean * 0.1);
+    ASSERT_NEAR(std, eStd, eStd * 0.1);
+
+    delete [] output;
+}
+
+TEST(threefry_additional_tests, rocrand2_test)
+{
+    // making sure the outputs are uniformly distributed!
+    rocrand_state_threefry2x64_20 state;
+
+    rocrand_init(0, 0, 0, &state);
+    size_t testSize = 40000;
+
+    unsigned long long * output = new unsigned long long[testSize];
+
+    double mean = 0;
+    for(size_t i = 0; i < testSize; i += 2){
+        ulonglong2 t = rocrand2(&state);
+        output[i] = t.x;
+        output[i + 1] = t.y;
+        mean += static_cast<double>(output[i]); 
+        mean += static_cast<double>(output[i + 1]);
+    }
+    mean /= testSize;
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        std += std::pow(output[i] - mean, 2);
+
+    std = std::sqrt(std / testSize);
+
+    double maxi = (double) std::numeric_limits<unsigned long long>::max();
+    // min val is 0
+    double eMean = 0.5 * (maxi); // 0.5(a + b)
+    double eStd = (maxi) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5)
+
+    ASSERT_NEAR(mean, eMean, eMean * 0.1);
+    ASSERT_NEAR(std, eStd, eStd * 0.1);\
+
+    delete [] output;
+}
diff --git a/test/internal/test_rocrand_threefry4x32_20_prng.cpp b/test/internal/test_rocrand_threefry4x32_20_prng.cpp
index 3ce1be60e..c69df6fbf 100644
--- a/test/internal/test_rocrand_threefry4x32_20_prng.cpp
+++ b/test/internal/test_rocrand_threefry4x32_20_prng.cpp
@@ -245,3 +245,97 @@ TEST(threefry_prng_state_tests, discard_sequence_test)
     EXPECT_EQ(state.counter.w, 6U);
     EXPECT_EQ(state.substate, 0U);
 }
+
+TEST(threefry_additional_tests, rocrand_init_test)
+{
+    // making sure the outputs are the same when initialized with same parameters
+    rocrand_state_threefry4x32_20 state1, state2;
+
+    using ull = unsigned long long;
+
+    ull seeds[] = {0, 123, 321, 123456, 654321};
+    ull subsequences[] = {0xf, 0xff, 0x1f, 0x1ff, 0x1f1};
+    ull offsets[] = {0, 1, 2, 3, 4};
+
+    for(int i = 0; i < 5; i++){
+        rocrand_init(seeds[i], subsequences[i], offsets[i], &state1);
+        rocrand_init(seeds[i], subsequences[i], offsets[i], &state2);
+
+        for(int j = 0; j < 5000; j++)
+            ASSERT_EQ(rocrand(&state1), rocrand(&state2));
+    }
+}
+
+TEST(threefry_additional_tests, rocrand_test)
+{
+    // making sure the outputs are uniformly distributed!
+    rocrand_state_threefry4x32_20 state;
+
+    rocrand_init(0, 0, 0, &state);
+    size_t testSize = 40000;
+
+    unsigned int * output = new unsigned int[testSize];
+
+    double mean = 0;
+    for(size_t i = 0; i < testSize; i++){
+        output[i] = rocrand(&state);
+        mean += static_cast<double>(output[i]);
+    }
+    mean /= testSize;
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        std += std::pow(output[i] - mean, 2);
+
+    std = std::sqrt(std / testSize);
+
+    double maxi = (double) std::numeric_limits<unsigned int>::max();
+    double eMean = 0.5 * (maxi); // 0.5(a + b)
+    double eStd = (maxi) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5)
+
+    ASSERT_NEAR(mean, eMean, eMean * 0.1);
+    ASSERT_NEAR(std, eStd, eStd * 0.1);
+
+    delete [] output;
+}
+
+TEST(threefry_additional_tests, rocrand4_test)
+{
+    // making sure the outputs are uniformly distributed!
+    rocrand_state_threefry4x32_20 state;
+
+    rocrand_init(0, 0, 0, &state);
+    size_t testSize = 40000;
+
+    unsigned int * output = new unsigned int[testSize];
+
+    double mean = 0;
+    for(size_t i = 0; i < testSize; i += 4){
+        uint4 t = rocrand4(&state);
+        output[i] = t.w;
+        output[i + 1] = t.x;
+        output[i + 2] = t.y;
+        output[i + 3] = t.z;
+        mean += static_cast<double>(output[i]); 
+        mean += static_cast<double>(output[i + 1]);
+        mean += static_cast<double>(output[i + 2]);
+        mean += static_cast<double>(output[i + 3]);
+    }
+    mean /= testSize;
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        std += std::pow(output[i] - mean, 2);
+
+    std = std::sqrt(std / testSize);
+
+    double maxi = (double) std::numeric_limits<unsigned int>::max();
+    // min val is 0
+    double eMean = 0.5 * (maxi); // 0.5(a + b)
+    double eStd = (maxi) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5)
+
+    ASSERT_NEAR(mean, eMean, eMean * 0.1);
+    ASSERT_NEAR(std, eStd, eStd * 0.1);
+
+    delete [] output;
+}
diff --git a/test/internal/test_rocrand_threefry4x64_20_prng.cpp b/test/internal/test_rocrand_threefry4x64_20_prng.cpp
index 280c333d2..cff39bacb 100644
--- a/test/internal/test_rocrand_threefry4x64_20_prng.cpp
+++ b/test/internal/test_rocrand_threefry4x64_20_prng.cpp
@@ -261,3 +261,97 @@ TEST(threefry_prng_state_tests, discard_sequence_test)
     EXPECT_EQ(state.counter.w, 6ULL);
     EXPECT_EQ(state.substate, 0U);
 }
+
+TEST(threefry_additional_tests, rocrand_init_test)
+{
+    // making sure the outputs are the same when initialized with same parameters
+    rocrand_state_threefry4x64_20 state1, state2;
+
+    using ull = unsigned long long;
+
+    ull seeds[] = {0, 123, 321, 123456, 654321};
+    ull subsequences[] = {0xf, 0xff, 0x1f, 0x1ff, 0x1f1};
+    ull offsets[] = {0, 1, 2, 3, 4};
+
+    for(int i = 0; i < 5; i++){
+        rocrand_init(seeds[i], subsequences[i], offsets[i], &state1);
+        rocrand_init(seeds[i], subsequences[i], offsets[i], &state2);
+
+        for(int j = 0; j < 5000; j++)
+            ASSERT_EQ(rocrand(&state1), rocrand(&state2));
+    }
+}
+
+TEST(threefry_additional_tests, rocrand_test)
+{
+    // making sure the outputs are uniformly distributed!
+    rocrand_state_threefry4x64_20 state;
+
+    rocrand_init(0, 0, 0, &state);
+    size_t testSize = 40000;
+
+    unsigned long long * output = new unsigned long long[testSize];
+
+    double mean = 0;
+    for(size_t i = 0; i < testSize; i++){
+        output[i] = rocrand(&state);
+        mean += static_cast<double>(output[i]);
+    }
+    mean /= testSize;
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        std += std::pow(output[i] - mean, 2);
+
+    std = std::sqrt(std / testSize);
+
+    double maxi = (double) std::numeric_limits<unsigned long long>::max();
+    double eMean = 0.5 * (maxi); // 0.5(a + b)
+    double eStd = (maxi) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5)
+
+    ASSERT_NEAR(mean, eMean, eMean * 0.1);
+    ASSERT_NEAR(std, eStd, eStd * 0.1);
+
+    delete [] output;
+}
+
+TEST(threefry_additional_tests, rocrand4_test)
+{
+    // making sure the outputs are uniformly distributed!
+    rocrand_state_threefry4x64_20 state;
+
+    rocrand_init(0, 0, 0, &state);
+    size_t testSize = 40000;
+
+    unsigned long long * output = new unsigned long long[testSize];
+
+    double mean = 0;
+    for(size_t i = 0; i < testSize; i += 4){
+        ulonglong4 t = rocrand4(&state);
+        output[i] = t.w;
+        output[i + 1] = t.x;
+        output[i + 2] = t.y;
+        output[i + 3] = t.z;
+        mean += static_cast<double>(output[i]); 
+        mean += static_cast<double>(output[i + 1]);
+        mean += static_cast<double>(output[i + 2]);
+        mean += static_cast<double>(output[i + 3]);
+    }
+    mean /= testSize;
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        std += std::pow(output[i] - mean, 2);
+
+    std = std::sqrt(std / testSize);
+
+    double maxi = (double) std::numeric_limits<unsigned long long>::max();
+    // min val is 0
+    double eMean = 0.5 * (maxi); // 0.5(a + b)
+    double eStd = (maxi) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5)
+
+    ASSERT_NEAR(mean, eMean, eMean * 0.1);
+    ASSERT_NEAR(std, eStd, eStd * 0.1);
+
+    delete [] output;
+}
diff --git a/test/internal/test_uniform_distribution.cpp b/test/internal/test_uniform_distribution.cpp
index e897e0144..37453c6db 100644
--- a/test/internal/test_uniform_distribution.cpp
+++ b/test/internal/test_uniform_distribution.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -24,6 +24,9 @@
 #include <random>
 
 #include <rng/distribution/uniform.hpp>
+#include <rocrand/rocrand_mtgp32_11213.h>
+
+#define HIP_CHECK(state) ASSERT_EQ(state, hipSuccess)
 
 using namespace rocrand_impl::host;
 
@@ -358,3 +361,764 @@ TEST(sobol_uniform_distribution_tests, half_test)
     EXPECT_GT(__half2float(output[0]), 0.0f);
     EXPECT_LT(__half2float(output[0]), 1e-4f);
 }
+
+template <typename OutType, typename InType, typename UD>
+struct NumericUD{
+
+    template <typename FuncCall>
+    void run_test(UD & dis, const FuncCall & f){
+        std::random_device rd;
+        std::mt19937 gen(rd());
+
+        const size_t testSize = 4000000;
+    
+        float * output = new float [testSize];
+        
+        InType input;
+        OutType out;
+    
+        double mean = 0;
+    
+        for(size_t i = 0; i <= testSize; i += 4){
+            input = {dis(gen), dis(gen), dis(gen), dis(gen)};
+    
+            f(input, out);
+    
+            output[i] = out.w;
+            output[i + 1] = out.x;
+            output[i + 2] = out.y;
+            output[i + 3] = out.z;
+
+            ASSERT_GT(out.w, 0);
+            ASSERT_GT(out.x, 0);
+            ASSERT_GT(out.y, 0);
+            ASSERT_GT(out.z, 0);
+
+            ASSERT_LE(out.w, 1);
+            ASSERT_LE(out.x, 1);
+            ASSERT_LE(out.y, 1);
+            ASSERT_LE(out.z, 1);
+    
+            mean += out.w + out.x + out.y + out.z;
+        }
+    
+        mean /= testSize;
+    
+        double std = 0.0;
+        for(size_t i = 0; i < testSize; i++)
+            std += std::pow(output[i] - mean, 2);
+    
+        std = std::sqrt(std / testSize);
+    
+        double eMean = 0.5 * (0 + 1); // 0.5(a + b)
+        double eStd = (1 - 0) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5)
+    
+        ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1;
+        ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1;
+    }
+
+};
+
+TEST(uniform_distribution_tests, float4_uint4_in_test){
+    unsigned int mini = 0;
+    unsigned int maxi = std::numeric_limits<unsigned int>::max();
+    std::uniform_int_distribution<unsigned int> dis(mini, maxi);
+
+    NumericUD<float4, uint4, std::uniform_int_distribution<unsigned int>> test;
+    test.run_test(
+        dis,
+        [] __host__ __device__ (uint4 & input, float4 & output){
+            output = rocrand_device::detail::uniform_distribution4(input);
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, float4_ulonglong4_test){
+    unsigned long long mini = 0;
+    unsigned long long maxi = std::numeric_limits<unsigned long long>::max();
+    std::uniform_int_distribution<unsigned long long> dis(mini, maxi);
+
+    NumericUD<float4, ulonglong4, std::uniform_int_distribution<unsigned long long>> test;
+    test.run_test(
+        dis,
+        [] __host__ __device__ (ulonglong4 & input, float4 & output){
+            output = rocrand_device::detail::uniform_distribution4(input);
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, double4_uint4_test){
+    unsigned int mini = 0;
+    unsigned int maxi = std::numeric_limits<unsigned int>::max();
+    std::uniform_int_distribution<unsigned int> dis(mini, maxi);
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+
+    NumericUD<double4, uint4, std::uniform_int_distribution<unsigned int>> test;
+    test.run_test(
+        dis,
+        [&] __host__ __device__ (uint4 & input, double4 & output){
+            output = rocrand_device::detail::uniform_distribution_double4(input, input);
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, double4_ulonglong4_test){
+    unsigned long long mini = 0;
+    unsigned long long maxi = std::numeric_limits<unsigned long long>::max();
+    std::uniform_int_distribution<unsigned long long> dis(mini, maxi);
+
+    NumericUD<double4, ulonglong4, std::uniform_int_distribution<unsigned long long>> test;
+    test.run_test(
+        dis,
+        [] __host__ __device__ (ulonglong4 & input, double4 & output){
+            output = rocrand_device::detail::uniform_distribution_double4(input);
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, double2_uint4_in_test){
+    unsigned int mini = 0;
+    unsigned int maxi = std::numeric_limits<unsigned int>::max();
+    std::uniform_int_distribution<unsigned int> dis(mini, maxi);
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+
+    NumericUD<double4, uint4, std::uniform_int_distribution<unsigned int>> test;
+    test.run_test(
+        dis,
+        [&] __host__ __device__ (uint4 & input, double4 & output){
+
+            uint4 secondInput = {dis(gen), dis(gen), dis(gen), dis(gen)};
+            double2 o1 = rocrand_device::detail::uniform_distribution_double2(input);
+            double2 o2 = rocrand_device::detail::uniform_distribution_double2(secondInput);
+
+            output.w = o1.x; output.x = o1.y;
+            output.y = o2.x; output.z = o2.y;
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, double2_ulonglong2_in_test){
+    unsigned long long mini = 0;
+    unsigned long long maxi = std::numeric_limits<unsigned long long>::max();
+    std::uniform_int_distribution<unsigned long long> dis(mini, maxi);
+
+    NumericUD<double4, ulonglong4, std::uniform_int_distribution<unsigned long long>> test;
+    test.run_test(
+        dis,
+        [&] __host__ __device__ (ulonglong4 & input, double4 & output){
+
+            ulonglong2 i1 = {input.w, input.x};
+            ulonglong2 i2 = {input.y, input.z};
+
+            double2 o1 = rocrand_device::detail::uniform_distribution_double2(i1);
+            double2 o2 = rocrand_device::detail::uniform_distribution_double2(i2);
+
+            output.w = o1.x; output.x = o1.y;
+            output.y = o2.x; output.z = o2.y;
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, double2_ulonglong4_in_test){
+    unsigned long long mini = 0;
+    unsigned long long maxi = std::numeric_limits<unsigned long long>::max();
+    std::uniform_int_distribution<unsigned long long> dis(mini, maxi);
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+
+    NumericUD<double4, ulonglong4, std::uniform_int_distribution<unsigned long long>> test;
+    test.run_test(
+        dis,
+        [&] __host__ __device__ (ulonglong4 & input, double4 & output){
+            ulonglong4 secondInput = {dis(gen), dis(gen), dis(gen), dis(gen)};
+            double2 o1 = rocrand_device::detail::uniform_distribution_double2(input);
+            double2 o2 = rocrand_device::detail::uniform_distribution_double2(secondInput);
+
+            output.w = o1.x; output.x = o1.y;
+            output.y = o2.x; output.z = o2.y;
+        }
+    );
+}
+
+template <typename OutType>
+struct StatesUD{
+    template <typename FuncCall>
+    void run_test(const FuncCall & f, size_t testSize = 4000000){
+        float * output = new float [testSize];
+        OutType out;
+    
+        double mean = 0;
+    
+        for(size_t i = 0; i <= testSize; i += 4){
+            f(out);
+    
+            output[i] = out.w;
+            output[i + 1] = out.x;
+            output[i + 2] = out.y;
+            output[i + 3] = out.z;
+
+            ASSERT_GT(out.w, 0);
+            ASSERT_GT(out.x, 0);
+            ASSERT_GT(out.y, 0);
+            ASSERT_GT(out.z, 0);
+
+            ASSERT_LE(out.w, 1);
+            ASSERT_LE(out.x, 1);
+            ASSERT_LE(out.y, 1);
+            ASSERT_LE(out.z, 1);
+    
+            mean += out.w + out.x + out.y + out.z;
+        }
+    
+        mean /= testSize;
+    
+        double std = 0.0;
+        for(size_t i = 0; i < testSize; i++)
+            std += std::pow(output[i] - mean, 2);
+    
+        std = std::sqrt(std / testSize);
+    
+        double eMean = 0.5 * (0 + 1); // 0.5(a + b)
+        double eStd = (1 - 0) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5)
+    
+        ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1;
+        ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1;
+    }
+};
+
+TEST(uniform_distribution_tests, philox4x32_10_test){
+    rocrand_state_philox4x32_10 states;
+    rocrand_init(123456, 654321, 0, &states);
+
+    StatesUD<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output){
+            output = {
+                rocrand_uniform(&states), rocrand_uniform(&states), 
+                rocrand_uniform(&states), rocrand_uniform(&states)
+            };
+        }
+    );
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output){
+            float2 o1 = rocrand_uniform2(&states);
+            float2 o2 = rocrand_uniform2(&states);
+
+            output.w = o1.x; output.x = o1.y;
+            output.y = o2.x; output.z = o2.y;
+        }
+    );
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output){
+            output = rocrand_uniform4(&states);
+        }
+    );
+
+    StatesUD<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output){
+            output = {
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states), 
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states)
+            };
+        }
+    );
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output){
+            double2 o1 = rocrand_uniform_double2(&states);
+            double2 o2 = rocrand_uniform_double2(&states);
+
+            output.w = o1.x; output.x = o1.y;
+            output.y = o2.x; output.z = o2.y;
+        }
+    );
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output){
+            output = rocrand_uniform_double4(&states);
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, mrg31k3p_test){
+    rocrand_state_mrg31k3p states;
+    rocrand_init(123456, 654321, 0, &states);
+
+    StatesUD<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output){
+            output = {
+                rocrand_uniform(&states), rocrand_uniform(&states), 
+                rocrand_uniform(&states), rocrand_uniform(&states)
+            };
+        }
+    );
+
+    StatesUD<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output){
+            output = {
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states), 
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states)
+            };
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, mrg32k3a_test){
+    rocrand_state_mrg32k3a states;
+    rocrand_init(123456, 654321, 0, &states);
+
+    StatesUD<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output){
+            output = {
+                rocrand_uniform(&states), rocrand_uniform(&states), 
+                rocrand_uniform(&states), rocrand_uniform(&states)
+            };
+        }
+    );
+
+    StatesUD<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output){
+            output = {
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states), 
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states)
+            };
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, xorwow_test){
+    rocrand_state_xorwow states;
+    rocrand_init(123456, 654321, 0, &states);
+
+    StatesUD<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output){
+            output = {
+                rocrand_uniform(&states), rocrand_uniform(&states), 
+                rocrand_uniform(&states), rocrand_uniform(&states)
+            };
+        }
+    );
+
+    StatesUD<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output){
+            output = {
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states), 
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states)
+            };
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, sobol32_test){
+    rocrand_state_sobol32 states;
+    const unsigned int* directions;
+    HIP_CHECK(rocrand_get_direction_vectors32(&directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6));
+    rocrand_init(directions, 0, &states);
+
+    StatesUD<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output){
+            output = {
+                rocrand_uniform(&states), rocrand_uniform(&states), 
+                rocrand_uniform(&states), rocrand_uniform(&states)
+            };
+        }
+    );
+
+    StatesUD<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output){
+            output = {
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states), 
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states)
+            };
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, scrambled_sobol32_test){
+    rocrand_state_scrambled_sobol32 states;
+    const unsigned int* directions;
+    HIP_CHECK(rocrand_get_direction_vectors32(&directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6));
+    rocrand_init(directions, 123456, 0, &states);
+
+    StatesUD<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output){
+            output = {
+                rocrand_uniform(&states), rocrand_uniform(&states), 
+                rocrand_uniform(&states), rocrand_uniform(&states)
+            };
+        }
+    );
+
+    StatesUD<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output){
+            output = {
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states), 
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states)
+            };
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, sobol64_test){
+    rocrand_state_sobol64 states;
+    const unsigned long long* directions;
+    HIP_CHECK(rocrand_get_direction_vectors64(&directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6));
+    rocrand_init(directions, 0, &states);
+
+    StatesUD<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output){
+            output = {
+                rocrand_uniform(&states), rocrand_uniform(&states), 
+                rocrand_uniform(&states), rocrand_uniform(&states)
+            };
+        }
+    );
+
+    StatesUD<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output){
+            output = {
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states), 
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states)
+            };
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, scrambled_sobol64_test){
+    rocrand_state_scrambled_sobol64 states;
+    const unsigned long long* directions;
+    HIP_CHECK(rocrand_get_direction_vectors64(&directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6));
+    rocrand_init(directions, 123456, 0, &states);
+
+    StatesUD<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output){
+            output = {
+                rocrand_uniform(&states), rocrand_uniform(&states), 
+                rocrand_uniform(&states), rocrand_uniform(&states)
+            };
+        }
+    );
+
+    StatesUD<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output){
+            output = {
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states), 
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states)
+            };
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, lfsr113_test){
+    rocrand_state_lfsr113 states;
+    rocrand_init(static_cast<uint4>(12), 0, &states);
+
+    StatesUD<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output){
+            output = {
+                rocrand_uniform(&states), rocrand_uniform(&states), 
+                rocrand_uniform(&states), rocrand_uniform(&states)
+            };
+        }
+    );
+
+    StatesUD<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output){
+            output = {
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states), 
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states)
+            };
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, threefry2x32_20_test){
+    rocrand_state_threefry2x32_20 states;
+    rocrand_init(123456, 654321, 0, & states);
+
+    StatesUD<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output){
+            output = {
+                rocrand_uniform(&states), rocrand_uniform(&states), 
+                rocrand_uniform(&states), rocrand_uniform(&states)
+            };
+        }
+    );
+
+    StatesUD<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output){
+            output = {
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states), 
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states)
+            };
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, threefry2x64_20_test){
+    rocrand_state_threefry2x64_20 states;
+    rocrand_init(123456, 654321, 0, & states);
+
+    StatesUD<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output){
+            output = {
+                rocrand_uniform(&states), rocrand_uniform(&states), 
+                rocrand_uniform(&states), rocrand_uniform(&states)
+            };
+        }
+    );
+
+    StatesUD<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output){
+            output = {
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states), 
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states)
+            };
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, threefry4x32_20_test){
+    rocrand_state_threefry4x32_20 states;
+    rocrand_init(123456, 654321, 0, & states);
+
+    StatesUD<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output){
+            output = {
+                rocrand_uniform(&states), rocrand_uniform(&states), 
+                rocrand_uniform(&states), rocrand_uniform(&states)
+            };
+        }
+    );
+
+    StatesUD<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output){
+            output = {
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states), 
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states)
+            };
+        }
+    );
+}
+
+TEST(uniform_distribution_tests, threefry4x64_20_test){
+    rocrand_state_threefry4x64_20 states;
+    rocrand_init(123456, 654321, 0, & states);
+
+    StatesUD<float4> testFloat;
+
+    testFloat.run_test(
+        [&] __host__ __device__ (float4 & output){
+            output = {
+                rocrand_uniform(&states), rocrand_uniform(&states), 
+                rocrand_uniform(&states), rocrand_uniform(&states)
+            };
+        }
+    );
+
+    StatesUD<double4> testDouble;
+
+    testDouble.run_test(
+        [&] __host__ __device__ (double4 & output){
+            output = {
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states), 
+                rocrand_uniform_double(&states), rocrand_uniform_double(&states)
+            };
+        }
+    );
+}
+
+template <typename T, typename UDFunction>
+__global__ void mtgp32_kernel (rocrand_state_mtgp32 * states, T * output, const size_t N, const UDFunction & f){
+    const unsigned int state_id  = blockIdx.x;
+    const unsigned int thread_id = threadIdx.x;
+    unsigned int       index     = blockIdx.x * blockDim.x + threadIdx.x;
+    
+    if(index >= N)
+        return;
+
+    __shared__ rocrand_state_mtgp32 state;
+    if(thread_id == 0)
+        state = states[state_id];
+    __syncthreads();
+
+    // output[index] = rocrand_uniform(&state);
+    output[index] = f(&state);
+
+    if(thread_id == 0)
+        states[state_id] = state; 
+}
+
+TEST(uniform_distribution_tests, float_mtgp32_test){
+    size_t testSize = 40192;
+    size_t threads = 256;
+    size_t blocks = std::ceil(static_cast<double>(testSize) / static_cast<double>(threads));
+
+    rocrand_state_mtgp32 * states;
+    size_t state_size = blocks, seed = 654321;
+    HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32)));
+    rocrand_make_state_mtgp32(states,  mtgp32dc_params_fast_11213, state_size, seed);
+
+    float * hOut = new float[testSize];
+    float * dOut;
+    HIP_CHECK(hipMalloc(&dOut, sizeof(float) * testSize));
+    HIP_CHECK(hipDeviceSynchronize());
+
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(mtgp32_kernel<float>),
+        dim3(blocks),
+        dim3(threads),
+        0,
+        0,
+        states,
+        dOut,
+        testSize,
+        [] __device__ (rocrand_state_mtgp32 * state){
+            return rocrand_uniform(state);
+        }
+    );
+
+    HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(float) * testSize, hipMemcpyDeviceToHost));
+
+    double mean = 0.0;
+    for(size_t i = 0; i < testSize; i++){
+        ASSERT_GT(hOut[i], 0.0);
+        ASSERT_LE(hOut[i], 1.0);
+
+        mean += hOut[i];
+    }
+
+    mean /= testSize;
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        std += std::pow(hOut[i] - mean, 2);
+
+    std = std::sqrt(std / testSize);
+
+    double eMean = 0.5 * (0 + 1); // 0.5(a + b)
+    double eStd = (1 - 0) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5)
+
+    ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1;
+    ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1;
+
+    HIP_CHECK(hipFree(states));
+    HIP_CHECK(hipFree(dOut));
+
+    delete [] hOut;
+}
+
+TEST(uniform_distribution_tests, double_mtgp32_test){
+    size_t testSize = 40192;
+    size_t threads = 256;
+    size_t blocks = std::ceil(static_cast<double>(testSize) / static_cast<double>(threads));
+
+    rocrand_state_mtgp32 * states;
+    size_t state_size = blocks, seed = 654321;
+    HIP_CHECK(hipMalloc(&states, state_size * sizeof(rocrand_state_mtgp32)));
+    rocrand_make_state_mtgp32(states,  mtgp32dc_params_fast_11213, state_size, seed);
+
+    double * hOut = new double[testSize];
+    double * dOut;
+    HIP_CHECK(hipMalloc(&dOut, sizeof(double) * testSize));
+    HIP_CHECK(hipDeviceSynchronize());
+
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(mtgp32_kernel<double>),
+        dim3(blocks),
+        dim3(threads),
+        0,
+        0,
+        states,
+        dOut,
+        testSize,
+        [] __device__ (rocrand_state_mtgp32 * state){
+            return rocrand_uniform_double(state);
+        }
+    );
+
+    HIP_CHECK(hipMemcpy(hOut, dOut, sizeof(double) * testSize, hipMemcpyDeviceToHost));
+
+    double mean = 0.0;
+    for(size_t i = 0; i < testSize; i++){
+        ASSERT_GT(hOut[i], 0.0);
+        ASSERT_LE(hOut[i], 1.0);
+
+        mean += hOut[i];
+    }
+
+    mean /= testSize;
+
+    double std = 0.0;
+    for(size_t i = 0; i < testSize; i++)
+        std += std::pow(hOut[i] - mean, 2);
+
+    std = std::sqrt(std / testSize);
+
+    double eMean = 0.5 * (0 + 1); // 0.5(a + b)
+    double eStd = (1 - 0) / (2 * std::sqrt(3)); // (b - a) / (2*3^0.5)
+
+    ASSERT_NEAR(mean, eMean, eMean * 0.1) << "Expected Mean: " << eMean << " Actual Mean: " << mean << " Eps: " << eMean * 0.1;
+    ASSERT_NEAR(std, eStd, eStd * 0.1) << "Expected Std: " << eStd << " Actual Std: " << std << " Eps: " << eStd * 0.1;
+
+    HIP_CHECK(hipFree(states));
+    HIP_CHECK(hipFree(dOut));
+
+    delete [] hOut;
+}