diff --git a/src/random.jl b/src/random.jl index f5a05ae3e1..b8b1f222c0 100644 --- a/src/random.jl +++ b/src/random.jl @@ -52,8 +52,8 @@ function Random.rand!(rng::RNG, A::AnyCuArray) # grid-stride loop threadId = threadIdx().x - window = blockDim().x * gridDim().x - offset = (blockIdx().x - 1) * blockDim().x + window = widemul(blockDim().x, gridDim().x) + offset = widemul(blockIdx().x - 1i32, blockDim().x) while offset < length(A) i = threadId + offset if i <= length(A) @@ -96,8 +96,8 @@ function Random.randn!(rng::RNG, A::AnyCuArray{<:Union{AbstractFloat,Complex{<:A # grid-stride loop threadId = threadIdx().x - window = (blockDim().x - 1) * gridDim().x - offset = (blockIdx().x - 1) * blockDim().x + window = widemul(blockDim().x, gridDim().x) + offset = widemul(blockIdx().x - 1i32, blockDim().x) while offset < length(A) i = threadId + offset j = threadId + offset + window @@ -129,8 +129,8 @@ function Random.randn!(rng::RNG, A::AnyCuArray{<:Union{AbstractFloat,Complex{<:A # grid-stride loop threadId = threadIdx().x - window = (blockDim().x - 1) * gridDim().x - offset = (blockIdx().x - 1) * blockDim().x + window = widemul(blockDim().x, gridDim().x) + offset = widemul(blockIdx().x - 1i32, blockDim().x) while offset < length(A) i = threadId + offset if i <= length(A) @@ -150,11 +150,11 @@ function Random.randn!(rng::RNG, A::AnyCuArray{<:Union{AbstractFloat,Complex{<:A return end - kernel = @cuda launch=false name="rand!" kernel(A, rng.seed, rng.counter) - config = launch_configuration(kernel.fun; max_threads=64) - threads = max(32, min(config.threads, length(A)÷2)) - blocks = min(config.blocks, cld(cld(length(A), 2), threads)) - kernel(A, rng.seed, rng.counter; threads, blocks) + # see note in `rand!` about the launch configuration + threads = 32 + blocks = cld(cld(length(A), 2), threads) + + @cuda threads=threads blocks=blocks name="randn!" 
kernel(A, rng.seed, rng.counter) new_counter = Int64(rng.counter) + length(A) overflow, remainder = fldmod(new_counter, typemax(UInt32)) diff --git a/test/base/random.jl b/test/base/random.jl index 85e0799458..62a6811539 100644 --- a/test/base/random.jl +++ b/test/base/random.jl @@ -198,3 +198,10 @@ end end end +@testset "counter overflow" begin + rng = CUDA.RNG() + # we may not be able to allocate over 4GB on the GPU, so use unified memory + c = CuArray{Float16, 5, CUDA.UnifiedMemory}(undef, 64, 32, 512, 32, 64) + rand!(rng, c) + randn!(rng, c) +end