From 19d26d8d7d4e8235af8a7af979ebc2ec3927cdcc Mon Sep 17 00:00:00 2001 From: Michael Abbott Date: Tue, 31 Mar 2020 15:45:31 +0200 Subject: [PATCH 1/7] allow AbstractArray in gemm_strided_batched --- src/blas/wrappers.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/blas/wrappers.jl b/src/blas/wrappers.jl index 5862655b..d0a58f42 100644 --- a/src/blas/wrappers.jl +++ b/src/blas/wrappers.jl @@ -937,10 +937,10 @@ for (fname, elty) in function gemm_strided_batched!(transA::Char, transB::Char, alpha::($elty), - A::CuArray{$elty, 3}, - B::CuArray{$elty, 3}, + A::AbstractArray{$elty, 3}, + B::AbstractArray{$elty, 3}, beta::($elty), - C::CuArray{$elty, 3}) + C::AbstractArray{$elty, 3}) m = size(A, transA == 'N' ? 1 : 2) k = size(A, transA == 'N' ? 2 : 1) n = size(B, transB == 'N' ? 2 : 1) @@ -967,15 +967,15 @@ for (fname, elty) in function gemm_strided_batched(transA::Char, transB::Char, alpha::($elty), - A::CuArray{$elty, 3}, - B::CuArray{$elty, 3}) + A::AbstractArray{$elty, 3}, + B::AbstractArray{$elty, 3}) C = similar(B, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 2 : 1), size(A, 3))) gemm_strided_batched!(transA, transB, alpha, A, B, zero($elty), C ) end function gemm_strided_batched(transA::Char, transB::Char, - A::CuArray{$elty, 3}, - B::CuArray{$elty, 3}) + A::AbstractArray{$elty, 3}, + B::AbstractArray{$elty, 3}) gemm_strided_batched(transA, transB, one($elty), A, B) end end From 20851f29d58e8a6cb602373cf6e035ebb445630c Mon Sep 17 00:00:00 2001 From: Michael Abbott Date: Tue, 31 Mar 2020 15:51:50 +0200 Subject: [PATCH 2/7] NNlib.batched_mul allowing PermutedDimsArray, plus... --- src/nnlib.jl | 47 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/src/nnlib.jl b/src/nnlib.jl index 56063b58..24523710 100644 --- a/src/nnlib.jl +++ b/src/nnlib.jl @@ -34,14 +34,51 @@ end # Batched matrix multiplication const batched_gemm_args = [ - (:(CuArray{T, 3}), 'N'), - (:(NNlib.BatchedTranspose{T, <:CuArray{T, 3}}), 'T'), - (:(NNlib.BatchedAdjoint{T, <:CuArray{T, 3}}), 'C') + (:(AbstractArray{T, 3}), 'N', :identity), + (:(NNlib.BatchedTranspose{T, <:AbstractArray{T, 3}}), 'T', :batched_transpose), + (:(NNlib.BatchedAdjoint{T, <:AbstractArray{T, 3}}), 'C', :batched_adjoint) ] -for (TA, transA) in batched_gemm_args, (TB, transB) in batched_gemm_args +using NNlib: batched_mul!, BatchedTranspose, BatchedAdjoint, batched_transpose, batched_adjoint + +for (TA, transA, fA) in batched_gemm_args, (TB, transB, fB) in batched_gemm_args @eval function NNlib.batched_mul!(C::CuArray{T, 3}, A::$TA, B::$TB) where {T<:CUBLAS.CublasFloat} - CuArrays.CUBLAS.gemm_strided_batched!($transA, $transB, one(T), NNlib._unbatch(A), NNlib._unbatch(B), zero(T), C) + + Abase, Bbase = NNlib._unbatch(A), NNlib._unbatch(B) + + # Best case, we can call batched_gemm! immediately: + if Base.stride(Abase,1) == Base.stride(Bbase,1) == Base.stride(C,1) == 1 + CuArrays.CUBLAS.gemm_strided_batched!($transA, $transB, one(T), NNlib._unbatch(A), NNlib._unbatch(B), zero(T), C) + + # Second-best, can we fix it by Perm.ing the base, and adjusing 'T' label? + # But only if we won't produce BatchedTranspose(BatchedAdjoint(complex array)). 
+ elseif Base.stride(Abase,2) == 1 && !(T<:Complex && $TA<:NNlib.BatchedAdjoint) + newAbase = NNlib.batched_transpose(PermutedDimsArray(Abase, (2,1,3))) + return NNlib.batched_mul!(C, $fA(newAbase), B) + elseif Base.stride(Bbase,2) == 1 && !(T<:Complex && $TB<:NNlib.BatchedAdjoint) + newBbase = NNlib.batched_transpose(PermutedDimsArray(Bbase, (2,1,3))) + return NNlib.batched_mul!(C, A, $fB(newBbase)) + + # Fallback, e.g when Base.stride(A,3)==1 + else + NNlib.batched_mul_generic!(C, A, B) + end C end end + +# This is obviously the wrong place for this! Not sure where it should go. +# Base.unsafe_convert(::Type{CUDAdrv.CuPtr{T}}, A::PermutedDimsArray) where {T} = +# Base.unsafe_convert(CUDAdrv.CuPtr{T}, parent(A)) +# Recursive version, will handle e.g. NamedDimsArray +function Base.unsafe_convert(::Type{CUDAdrv.CuPtr{T}}, A::AbstractArray) where {T} + if A === parent(A) + throw(MethodError(Base.unsafe_convert, Tuple{CUDAdrv.CuPtr{T}, typeof(A)})) + else + return Base.unsafe_convert(CUDAdrv.CuPtr{T}, parent(A)) + end +end + + +# This is https://github.com/JuliaLang/julia/pull/35304, here just for testing now: +Base.similar(A::PermutedDimsArray, T::Type, dims::Base.Dims) = similar(parent(A), T, dims) From 93489044259c290ffb70570f75cd2c1e7303e7d6 Mon Sep 17 00:00:00 2001 From: Michael Abbott Date: Thu, 2 Apr 2020 01:01:47 +0200 Subject: [PATCH 3/7] choose method using is_strided_cu(A) --- src/CuArrays.jl.save | 91 ++++++++++++++++++++++++++++++++++++++++++ src/CuArrays.jl.save.1 | 91 ++++++++++++++++++++++++++++++++++++++++++ src/nnlib.jl | 84 +++++++++++++++++++++++++++++++------- test/nnlib.jl | 22 ++++++++++ 4 files changed, 274 insertions(+), 14 deletions(-) create mode 100644 src/CuArrays.jl.save create mode 100644 src/CuArrays.jl.save.1 diff --git a/src/CuArrays.jl.save b/src/CuArrays.jl.save new file mode 100644 index 00000000..7a5f77ad --- /dev/null +++ b/src/CuArrays.jl.save @@ -0,0 +1,91 @@ +module CuArrays + +using CUDAapi, CUDAdrv, CUDAnative + +using GPUArrays + +export CuArray, CuVector, CuMatrix, CuVecOrMat, cu +export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND, CUDNN, CUTENSOR + +import LinearAlgebra + +using Adapt + +using Libdl + +using Requires + + +## source code includes + +include("bindeps.jl") + +# core array functionality +include("memory.jl") +include("array.jl") +include("gpuarrays.jl") +include("subarray.jl") +include("utils.jl") + +# integrations and specialized functionality +include("indexing.jl") +include("broadcast.jl") +include("mapreduce.jl") +include("accumulate.jl") +include("linalg.jl") + +# vendor libraries +include("blas/CUBLAS.jl") +include("sparse/CUSPARSE.jl") +include("solver/CUSOLVER.jl") +include("fft/CUFFT.jl") +include("rand/CURAND.jl") +include("dnn/CUDNN.jl") +include("tensor/CUTENSOR.jl") + +include("deprecated.jl") + +include("nnlib.jl") + +## initialization + +const __initialized__ = Ref(false) +functional() = __initialized__[] + +export has_cudnn, has_cutensor +has_cudnn() = Libdl.dlopen_e(CUDNN.libcudnn[]) !== C_NULL +has_cutensor() = Libdl.dlopen_e(CUTENSOR.libcutensor[]) !== C_NULL + +function __init__() + precompiling = ccall(:jl_generating_output, Cint, ()) != 0 + silent = parse(Bool, get(ENV, "JULIA_CUDA_SILENT", "false")) || precompiling + verbose = parse(Bool, get(ENV, "JULIA_CUDA_VERBOSE", "false")) + + # if any dependent GPU package failed, expect it to have logged an error and bail out + if !CUDAdrv.functional() || !CUDAnative.functional() + verbose && @warn "CuArrays.jl did not initialize because CUDAdrv.jl or CUDAnative.jl failed 
to" + return + end + + try + __init_bindeps__(silent=silent, verbose=verbose) + + # package integrations + @require ForwardDiff="f6369f11-7733-5829-9624-2563aa707210" include("forwarddiff.jl") + + __init_memory__() + + __initialized__[] = true + catch ex + # don't actually fail to keep the package loadable + if !silent + if verbose + @error "CuArrays.jl failed to initialize" exception=(ex, catch_backtrace()) + else + @info "CuArrays.jl failed to initialize and will be unavailable (set JULIA_CUDA_SILENT or JULIA_CUDA_VERBOSE to silence or expand this message)" + end + end + end +end + +end # module diff --git a/src/CuArrays.jl.save.1 b/src/CuArrays.jl.save.1 new file mode 100644 index 00000000..7a5f77ad --- /dev/null +++ b/src/CuArrays.jl.save.1 @@ -0,0 +1,91 @@ +module CuArrays + +using CUDAapi, CUDAdrv, CUDAnative + +using GPUArrays + +export CuArray, CuVector, CuMatrix, CuVecOrMat, cu +export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND, CUDNN, CUTENSOR + +import LinearAlgebra + +using Adapt + +using Libdl + +using Requires + + +## source code includes + +include("bindeps.jl") + +# core array functionality +include("memory.jl") +include("array.jl") +include("gpuarrays.jl") +include("subarray.jl") +include("utils.jl") + +# integrations and specialized functionality +include("indexing.jl") +include("broadcast.jl") +include("mapreduce.jl") +include("accumulate.jl") +include("linalg.jl") + +# vendor libraries +include("blas/CUBLAS.jl") +include("sparse/CUSPARSE.jl") +include("solver/CUSOLVER.jl") +include("fft/CUFFT.jl") +include("rand/CURAND.jl") +include("dnn/CUDNN.jl") +include("tensor/CUTENSOR.jl") + +include("deprecated.jl") + +include("nnlib.jl") + +## initialization + +const __initialized__ = Ref(false) +functional() = __initialized__[] + +export has_cudnn, has_cutensor +has_cudnn() = Libdl.dlopen_e(CUDNN.libcudnn[]) !== C_NULL +has_cutensor() = Libdl.dlopen_e(CUTENSOR.libcutensor[]) !== C_NULL + +function __init__() + precompiling = ccall(:jl_generating_output, Cint, ()) != 0 + silent = parse(Bool, get(ENV, "JULIA_CUDA_SILENT", "false")) || precompiling + verbose = parse(Bool, get(ENV, "JULIA_CUDA_VERBOSE", "false")) + + # if any dependent GPU package failed, expect it to have logged an error and bail out + if !CUDAdrv.functional() || !CUDAnative.functional() + verbose && @warn "CuArrays.jl did not initialize because CUDAdrv.jl or CUDAnative.jl failed to" + return + end + + try + __init_bindeps__(silent=silent, verbose=verbose) + + # package integrations + @require ForwardDiff="f6369f11-7733-5829-9624-2563aa707210" include("forwarddiff.jl") + + __init_memory__() + + __initialized__[] = true + catch ex + # don't actually fail to keep the package loadable + if !silent + if verbose + @error "CuArrays.jl failed to initialize" exception=(ex, catch_backtrace()) + else + @info "CuArrays.jl failed to initialize and will be unavailable (set JULIA_CUDA_SILENT or JULIA_CUDA_VERBOSE to silence or expand this message)" + end + end + end +end + +end # module diff --git a/src/nnlib.jl b/src/nnlib.jl index 24523710..63f661c0 100644 --- a/src/nnlib.jl +++ b/src/nnlib.jl @@ -33,43 +33,62 @@ end # Batched matrix multiplication +# This method has a slightly tighter signature than the one in NNlib, all same eltype. +function NNlib.batched_mul!(C::AbstractArray{T,3}, A::AbstractArray{T,3}, B::AbstractArray{T,3}) where {T<:CUBLAS.CublasFloat} + if is_strided_cu(A) && is_strided_cu(B) && is_strided_cu(C) + # Data is on GPU, and it's safe to call strides(A). gemm_strided_batched may be legal. 
+ batched_try_gemm!(C, A, B) + + elseif is_strided_cu(A) || is_strided_cu(B) || is_strided_cu(C) + # This is hopeless, but best option is the fallback + @debug "weird mix of CPU + GPU?" + NNlib.batched_mul_generic!(C, A, B) + + else + # All cases for CPU gemm! will come through here, is_strided_cu(A) compiles away: + NNlib.batched_mul_cpu!(C, A, B) + end +end + const batched_gemm_args = [ - (:(AbstractArray{T, 3}), 'N', :identity), - (:(NNlib.BatchedTranspose{T, <:AbstractArray{T, 3}}), 'T', :batched_transpose), - (:(NNlib.BatchedAdjoint{T, <:AbstractArray{T, 3}}), 'C', :batched_adjoint) + (:(AbstractArray{T, 3}), 'N', :identity), + (:(NNlib.BatchedTranspose{T}), 'T', :batched_transpose), + (:(NNlib.BatchedAdjoint{T}), 'C', :batched_adjoint) ] using NNlib: batched_mul!, BatchedTranspose, BatchedAdjoint, batched_transpose, batched_adjoint +using NNlib: _unbatch, _perm12 for (TA, transA, fA) in batched_gemm_args, (TB, transB, fB) in batched_gemm_args - @eval function NNlib.batched_mul!(C::CuArray{T, 3}, A::$TA, B::$TB) where {T<:CUBLAS.CublasFloat} + @eval function batched_try_gemm!(C::AbstractArray{T, 3}, A::$TA, B::$TB) where {T<:CUBLAS.CublasFloat} - Abase, Bbase = NNlib._unbatch(A), NNlib._unbatch(B) + Abase, Bbase = _unbatch(A), _unbatch(B) # Best case, we can call batched_gemm! immediately: if Base.stride(Abase,1) == Base.stride(Bbase,1) == Base.stride(C,1) == 1 - CuArrays.CUBLAS.gemm_strided_batched!($transA, $transB, one(T), NNlib._unbatch(A), NNlib._unbatch(B), zero(T), C) + CuArrays.CUBLAS.gemm_strided_batched!($transA, $transB, one(T), Abase, Bbase, zero(T), C) # Second-best, can we fix it by Perm.ing the base, and adjusing 'T' label? # But only if we won't produce BatchedTranspose(BatchedAdjoint(complex array)). - elseif Base.stride(Abase,2) == 1 && !(T<:Complex && $TA<:NNlib.BatchedAdjoint) - newAbase = NNlib.batched_transpose(PermutedDimsArray(Abase, (2,1,3))) - return NNlib.batched_mul!(C, $fA(newAbase), B) - elseif Base.stride(Bbase,2) == 1 && !(T<:Complex && $TB<:NNlib.BatchedAdjoint) - newBbase = NNlib.batched_transpose(PermutedDimsArray(Bbase, (2,1,3))) - return NNlib.batched_mul!(C, A, $fB(newBbase)) + elseif Base.stride(Abase,2) == 1 && !(T<:Complex && $TA<:BatchedAdjoint) + newAbase = batched_transpose(_perm12(Abase)) + return batched_try_gemm!(C, $fA(newAbase), B) + + elseif Base.stride(Bbase,2) == 1 && !(T<:Complex && $TB<:BatchedAdjoint) + newBbase = batched_transpose(_perm12(Bbase)) + return batched_try_gemm!(C, A, $fB(newBbase)) # Fallback, e.g when Base.stride(A,3)==1 else + @debug "couldn't re-arrange strides for CUBLAS.gemm_strided_batched!" strides(A) strides(B) strides(C) NNlib.batched_mul_generic!(C, A, B) end C end end + # This is obviously the wrong place for this! Not sure where it should go. -# Base.unsafe_convert(::Type{CUDAdrv.CuPtr{T}}, A::PermutedDimsArray) where {T} = -# Base.unsafe_convert(CUDAdrv.CuPtr{T}, parent(A)) # Recursive version, will handle e.g. NamedDimsArray function Base.unsafe_convert(::Type{CUDAdrv.CuPtr{T}}, A::AbstractArray) where {T} if A === parent(A) @@ -82,3 +101,40 @@ end # This is https://github.com/JuliaLang/julia/pull/35304, here just for testing now: Base.similar(A::PermutedDimsArray, T::Type, dims::Base.Dims) = similar(parent(A), T, dims) + + +# Also the wong place for this, surely. +""" + is_strided_cu(A) + +This should return `true` for `A::CuArray`, and also for: +* Any `view(::CuArray)` or `reshape(::CuArray)` etc. 
which remains a `StridedArray` +* Any other wrapper for which `is_strided_cu(parent(A))` +* Except that `Adjoint(A)` is only unwrapped for real numbers. + +Such wrappers include `PermutedDimsArray(::CuArray, ...)`, +but also those defined elsewhere (such as `NamedDimsArray`s) +which are assumed not to break strided-ness. + +`Transpose` and `Adjoint` don't currently define `strides`, so for now they return `false`. +""" +is_strided_cu(A::CuArray) = true +is_strided_cu(A) = false +function is_strided_cu(A::AbstractArray) + M = parentmodule(typeof(A)) + if parent(A) === A # Array, SparseMatrix, StaticArray + false + elseif M === Base || M === Core || M ===LinearAlgebra + A isa StridedArray && is_strided_cu(parent(A)) + else + is_strided_cu(parent(A)) # PermutedDimsArray, NamedDimsArray + end +end + +if hasmethod(Base.strides, Tuple{LinearAlgebra.Transpose}) + is_strided_cu(A::LinearAlgebra.Transpose) = is_strided(parent(A)) + is_strided_cu(A::LinearAlgebra.Adjoint) = eltype(A) <: Real && is_strided(parent(A)) +else + is_strided_cu(A::LinearAlgebra.Transpose) = false + is_strided_cu(A::LinearAlgebra.Adjoint) = false +end diff --git a/test/nnlib.jl b/test/nnlib.jl index e8e1462d..cfa1d8a7 100644 --- a/test/nnlib.jl +++ b/test/nnlib.jl @@ -16,4 +16,26 @@ @test cu(Ca) ≈ batched_mul(cu(A), batched_adjoint(cu(B))) end +using CuArrays: is_strided_cu +using LinearAlgebra +@testset "is_strided_cu" begin + + M = cu(ones(10,10)) + + @test is_strided_cu(M) + @test is_strided_cu(view(M, 1:2:5,:)) + @test is_strided_cu(PermutedDimsArray(M, (2,1))) + + @test !is_strided_cu(reshape(view(M, 1:2:10,:), 10,:)) + @test !is_strided_cu((M.+im)') + @test !is_strided_cu(ones(10,10)) + @test !is_strided_cu(Diagonal(ones(3))) + + #= + using NamedDims + @test is_strided(NamedDimsArray(M,(:a, :b))) # and 0.029 ns, 0 allocations + =# + +end + end From 8bc792bdd598909864e75ac234432665fb9eba60 Mon Sep 17 00:00:00 2001 From: Michael Abbott Date: Thu, 2 Apr 2020 01:04:06 +0200 Subject: [PATCH 4/7] rm .save --- src/CuArrays.jl.save | 91 ------------------------------------------ src/CuArrays.jl.save.1 | 91 ------------------------------------------ 2 files changed, 182 deletions(-) delete mode 100644 src/CuArrays.jl.save delete mode 100644 src/CuArrays.jl.save.1 diff --git a/src/CuArrays.jl.save b/src/CuArrays.jl.save deleted file mode 100644 index 7a5f77ad..00000000 --- a/src/CuArrays.jl.save +++ /dev/null @@ -1,91 +0,0 @@ -module CuArrays - -using CUDAapi, CUDAdrv, CUDAnative - -using GPUArrays - -export CuArray, CuVector, CuMatrix, CuVecOrMat, cu -export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND, CUDNN, CUTENSOR - -import LinearAlgebra - -using Adapt - -using Libdl - -using Requires - - -## source code includes - -include("bindeps.jl") - -# core array functionality -include("memory.jl") -include("array.jl") -include("gpuarrays.jl") -include("subarray.jl") -include("utils.jl") - -# integrations and specialized functionality -include("indexing.jl") -include("broadcast.jl") -include("mapreduce.jl") -include("accumulate.jl") -include("linalg.jl") - -# vendor libraries -include("blas/CUBLAS.jl") -include("sparse/CUSPARSE.jl") -include("solver/CUSOLVER.jl") -include("fft/CUFFT.jl") -include("rand/CURAND.jl") -include("dnn/CUDNN.jl") -include("tensor/CUTENSOR.jl") - -include("deprecated.jl") - -include("nnlib.jl") - -## initialization - -const __initialized__ = Ref(false) -functional() = __initialized__[] - -export has_cudnn, has_cutensor -has_cudnn() = Libdl.dlopen_e(CUDNN.libcudnn[]) !== C_NULL -has_cutensor() = 
Libdl.dlopen_e(CUTENSOR.libcutensor[]) !== C_NULL - -function __init__() - precompiling = ccall(:jl_generating_output, Cint, ()) != 0 - silent = parse(Bool, get(ENV, "JULIA_CUDA_SILENT", "false")) || precompiling - verbose = parse(Bool, get(ENV, "JULIA_CUDA_VERBOSE", "false")) - - # if any dependent GPU package failed, expect it to have logged an error and bail out - if !CUDAdrv.functional() || !CUDAnative.functional() - verbose && @warn "CuArrays.jl did not initialize because CUDAdrv.jl or CUDAnative.jl failed to" - return - end - - try - __init_bindeps__(silent=silent, verbose=verbose) - - # package integrations - @require ForwardDiff="f6369f11-7733-5829-9624-2563aa707210" include("forwarddiff.jl") - - __init_memory__() - - __initialized__[] = true - catch ex - # don't actually fail to keep the package loadable - if !silent - if verbose - @error "CuArrays.jl failed to initialize" exception=(ex, catch_backtrace()) - else - @info "CuArrays.jl failed to initialize and will be unavailable (set JULIA_CUDA_SILENT or JULIA_CUDA_VERBOSE to silence or expand this message)" - end - end - end -end - -end # module diff --git a/src/CuArrays.jl.save.1 b/src/CuArrays.jl.save.1 deleted file mode 100644 index 7a5f77ad..00000000 --- a/src/CuArrays.jl.save.1 +++ /dev/null @@ -1,91 +0,0 @@ -module CuArrays - -using CUDAapi, CUDAdrv, CUDAnative - -using GPUArrays - -export CuArray, CuVector, CuMatrix, CuVecOrMat, cu -export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND, CUDNN, CUTENSOR - -import LinearAlgebra - -using Adapt - -using Libdl - -using Requires - - -## source code includes - -include("bindeps.jl") - -# core array functionality -include("memory.jl") -include("array.jl") -include("gpuarrays.jl") -include("subarray.jl") -include("utils.jl") - -# integrations and specialized functionality -include("indexing.jl") -include("broadcast.jl") -include("mapreduce.jl") -include("accumulate.jl") -include("linalg.jl") - -# vendor libraries -include("blas/CUBLAS.jl") -include("sparse/CUSPARSE.jl") -include("solver/CUSOLVER.jl") -include("fft/CUFFT.jl") -include("rand/CURAND.jl") -include("dnn/CUDNN.jl") -include("tensor/CUTENSOR.jl") - -include("deprecated.jl") - -include("nnlib.jl") - -## initialization - -const __initialized__ = Ref(false) -functional() = __initialized__[] - -export has_cudnn, has_cutensor -has_cudnn() = Libdl.dlopen_e(CUDNN.libcudnn[]) !== C_NULL -has_cutensor() = Libdl.dlopen_e(CUTENSOR.libcutensor[]) !== C_NULL - -function __init__() - precompiling = ccall(:jl_generating_output, Cint, ()) != 0 - silent = parse(Bool, get(ENV, "JULIA_CUDA_SILENT", "false")) || precompiling - verbose = parse(Bool, get(ENV, "JULIA_CUDA_VERBOSE", "false")) - - # if any dependent GPU package failed, expect it to have logged an error and bail out - if !CUDAdrv.functional() || !CUDAnative.functional() - verbose && @warn "CuArrays.jl did not initialize because CUDAdrv.jl or CUDAnative.jl failed to" - return - end - - try - __init_bindeps__(silent=silent, verbose=verbose) - - # package integrations - @require ForwardDiff="f6369f11-7733-5829-9624-2563aa707210" include("forwarddiff.jl") - - __init_memory__() - - __initialized__[] = true - catch ex - # don't actually fail to keep the package loadable - if !silent - if verbose - @error "CuArrays.jl failed to initialize" exception=(ex, catch_backtrace()) - else - @info "CuArrays.jl failed to initialize and will be unavailable (set JULIA_CUDA_SILENT or JULIA_CUDA_VERBOSE to silence or expand this message)" - end - end - end -end - -end # module From 
0c6f5c68b1cad82ad264ae5f132d46072f92223e Mon Sep 17 00:00:00 2001 From: Michael Abbott Date: Sat, 4 Apr 2020 00:22:59 +0200 Subject: [PATCH 5/7] use _batched_gemm and storage_type from https://github.com/FluxML/NNlib.jl/pull/191 --- src/nnlib.jl | 105 ++------------------------------------------------ test/nnlib.jl | 23 +++++------ 2 files changed, 14 insertions(+), 114 deletions(-) diff --git a/src/nnlib.jl b/src/nnlib.jl index 63f661c0..d6cf1269 100644 --- a/src/nnlib.jl +++ b/src/nnlib.jl @@ -32,109 +32,12 @@ end # Batched matrix multiplication +# Using storage_type from https://github.com/FluxML/NNlib.jl/pull/191 -# This method has a slightly tighter signature than the one in NNlib, all same eltype. -function NNlib.batched_mul!(C::AbstractArray{T,3}, A::AbstractArray{T,3}, B::AbstractArray{T,3}) where {T<:CUBLAS.CublasFloat} - if is_strided_cu(A) && is_strided_cu(B) && is_strided_cu(C) - # Data is on GPU, and it's safe to call strides(A). gemm_strided_batched may be legal. - batched_try_gemm!(C, A, B) - - elseif is_strided_cu(A) || is_strided_cu(B) || is_strided_cu(C) - # This is hopeless, but best option is the fallback - @debug "weird mix of CPU + GPU?" - NNlib.batched_mul_generic!(C, A, B) - - else - # All cases for CPU gemm! will come through here, is_strided_cu(A) compiles away: - NNlib.batched_mul_cpu!(C, A, B) - end -end - -const batched_gemm_args = [ - (:(AbstractArray{T, 3}), 'N', :identity), - (:(NNlib.BatchedTranspose{T}), 'T', :batched_transpose), - (:(NNlib.BatchedAdjoint{T}), 'C', :batched_adjoint) -] - -using NNlib: batched_mul!, BatchedTranspose, BatchedAdjoint, batched_transpose, batched_adjoint -using NNlib: _unbatch, _perm12 - -for (TA, transA, fA) in batched_gemm_args, (TB, transB, fB) in batched_gemm_args - @eval function batched_try_gemm!(C::AbstractArray{T, 3}, A::$TA, B::$TB) where {T<:CUBLAS.CublasFloat} - - Abase, Bbase = _unbatch(A), _unbatch(B) - - # Best case, we can call batched_gemm! immediately: - if Base.stride(Abase,1) == Base.stride(Bbase,1) == Base.stride(C,1) == 1 - CuArrays.CUBLAS.gemm_strided_batched!($transA, $transB, one(T), Abase, Bbase, zero(T), C) - - # Second-best, can we fix it by Perm.ing the base, and adjusing 'T' label? - # But only if we won't produce BatchedTranspose(BatchedAdjoint(complex array)). - elseif Base.stride(Abase,2) == 1 && !(T<:Complex && $TA<:BatchedAdjoint) - newAbase = batched_transpose(_perm12(Abase)) - return batched_try_gemm!(C, $fA(newAbase), B) - - elseif Base.stride(Bbase,2) == 1 && !(T<:Complex && $TB<:BatchedAdjoint) - newBbase = batched_transpose(_perm12(Bbase)) - return batched_try_gemm!(C, A, $fB(newBbase)) - - # Fallback, e.g when Base.stride(A,3)==1 - else - @debug "couldn't re-arrange strides for CUBLAS.gemm_strided_batched!" strides(A) strides(B) strides(C) - NNlib.batched_mul_generic!(C, A, B) - end - C - end -end - - -# This is obviously the wrong place for this! Not sure where it should go. -# Recursive version, will handle e.g. 
NamedDimsArray -function Base.unsafe_convert(::Type{CUDAdrv.CuPtr{T}}, A::AbstractArray) where {T} - if A === parent(A) - throw(MethodError(Base.unsafe_convert, Tuple{CUDAdrv.CuPtr{T}, typeof(A)})) - else - return Base.unsafe_convert(CUDAdrv.CuPtr{T}, parent(A)) - end -end - +NNlib._batched_gemm!(::Type{<:CuArray}, transA::Char, transB::Char, α::Number, A, B, β::Number, C) = + CuArrays.CUBLAS.gemm_strided_batched!(transA, transB, α, A, B, β, C) # This is https://github.com/JuliaLang/julia/pull/35304, here just for testing now: Base.similar(A::PermutedDimsArray, T::Type, dims::Base.Dims) = similar(parent(A), T, dims) +# @which Base.similar(PermutedDimsArray(rand(2,2), (2,1)), Int, Base.Dims{2}((3,3))) - -# Also the wong place for this, surely. -""" - is_strided_cu(A) - -This should return `true` for `A::CuArray`, and also for: -* Any `view(::CuArray)` or `reshape(::CuArray)` etc. which remains a `StridedArray` -* Any other wrapper for which `is_strided_cu(parent(A))` -* Except that `Adjoint(A)` is only unwrapped for real numbers. - -Such wrappers include `PermutedDimsArray(::CuArray, ...)`, -but also those defined elsewhere (such as `NamedDimsArray`s) -which are assumed not to break strided-ness. - -`Transpose` and `Adjoint` don't currently define `strides`, so for now they return `false`. -""" -is_strided_cu(A::CuArray) = true -is_strided_cu(A) = false -function is_strided_cu(A::AbstractArray) - M = parentmodule(typeof(A)) - if parent(A) === A # Array, SparseMatrix, StaticArray - false - elseif M === Base || M === Core || M ===LinearAlgebra - A isa StridedArray && is_strided_cu(parent(A)) - else - is_strided_cu(parent(A)) # PermutedDimsArray, NamedDimsArray - end -end - -if hasmethod(Base.strides, Tuple{LinearAlgebra.Transpose}) - is_strided_cu(A::LinearAlgebra.Transpose) = is_strided(parent(A)) - is_strided_cu(A::LinearAlgebra.Adjoint) = eltype(A) <: Real && is_strided(parent(A)) -else - is_strided_cu(A::LinearAlgebra.Transpose) = false - is_strided_cu(A::LinearAlgebra.Adjoint) = false -end diff --git a/test/nnlib.jl b/test/nnlib.jl index cfa1d8a7..12ec19d7 100644 --- a/test/nnlib.jl +++ b/test/nnlib.jl @@ -16,25 +16,22 @@ @test cu(Ca) ≈ batched_mul(cu(A), batched_adjoint(cu(B))) end -using CuArrays: is_strided_cu +using NNlib: is_strided, are_strided, storage_type using LinearAlgebra -@testset "is_strided_cu" begin +@testset "NNlib storage_type etc." 
begin M = cu(ones(10,10)) - @test is_strided_cu(M) - @test is_strided_cu(view(M, 1:2:5,:)) - @test is_strided_cu(PermutedDimsArray(M, (2,1))) + @test is_strided(M) + @test is_strided(view(M, 1:2:5,:)) + @test is_strided(PermutedDimsArray(M, (2,1))) - @test !is_strided_cu(reshape(view(M, 1:2:10,:), 10,:)) - @test !is_strided_cu((M.+im)') - @test !is_strided_cu(ones(10,10)) - @test !is_strided_cu(Diagonal(ones(3))) + @test !is_strided(reshape(view(M, 1:2:10,:), 10,:)) + @test !is_strided((M.+im)') + @test !is_strided(Diagonal(cu(ones(3)))) - #= - using NamedDims - @test is_strided(NamedDimsArray(M,(:a, :b))) # and 0.029 ns, 0 allocations - =# + @test storage_type(M) == CuArray{Float32,2,Nothing} + @test storage_type(reshape(view(M, 1:2:10,:), 10,:)) == CuArray{Float32,2,Nothing} end From 4d8c279e1744a6d579c5b4e5f273f4855c636774 Mon Sep 17 00:00:00 2001 From: Michael Abbott Date: Sat, 4 Apr 2020 10:16:30 +0200 Subject: [PATCH 6/7] allow size(A,3)==1 with size(B,3)==size(C,3) --- src/blas/wrappers.jl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/blas/wrappers.jl b/src/blas/wrappers.jl index d0a58f42..765e40cf 100644 --- a/src/blas/wrappers.jl +++ b/src/blas/wrappers.jl @@ -945,7 +945,8 @@ for (fname, elty) in k = size(A, transA == 'N' ? 2 : 1) n = size(B, transB == 'N' ? 2 : 1) - @assert size(A, 3) == size(B, 3) == size(C, 3) "Batch size mismatch" + @assert size(A, 3) == size(C, 3) || size(A, 3) == 1 "batch size mismatch: A != C" + @assert size(B, 3) == size(C, 3) || size(B, 3) == 1 "batch size mismatch: B != C" if m != size(C,1) || n != size(C,2) || k != size(B, transB == 'N' ? 1 : 2) throw(DimensionMismatch("")) @@ -956,10 +957,10 @@ for (fname, elty) in ldb = max(1,stride(B,2)) ldc = max(1,stride(C,2)) - strideA = stride(A, 3) - strideB = stride(B, 3) + strideA = size(A, 3) == 1 ? 0 : stride(A, 3) + strideB = size(B, 3) == 1 ? 0 : stride(B, 3) strideC = stride(C, 3) - batchCount = size(A, 3) + batchCount = size(C, 3) $fname(handle(), cutransA,cutransB, m, n, k, [alpha], A, lda, strideA, B, ldb, strideB, [beta], C, ldc, strideC, batchCount) C @@ -969,7 +970,7 @@ for (fname, elty) in alpha::($elty), A::AbstractArray{$elty, 3}, B::AbstractArray{$elty, 3}) - C = similar(B, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 2 : 1), size(A, 3))) + C = similar(B, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 
2 : 1), max(size(A, 3), size(B, 3)))) gemm_strided_batched!(transA, transB, alpha, A, B, zero($elty), C ) end function gemm_strided_batched(transA::Char, From 73e2ef4c8a66e3260607df67306d225ef722a056 Mon Sep 17 00:00:00 2001 From: Michael Abbott Date: Thu, 30 Apr 2020 23:29:19 +0200 Subject: [PATCH 7/7] use Compat 3.9 for similar(PermutedDimsArray) --- Project.toml | 2 ++ src/nnlib.jl | 4 ---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index af474927..e9c3df61 100644 --- a/Project.toml +++ b/Project.toml @@ -9,6 +9,7 @@ CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" +Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" @@ -28,6 +29,7 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" AbstractFFTs = "0.4, 0.5" Adapt = "1.0" CEnum = "0.2" +Compat = "3.9" CUDAapi = "3.0, 4.0" CUDAdrv = "6.0.1" CUDAnative = "3.0" diff --git a/src/nnlib.jl b/src/nnlib.jl index d6cf1269..842ac175 100644 --- a/src/nnlib.jl +++ b/src/nnlib.jl @@ -37,7 +37,3 @@ end NNlib._batched_gemm!(::Type{<:CuArray}, transA::Char, transB::Char, α::Number, A, B, β::Number, C) = CuArrays.CUBLAS.gemm_strided_batched!(transA, transB, α, A, B, β, C) -# This is https://github.com/JuliaLang/julia/pull/35304, here just for testing now: -Base.similar(A::PermutedDimsArray, T::Type, dims::Base.Dims) = similar(parent(A), T, dims) -# @which Base.similar(PermutedDimsArray(rand(2,2), (2,1)), Int, Base.Dims{2}((3,3))) -
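
A minimal usage sketch, appended after the patch series rather than inside it: it shows how batched multiplication is expected to flow once CuArrays carries these changes and NNlib includes https://github.com/FluxML/NNlib.jl/pull/191. The sizes, `cu(rand(...))` inputs, and variable names are invented for illustration; the functions exercised (`batched_mul` and `batched_transpose` from NNlib, and `CUBLAS.gemm_strided_batched` as extended above) are the real entry points, but the exact dispatch path is an assumption about the combined state of both packages.

    using CuArrays, NNlib

    A = cu(rand(Float32, 5, 7, 16))           # batch of 16 matrices on the GPU
    B = cu(rand(Float32, 7, 3, 16))

    # Plain strided CuArrays: NNlib should route through _batched_gemm! to
    # CUBLAS.gemm_strided_batched! (patch 7 state).
    C = batched_mul(A, B)                      # 5×3×16 CuArray

    # A PermutedDimsArray of a CuArray is still strided, so with the relaxed
    # AbstractArray signatures it should also reach gemm_strided_batched!
    # (after the stride / 'T'-label rearrangement) instead of falling back
    # to batched_mul_generic!.
    At = PermutedDimsArray(A, (2, 1, 3))       # 7×5×16 view, no copy
    C2 = batched_mul(batched_transpose(At), B)
    Array(C) ≈ Array(C2)                       # expected: true

    # Patch 6: an input with batch size 1 is broadcast over the batch by
    # passing stride 0 for that argument.
    A1 = cu(rand(Float32, 5, 7, 1))
    C3 = CuArrays.CUBLAS.gemm_strided_batched('N', 'N', A1, B)   # 5×3×16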