From 3eea8a629d5fd7729ecf044e0c591ce9b2d041f1 Mon Sep 17 00:00:00 2001
From: Michael Abbott
Date: Sat, 24 Oct 2020 15:20:06 +0200
Subject: [PATCH] allow PermutedDimsArray in gemm_strided_batched

copied from https://github.com/JuliaGPU/CuArrays.jl/pull/664, needs
https://github.com/FluxML/NNlib.jl/pull/191
---
 Project.toml           |  2 ++
 lib/cublas/wrappers.jl | 25 +++++++++++++------------
 src/nnlib.jl           | 15 +++------------
 test/nnlib.jl          | 19 +++++++++++++++++++
 4 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/Project.toml b/Project.toml
index af679620e8..50534a052c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,6 +7,7 @@ AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
+Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
@@ -31,6 +32,7 @@ AbstractFFTs = "0.4, 0.5"
 Adapt = "2.2"
 BFloat16s = "0.1"
 CEnum = "0.2, 0.3, 0.4"
+Compat = "3.9"
 DataStructures = "0.17, 0.18"
 ExprTools = "0.1"
 GPUArrays = "6.1.0"
diff --git a/lib/cublas/wrappers.jl b/lib/cublas/wrappers.jl
index b2682cae8a..d81faf83f1 100644
--- a/lib/cublas/wrappers.jl
+++ b/lib/cublas/wrappers.jl
@@ -923,15 +923,16 @@ for (fname, elty) in
     function gemm_strided_batched!(transA::Char,
                                    transB::Char,
                                    alpha::Number,
-                                   A::DenseCuArray{$elty, 3},
-                                   B::DenseCuArray{$elty, 3},
+                                   A::AbstractArray{$elty, 3},
+                                   B::AbstractArray{$elty, 3},
                                    beta::Number,
-                                   C::DenseCuArray{$elty, 3})
+                                   C::AbstractArray{$elty, 3})
         m = size(A, transA == 'N' ? 1 : 2)
         k = size(A, transA == 'N' ? 2 : 1)
         n = size(B, transB == 'N' ? 2 : 1)

-        @assert size(A, 3) == size(B, 3) == size(C, 3) "Batch size mismatch"
+        @assert size(A, 3) == size(C, 3) || size(A, 3) == 1 "batch size mismatch: A != C"
+        @assert size(B, 3) == size(C, 3) || size(B, 3) == 1 "batch size mismatch: B != C"

         if m != size(C,1) || n != size(C,2) || k != size(B, transB == 'N' ? 1 : 2)
             throw(DimensionMismatch(""))
@@ -940,10 +941,10 @@ for (fname, elty) in
         lda = max(1,stride(A,2))
         ldb = max(1,stride(B,2))
         ldc = max(1,stride(C,2))
-        strideA = stride(A, 3)
-        strideB = stride(B, 3)
+        strideA = size(A, 3) == 1 ? 0 : stride(A, 3)
+        strideB = size(B, 3) == 1 ? 0 : stride(B, 3)
         strideC = stride(C, 3)
-        batchCount = size(A, 3)
+        batchCount = size(C, 3)
         $fname(handle(), transA, transB, m, n, k, alpha, A, lda, strideA, B,
                ldb, strideB, beta, C, ldc, strideC, batchCount)
         C
@@ -951,15 +952,15 @@ for (fname, elty) in
     function gemm_strided_batched(transA::Char,
                                   transB::Char,
                                   alpha::Number,
-                                  A::DenseCuArray{$elty, 3},
-                                  B::DenseCuArray{$elty, 3})
-        C = similar(B, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 2 : 1), size(A, 3)))
+                                  A::AbstractArray{$elty, 3},
+                                  B::AbstractArray{$elty, 3})
+        C = similar(B, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 2 : 1), max(size(A, 3), size(B, 3))))
         gemm_strided_batched!(transA, transB, alpha, A, B, zero($elty), C)
     end
     function gemm_strided_batched(transA::Char,
                                   transB::Char,
-                                  A::DenseCuArray{$elty, 3},
-                                  B::DenseCuArray{$elty, 3})
+                                  A::AbstractArray{$elty, 3},
+                                  B::AbstractArray{$elty, 3})
         gemm_strided_batched(transA, transB, one($elty), A, B)
     end
 end
diff --git a/src/nnlib.jl b/src/nnlib.jl
index 9ffdba88ee..01d0b3a95b 100644
--- a/src/nnlib.jl
+++ b/src/nnlib.jl
@@ -23,16 +23,7 @@ end

 # Batched matrix multiplication
+# Using storage_type from https://github.com/FluxML/NNlib.jl/pull/191

-const batched_gemm_args = [
-    (:(CuArray{T, 3}), 'N'),
-    (:(NNlib.BatchedTranspose{T, <:CuArray{T, 3}}), 'T'),
-    (:(NNlib.BatchedAdjoint{T, <:CuArray{T, 3}}), 'C')
-]
-
-for (TA, transA) in batched_gemm_args, (TB, transB) in batched_gemm_args
-    @eval function NNlib.batched_mul!(C::CuArray{T, 3}, A::$TA, B::$TB) where {T<:CUBLAS.CublasFloat}
-        CUBLAS.gemm_strided_batched!($transA, $transB, one(T), NNlib._unbatch(A), NNlib._unbatch(B), zero(T), C)
-        C
-    end
-end
+NNlib._batched_gemm!(::Type{<:CuArray}, transA::Char, transB::Char, α::Number, A, B, β::Number, C) =
+    CUBLAS.gemm_strided_batched!(transA, transB, α, A, B, β, C)
diff --git a/test/nnlib.jl b/test/nnlib.jl
index 930b0a1b7d..df810b4361 100644
--- a/test/nnlib.jl
+++ b/test/nnlib.jl
@@ -16,6 +16,25 @@ using NNlib
     @test CuArray(Ca) ≈ batched_mul(CuArray(A), batched_adjoint(CuArray(B)))
 end

+@testset "NNlib storage_type etc." begin
+    using LinearAlgebra
+    using NNlib: is_strided, are_strided, storage_type
+
+    M = cu(ones(10,10))
+
+    @test is_strided(M)
+    @test is_strided(view(M, 1:2:5,:))
+    @test is_strided(PermutedDimsArray(M, (2,1)))
+
+    @test !is_strided(reshape(view(M, 1:2:10,:), 10,:))
+    @test !is_strided((M .+ im)')
+    @test !is_strided(Diagonal(cu(ones(3))))
+
+    @test storage_type(M) == CuArray{Float32,2,Nothing}
+    @test storage_type(reshape(view(M, 1:2:10,:), 10,:)) == CuArray{Float32,2,Nothing}
+
+end
+
 @testset "Broadcast Fix" begin
     if CUDA.has_cudnn()
         @test testf(x -> logσ.(x), rand(5))
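
Below is an illustrative sketch, not part of the patch: it shows the two behaviours
the diff enables, assuming this change and NNlib.jl#191 are both installed. The
array sizes and the variable names A, B, P, A1 are invented for the example.

    using CUDA, NNlib

    A = CUDA.rand(Float32, 5, 7, 16)    # 16 batched 5×7 matrices
    B = CUDA.rand(Float32, 7, 3, 16)

    # storage_type of a PermutedDimsArray of a CuArray is still CuArray, so
    # NNlib._batched_gemm! dispatches to CUBLAS.gemm_strided_batched!
    # rather than a generic fallback:
    P = PermutedDimsArray(CUDA.rand(Float32, 7, 5, 16), (2, 1, 3))  # strided 5×7×16 view
    C = batched_mul(P, B)               # 5×3×16, computed on the GPU

    # Relaxed batch-size check: size(A1, 3) == 1 broadcasts one matrix over
    # every batch of B, implemented by passing strideA = 0 to CUBLAS.
    A1 = CUDA.rand(Float32, 5, 7, 1)
    C2 = CUDA.CUBLAS.gemm_strided_batched('N', 'N', A1, B)  # size(C2) == (5, 3, 16)

Dispatching on storage_type, instead of enumerating wrapper types as the deleted
batched_gemm_args loop did, is what lets wrappers like PermutedDimsArray reach
this fast path.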