This repository has been archived by the owner on Mar 12, 2021. It is now read-only.

Make gemm_strided_batched! work with PermutedDimsArrays #664

Closed
wants to merge 7 commits
2 changes: 2 additions & 0 deletions Project.toml
@@ -9,6 +9,7 @@ CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
@@ -28,6 +29,7 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
AbstractFFTs = "0.4, 0.5"
Adapt = "1.0"
CEnum = "0.2"
Compat = "3.9"
CUDAapi = "3.0, 4.0"
CUDAdrv = "6.0.1"
CUDAnative = "3.0"
25 changes: 13 additions & 12 deletions src/blas/wrappers.jl
@@ -937,15 +937,16 @@ for (fname, elty) in
function gemm_strided_batched!(transA::Char,
transB::Char,
alpha::($elty),
A::CuArray{$elty, 3},
Contributor:
I think when I originally wrapped this I intended it as a low-level API. Since both CUDAnative and CuArrays have changed a lot, maybe we need a barer wrapper (taking a pointer type like CuPtr) that directly wraps the CUBLAS API? Then it would be more elegant to have higher-level wrappers for the different Julia array types.

Author:
Can NNlib.batched_mul be this higher-level wrapper? FluxML/NNlib.jl#191 makes it more flexible, and able to dispatch according to the underlying data.

And what can't you do with this wrapper (which works on any AbstractArray for which such a pointer exists) that you could do with a different one?
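
As a concrete illustration of that higher-level route, here is a minimal usage sketch, assuming this PR together with FluxML/NNlib.jl#191; the array names and sizes are made up for the example, and whether the call avoids a copy depends on the dispatch described below.

```julia
using CuArrays, NNlib   # assumes this PR plus FluxML/NNlib.jl#191

A = cu(rand(Float32, 4, 6, 8))
B = cu(rand(Float32, 4, 5, 8))

# A PermutedDimsArray of a CuArray is still strided device memory, so
# batched_mul should be able to hand it to gemm_strided_batched!
# without first materialising a transposed copy.
At = PermutedDimsArray(A, (2, 1, 3))   # lazy batched transpose, 6×4×8
C  = batched_mul(At, B)                # C[:,:,k] ≈ A[:,:,k]' * B[:,:,k]
```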

B::CuArray{$elty, 3},
A::AbstractArray{$elty, 3},
B::AbstractArray{$elty, 3},
beta::($elty),
C::CuArray{$elty, 3})
C::AbstractArray{$elty, 3})
m = size(A, transA == 'N' ? 1 : 2)
k = size(A, transA == 'N' ? 2 : 1)
n = size(B, transB == 'N' ? 2 : 1)

@assert size(A, 3) == size(B, 3) == size(C, 3) "Batch size mismatch"
@assert size(A, 3) == size(C, 3) || size(A, 3) == 1 "batch size mismatch: A != C"
@assert size(B, 3) == size(C, 3) || size(B, 3) == 1 "batch size mismatch: B != C"

if m != size(C,1) || n != size(C,2) || k != size(B, transB == 'N' ? 1 : 2)
throw(DimensionMismatch(""))
@@ -956,26 +957,26 @@ for (fname, elty) in
ldb = max(1,stride(B,2))
ldc = max(1,stride(C,2))

strideA = stride(A, 3)
strideB = stride(B, 3)
strideA = size(A, 3) == 1 ? 0 : stride(A, 3)
strideB = size(B, 3) == 1 ? 0 : stride(B, 3)
strideC = stride(C, 3)
batchCount = size(A, 3)
batchCount = size(C, 3)
$fname(handle(), cutransA,cutransB, m, n, k, [alpha], A, lda, strideA, B,
ldb, strideB, [beta], C, ldc, strideC, batchCount)
C
end
function gemm_strided_batched(transA::Char,
transB::Char,
alpha::($elty),
A::CuArray{$elty, 3},
B::CuArray{$elty, 3})
C = similar(B, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 2 : 1), size(A, 3)))
A::AbstractArray{$elty, 3},
B::AbstractArray{$elty, 3})
C = similar(B, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 2 : 1), max(size(A, 3), size(B, 3))))
gemm_strided_batched!(transA, transB, alpha, A, B, zero($elty), C )
end
function gemm_strided_batched(transA::Char,
transB::Char,
A::CuArray{$elty, 3},
B::CuArray{$elty, 3})
A::AbstractArray{$elty, 3},
B::AbstractArray{$elty, 3})
gemm_strided_batched(transA, transB, one($elty), A, B)
end
end
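The relaxed assertions and the stride-0 trick above implement an implicit batch broadcast: when A (or B) has a single slice along dimension 3, that slice is reused for every batch of C. The plain-Julia loop below is only an illustrative sketch of that rule, not code from this PR, and it assumes transA and transB are 'N' or 'T'.

```julia
using LinearAlgebra  # for transpose on matrices

# Illustrative CPU reference for the new broadcasting rule (not part of the PR).
# A stride of 0 along dim 3 means slice 1 of A (or B) is used for every k.
function ref_gemm_strided_batched!(transA, transB, α, A, B, β, C)
    for k in 1:size(C, 3)                    # batchCount = size(C, 3)
        ka = size(A, 3) == 1 ? 1 : k         # strideA == 0  ⇒  always slice 1
        kb = size(B, 3) == 1 ? 1 : k
        Ak = transA == 'N' ? A[:, :, ka] : transpose(A[:, :, ka])
        Bk = transB == 'N' ? B[:, :, kb] : transpose(B[:, :, kb])
        C[:, :, k] .= α .* (Ak * Bk) .+ β .* C[:, :, k]
    end
    return C
end
```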
16 changes: 4 additions & 12 deletions src/nnlib.jl
@@ -32,16 +32,8 @@ end


# Batched matrix multiplication
# Using storage_type from https://github.com/FluxML/NNlib.jl/pull/191

NNlib._batched_gemm!(::Type{<:CuArray}, transA::Char, transB::Char, α::Number, A, B, β::Number, C) =
CuArrays.CUBLAS.gemm_strided_batched!(transA, transB, α, A, B, β, C)

const batched_gemm_args = [
(:(CuArray{T, 3}), 'N'),
(:(NNlib.BatchedTranspose{T, <:CuArray{T, 3}}), 'T'),
(:(NNlib.BatchedAdjoint{T, <:CuArray{T, 3}}), 'C')
]

for (TA, transA) in batched_gemm_args, (TB, transB) in batched_gemm_args
@eval function NNlib.batched_mul!(C::CuArray{T, 3}, A::$TA, B::$TB) where {T<:CUBLAS.CublasFloat}
CuArrays.CUBLAS.gemm_strided_batched!($transA, $transB, one(T), NNlib._unbatch(A), NNlib._unbatch(B), zero(T), C)
C
end
end
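
With NNlib#191, the 3×3 table of @eval'd methods above is replaced by a single method keyed on the storage type, which NNlib recovers by unwrapping wrappers such as PermutedDimsArray. A rough sketch of how that dispatch is meant to resolve; the names come from NNlib#191 and the exact output types are an assumption:

```julia
using CuArrays, NNlib
using NNlib: storage_type, is_strided

A = cu(rand(Float32, 3, 3, 5))
P = PermutedDimsArray(A, (2, 1, 3))

# storage_type unwraps PermutedDimsArray and similar wrappers down to the
# underlying array type, so both of these should report a CuArray ...
storage_type(A)          # expected: CuArray{Float32,3,Nothing}
storage_type(P)          # expected: CuArray{Float32,3,Nothing}
is_strided(P)            # expected: true, so no copy should be needed

# ... which is what selects the CUBLAS method defined above:
# NNlib._batched_gemm!(::Type{<:CuArray}, transA, transB, α, A, B, β, C)
```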
19 changes: 19 additions & 0 deletions test/nnlib.jl
@@ -16,4 +16,23 @@
@test cu(Ca) ≈ batched_mul(cu(A), batched_adjoint(cu(B)))
end

using NNlib: is_strided, are_strided, storage_type
using LinearAlgebra
@testset "NNlib storage_type etc." begin

M = cu(ones(10,10))

@test is_strided(M)
@test is_strided(view(M, 1:2:5,:))
@test is_strided(PermutedDimsArray(M, (2,1)))

@test !is_strided(reshape(view(M, 1:2:10,:), 10,:))
@test !is_strided((M.+im)')
@test !is_strided(Diagonal(cu(ones(3))))

@test storage_type(M) == CuArray{Float32,2,Nothing}
@test storage_type(reshape(view(M, 1:2:10,:), 10,:)) == CuArray{Float32,2,Nothing}

end

end