From 19d26d8d7d4e8235af8a7af979ebc2ec3927cdcc Mon Sep 17 00:00:00 2001 From: Michael Abbott Date: Tue, 31 Mar 2020 15:45:31 +0200 Subject: [PATCH 1/7] allow AbstractArray in gemm_strided_batched --- src/blas/wrappers.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/blas/wrappers.jl b/src/blas/wrappers.jl index 5862655b..d0a58f42 100644 --- a/src/blas/wrappers.jl +++ b/src/blas/wrappers.jl @@ -937,10 +937,10 @@ for (fname, elty) in function gemm_strided_batched!(transA::Char, transB::Char, alpha::($elty), - A::CuArray{$elty, 3}, - B::CuArray{$elty, 3}, + A::AbstractArray{$elty, 3}, + B::AbstractArray{$elty, 3}, beta::($elty), - C::CuArray{$elty, 3}) + C::AbstractArray{$elty, 3}) m = size(A, transA == 'N' ? 1 : 2) k = size(A, transA == 'N' ? 2 : 1) n = size(B, transB == 'N' ? 2 : 1) @@ -967,15 +967,15 @@ for (fname, elty) in function gemm_strided_batched(transA::Char, transB::Char, alpha::($elty), - A::CuArray{$elty, 3}, - B::CuArray{$elty, 3}) + A::AbstractArray{$elty, 3}, + B::AbstractArray{$elty, 3}) C = similar(B, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 2 : 1), size(A, 3))) gemm_strided_batched!(transA, transB, alpha, A, B, zero($elty), C ) end function gemm_strided_batched(transA::Char, transB::Char, - A::CuArray{$elty, 3}, - B::CuArray{$elty, 3}) + A::AbstractArray{$elty, 3}, + B::AbstractArray{$elty, 3}) gemm_strided_batched(transA, transB, one($elty), A, B) end end From 20851f29d58e8a6cb602373cf6e035ebb445630c Mon Sep 17 00:00:00 2001 From: Michael Abbott Date: Tue, 31 Mar 2020 15:51:50 +0200 Subject: [PATCH 2/7] NNlib.batched_mul allowing PermutedDimsArray, plus... --- src/nnlib.jl | 47 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/src/nnlib.jl b/src/nnlib.jl index 56063b58..24523710 100644 --- a/src/nnlib.jl +++ b/src/nnlib.jl @@ -34,14 +34,51 @@ end # Batched matrix multiplication const batched_gemm_args = [ - (:(CuArray{T, 3}), 'N'), - (:(NNlib.BatchedTranspose{T, <:CuArray{T, 3}}), 'T'), - (:(NNlib.BatchedAdjoint{T, <:CuArray{T, 3}}), 'C') + (:(AbstractArray{T, 3}), 'N', :identity), + (:(NNlib.BatchedTranspose{T, <:AbstractArray{T, 3}}), 'T', :batched_transpose), + (:(NNlib.BatchedAdjoint{T, <:AbstractArray{T, 3}}), 'C', :batched_adjoint) ] -for (TA, transA) in batched_gemm_args, (TB, transB) in batched_gemm_args +using NNlib: batched_mul!, BatchedTranspose, BatchedAdjoint, batched_transpose, batched_adjoint + +for (TA, transA, fA) in batched_gemm_args, (TB, transB, fB) in batched_gemm_args @eval function NNlib.batched_mul!(C::CuArray{T, 3}, A::$TA, B::$TB) where {T<:CUBLAS.CublasFloat} - CuArrays.CUBLAS.gemm_strided_batched!($transA, $transB, one(T), NNlib._unbatch(A), NNlib._unbatch(B), zero(T), C) + + Abase, Bbase = NNlib._unbatch(A), NNlib._unbatch(B) + + # Best case, we can call batched_gemm! immediately: + if Base.stride(Abase,1) == Base.stride(Bbase,1) == Base.stride(C,1) == 1 + CuArrays.CUBLAS.gemm_strided_batched!($transA, $transB, one(T), NNlib._unbatch(A), NNlib._unbatch(B), zero(T), C) + + # Second-best, can we fix it by Perm.ing the base, and adjusing 'T' label? + # But only if we won't produce BatchedTranspose(BatchedAdjoint(complex array)). 
+ elseif Base.stride(Abase,2) == 1 && !(T<:Complex && $TA<:NNlib.BatchedAdjoint) + newAbase = NNlib.batched_transpose(PermutedDimsArray(Abase, (2,1,3))) + return NNlib.batched_mul!(C, $fA(newAbase), B) + elseif Base.stride(Bbase,2) == 1 && !(T<:Complex && $TB<:NNlib.BatchedAdjoint) + newBbase = NNlib.batched_transpose(PermutedDimsArray(Bbase, (2,1,3))) + return NNlib.batched_mul!(C, A, $fB(newBbase)) + + # Fallback, e.g when Base.stride(A,3)==1 + else + NNlib.batched_mul_generic!(C, A, B) + end C end end + +# This is obviously the wrong place for this! Not sure where it should go. +# Base.unsafe_convert(::Type{CUDAdrv.CuPtr{T}}, A::PermutedDimsArray) where {T} = +# Base.unsafe_convert(CUDAdrv.CuPtr{T}, parent(A)) +# Recursive version, will handle e.g. NamedDimsArray +function Base.unsafe_convert(::Type{CUDAdrv.CuPtr{T}}, A::AbstractArray) where {T} + if A === parent(A) + throw(MethodError(Base.unsafe_convert, Tuple{CUDAdrv.CuPtr{T}, typeof(A)})) + else + return Base.unsafe_convert(CUDAdrv.CuPtr{T}, parent(A)) + end +end + + +# This is https://github.com/JuliaLang/julia/pull/35304, here just for testing now: +Base.similar(A::PermutedDimsArray, T::Type, dims::Base.Dims) = similar(parent(A), T, dims) From 93489044259c290ffb70570f75cd2c1e7303e7d6 Mon Sep 17 00:00:00 2001 From: Michael Abbott Date: Thu, 2 Apr 2020 01:01:47 +0200 Subject: [PATCH 3/7] choose method using is_strided_cu(A) --- src/CuArrays.jl.save | 91 ++++++++++++++++++++++++++++++++++++++++++ src/CuArrays.jl.save.1 | 91 ++++++++++++++++++++++++++++++++++++++++++ src/nnlib.jl | 84 +++++++++++++++++++++++++++++++------- test/nnlib.jl | 22 ++++++++++ 4 files changed, 274 insertions(+), 14 deletions(-) create mode 100644 src/CuArrays.jl.save create mode 100644 src/CuArrays.jl.save.1 diff --git a/src/CuArrays.jl.save b/src/CuArrays.jl.save new file mode 100644 index 00000000..7a5f77ad --- /dev/null +++ b/src/CuArrays.jl.save @@ -0,0 +1,91 @@ +module CuArrays + +using CUDAapi, CUDAdrv, CUDAnative + +using GPUArrays + +export CuArray, CuVector, CuMatrix, CuVecOrMat, cu +export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND, CUDNN, CUTENSOR + +import LinearAlgebra + +using Adapt + +using Libdl + +using Requires + + +## source code includes + +include("bindeps.jl") + +# core array functionality +include("memory.jl") +include("array.jl") +include("gpuarrays.jl") +include("subarray.jl") +include("utils.jl") + +# integrations and specialized functionality +include("indexing.jl") +include("broadcast.jl") +include("mapreduce.jl") +include("accumulate.jl") +include("linalg.jl") + +# vendor libraries +include("blas/CUBLAS.jl") +include("sparse/CUSPARSE.jl") +include("solver/CUSOLVER.jl") +include("fft/CUFFT.jl") +include("rand/CURAND.jl") +include("dnn/CUDNN.jl") +include("tensor/CUTENSOR.jl") + +include("deprecated.jl") + +include("nnlib.jl") + +## initialization + +const __initialized__ = Ref(false) +functional() = __initialized__[] + +export has_cudnn, has_cutensor +has_cudnn() = Libdl.dlopen_e(CUDNN.libcudnn[]) !== C_NULL +has_cutensor() = Libdl.dlopen_e(CUTENSOR.libcutensor[]) !== C_NULL + +function __init__() + precompiling = ccall(:jl_generating_output, Cint, ()) != 0 + silent = parse(Bool, get(ENV, "JULIA_CUDA_SILENT", "false")) || precompiling + verbose = parse(Bool, get(ENV, "JULIA_CUDA_VERBOSE", "false")) + + # if any dependent GPU package failed, expect it to have logged an error and bail out + if !CUDAdrv.functional() || !CUDAnative.functional() + verbose && @warn "CuArrays.jl did not initialize because CUDAdrv.jl or CUDAnative.jl failed 
to" + return + end + + try + __init_bindeps__(silent=silent, verbose=verbose) + + # package integrations + @require ForwardDiff="f6369f11-7733-5829-9624-2563aa707210" include("forwarddiff.jl") + + __init_memory__() + + __initialized__[] = true + catch ex + # don't actually fail to keep the package loadable + if !silent + if verbose + @error "CuArrays.jl failed to initialize" exception=(ex, catch_backtrace()) + else + @info "CuArrays.jl failed to initialize and will be unavailable (set JULIA_CUDA_SILENT or JULIA_CUDA_VERBOSE to silence or expand this message)" + end + end + end +end + +end # module diff --git a/src/CuArrays.jl.save.1 b/src/CuArrays.jl.save.1 new file mode 100644 index 00000000..7a5f77ad --- /dev/null +++ b/src/CuArrays.jl.save.1 @@ -0,0 +1,91 @@ +module CuArrays + +using CUDAapi, CUDAdrv, CUDAnative + +using GPUArrays + +export CuArray, CuVector, CuMatrix, CuVecOrMat, cu +export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND, CUDNN, CUTENSOR + +import LinearAlgebra + +using Adapt + +using Libdl + +using Requires + + +## source code includes + +include("bindeps.jl") + +# core array functionality +include("memory.jl") +include("array.jl") +include("gpuarrays.jl") +include("subarray.jl") +include("utils.jl") + +# integrations and specialized functionality +include("indexing.jl") +include("broadcast.jl") +include("mapreduce.jl") +include("accumulate.jl") +include("linalg.jl") + +# vendor libraries +include("blas/CUBLAS.jl") +include("sparse/CUSPARSE.jl") +include("solver/CUSOLVER.jl") +include("fft/CUFFT.jl") +include("rand/CURAND.jl") +include("dnn/CUDNN.jl") +include("tensor/CUTENSOR.jl") + +include("deprecated.jl") + +include("nnlib.jl") + +## initialization + +const __initialized__ = Ref(false) +functional() = __initialized__[] + +export has_cudnn, has_cutensor +has_cudnn() = Libdl.dlopen_e(CUDNN.libcudnn[]) !== C_NULL +has_cutensor() = Libdl.dlopen_e(CUTENSOR.libcutensor[]) !== C_NULL + +function __init__() + precompiling = ccall(:jl_generating_output, Cint, ()) != 0 + silent = parse(Bool, get(ENV, "JULIA_CUDA_SILENT", "false")) || precompiling + verbose = parse(Bool, get(ENV, "JULIA_CUDA_VERBOSE", "false")) + + # if any dependent GPU package failed, expect it to have logged an error and bail out + if !CUDAdrv.functional() || !CUDAnative.functional() + verbose && @warn "CuArrays.jl did not initialize because CUDAdrv.jl or CUDAnative.jl failed to" + return + end + + try + __init_bindeps__(silent=silent, verbose=verbose) + + # package integrations + @require ForwardDiff="f6369f11-7733-5829-9624-2563aa707210" include("forwarddiff.jl") + + __init_memory__() + + __initialized__[] = true + catch ex + # don't actually fail to keep the package loadable + if !silent + if verbose + @error "CuArrays.jl failed to initialize" exception=(ex, catch_backtrace()) + else + @info "CuArrays.jl failed to initialize and will be unavailable (set JULIA_CUDA_SILENT or JULIA_CUDA_VERBOSE to silence or expand this message)" + end + end + end +end + +end # module diff --git a/src/nnlib.jl b/src/nnlib.jl index 24523710..63f661c0 100644 --- a/src/nnlib.jl +++ b/src/nnlib.jl @@ -33,43 +33,62 @@ end # Batched matrix multiplication +# This method has a slightly tighter signature than the one in NNlib, all same eltype. +function NNlib.batched_mul!(C::AbstractArray{T,3}, A::AbstractArray{T,3}, B::AbstractArray{T,3}) where {T<:CUBLAS.CublasFloat} + if is_strided_cu(A) && is_strided_cu(B) && is_strided_cu(C) + # Data is on GPU, and it's safe to call strides(A). gemm_strided_batched may be legal. 
+ batched_try_gemm!(C, A, B) + + elseif is_strided_cu(A) || is_strided_cu(B) || is_strided_cu(C) + # This is hopeless, but best option is the fallback + @debug "weird mix of CPU + GPU?" + NNlib.batched_mul_generic!(C, A, B) + + else + # All cases for CPU gemm! will come through here, is_strided_cu(A) compiles away: + NNlib.batched_mul_cpu!(C, A, B) + end +end + const batched_gemm_args = [ - (:(AbstractArray{T, 3}), 'N', :identity), - (:(NNlib.BatchedTranspose{T, <:AbstractArray{T, 3}}), 'T', :batched_transpose), - (:(NNlib.BatchedAdjoint{T, <:AbstractArray{T, 3}}), 'C', :batched_adjoint) + (:(AbstractArray{T, 3}), 'N', :identity), + (:(NNlib.BatchedTranspose{T}), 'T', :batched_transpose), + (:(NNlib.BatchedAdjoint{T}), 'C', :batched_adjoint) ] using NNlib: batched_mul!, BatchedTranspose, BatchedAdjoint, batched_transpose, batched_adjoint +using NNlib: _unbatch, _perm12 for (TA, transA, fA) in batched_gemm_args, (TB, transB, fB) in batched_gemm_args - @eval function NNlib.batched_mul!(C::CuArray{T, 3}, A::$TA, B::$TB) where {T<:CUBLAS.CublasFloat} + @eval function batched_try_gemm!(C::AbstractArray{T, 3}, A::$TA, B::$TB) where {T<:CUBLAS.CublasFloat} - Abase, Bbase = NNlib._unbatch(A), NNlib._unbatch(B) + Abase, Bbase = _unbatch(A), _unbatch(B) # Best case, we can call batched_gemm! immediately: if Base.stride(Abase,1) == Base.stride(Bbase,1) == Base.stride(C,1) == 1 - CuArrays.CUBLAS.gemm_strided_batched!($transA, $transB, one(T), NNlib._unbatch(A), NNlib._unbatch(B), zero(T), C) + CuArrays.CUBLAS.gemm_strided_batched!($transA, $transB, one(T), Abase, Bbase, zero(T), C) # Second-best, can we fix it by Perm.ing the base, and adjusing 'T' label? # But only if we won't produce BatchedTranspose(BatchedAdjoint(complex array)). - elseif Base.stride(Abase,2) == 1 && !(T<:Complex && $TA<:NNlib.BatchedAdjoint) - newAbase = NNlib.batched_transpose(PermutedDimsArray(Abase, (2,1,3))) - return NNlib.batched_mul!(C, $fA(newAbase), B) - elseif Base.stride(Bbase,2) == 1 && !(T<:Complex && $TB<:NNlib.BatchedAdjoint) - newBbase = NNlib.batched_transpose(PermutedDimsArray(Bbase, (2,1,3))) - return NNlib.batched_mul!(C, A, $fB(newBbase)) + elseif Base.stride(Abase,2) == 1 && !(T<:Complex && $TA<:BatchedAdjoint) + newAbase = batched_transpose(_perm12(Abase)) + return batched_try_gemm!(C, $fA(newAbase), B) + + elseif Base.stride(Bbase,2) == 1 && !(T<:Complex && $TB<:BatchedAdjoint) + newBbase = batched_transpose(_perm12(Bbase)) + return batched_try_gemm!(C, A, $fB(newBbase)) # Fallback, e.g when Base.stride(A,3)==1 else + @debug "couldn't re-arrange strides for CUBLAS.gemm_strided_batched!" strides(A) strides(B) strides(C) NNlib.batched_mul_generic!(C, A, B) end C end end + # This is obviously the wrong place for this! Not sure where it should go. -# Base.unsafe_convert(::Type{CUDAdrv.CuPtr{T}}, A::PermutedDimsArray) where {T} = -# Base.unsafe_convert(CUDAdrv.CuPtr{T}, parent(A)) # Recursive version, will handle e.g. NamedDimsArray function Base.unsafe_convert(::Type{CUDAdrv.CuPtr{T}}, A::AbstractArray) where {T} if A === parent(A) @@ -82,3 +101,40 @@ end # This is https://github.com/JuliaLang/julia/pull/35304, here just for testing now: Base.similar(A::PermutedDimsArray, T::Type, dims::Base.Dims) = similar(parent(A), T, dims) + + +# Also the wong place for this, surely. +""" + is_strided_cu(A) + +This should return `true` for `A::CuArray`, and also for: +* Any `view(::CuArray)` or `reshape(::CuArray)` etc. 
which remains a `StridedArray` +* Any other wrapper for which `is_strided_cu(parent(A))` +* Except that `Adjoint(A)` is only unwrapped for real numbers. + +Such wrappers include `PermutedDimsArray(::CuArray, ...)`, +but also those defined elsewhere (such as `NamedDimsArray`s) +which are assumed not to break strided-ness. + +`Transpose` and `Adjoint` don't currently define `strides`, so for now they return `false`. +""" +is_strided_cu(A::CuArray) = true +is_strided_cu(A) = false +function is_strided_cu(A::AbstractArray) + M = parentmodule(typeof(A)) + if parent(A) === A # Array, SparseMatrix, StaticArray + false + elseif M === Base || M === Core || M ===LinearAlgebra + A isa StridedArray && is_strided_cu(parent(A)) + else + is_strided_cu(parent(A)) # PermutedDimsArray, NamedDimsArray + end +end + +if hasmethod(Base.strides, Tuple{LinearAlgebra.Transpose}) + is_strided_cu(A::LinearAlgebra.Transpose) = is_strided(parent(A)) + is_strided_cu(A::LinearAlgebra.Adjoint) = eltype(A) <: Real && is_strided(parent(A)) +else + is_strided_cu(A::LinearAlgebra.Transpose) = false + is_strided_cu(A::LinearAlgebra.Adjoint) = false +end diff --git a/test/nnlib.jl b/test/nnlib.jl index e8e1462d..cfa1d8a7 100644 --- a/test/nnlib.jl +++ b/test/nnlib.jl @@ -16,4 +16,26 @@ @test cu(Ca) ≈ batched_mul(cu(A), batched_adjoint(cu(B))) end +using CuArrays: is_strided_cu +using LinearAlgebra +@testset "is_strided_cu" begin + + M = cu(ones(10,10)) + + @test is_strided_cu(M) + @test is_strided_cu(view(M, 1:2:5,:)) + @test is_strided_cu(PermutedDimsArray(M, (2,1))) + + @test !is_strided_cu(reshape(view(M, 1:2:10,:), 10,:)) + @test !is_strided_cu((M.+im)') + @test !is_strided_cu(ones(10,10)) + @test !is_strided_cu(Diagonal(ones(3))) + + #= + using NamedDims + @test is_strided(NamedDimsArray(M,(:a, :b))) # and 0.029 ns, 0 allocations + =# + +end + end From 8bc792bdd598909864e75ac234432665fb9eba60 Mon Sep 17 00:00:00 2001 From: Michael Abbott Date: Thu, 2 Apr 2020 01:04:06 +0200 Subject: [PATCH 4/7] rm .save --- src/CuArrays.jl.save | 91 ------------------------------------------ src/CuArrays.jl.save.1 | 91 ------------------------------------------ 2 files changed, 182 deletions(-) delete mode 100644 src/CuArrays.jl.save delete mode 100644 src/CuArrays.jl.save.1 diff --git a/src/CuArrays.jl.save b/src/CuArrays.jl.save deleted file mode 100644 index 7a5f77ad..00000000 --- a/src/CuArrays.jl.save +++ /dev/null @@ -1,91 +0,0 @@ -module CuArrays - -using CUDAapi, CUDAdrv, CUDAnative - -using GPUArrays - -export CuArray, CuVector, CuMatrix, CuVecOrMat, cu -export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND, CUDNN, CUTENSOR - -import LinearAlgebra - -using Adapt - -using Libdl - -using Requires - - -## source code includes - -include("bindeps.jl") - -# core array functionality -include("memory.jl") -include("array.jl") -include("gpuarrays.jl") -include("subarray.jl") -include("utils.jl") - -# integrations and specialized functionality -include("indexing.jl") -include("broadcast.jl") -include("mapreduce.jl") -include("accumulate.jl") -include("linalg.jl") - -# vendor libraries -include("blas/CUBLAS.jl") -include("sparse/CUSPARSE.jl") -include("solver/CUSOLVER.jl") -include("fft/CUFFT.jl") -include("rand/CURAND.jl") -include("dnn/CUDNN.jl") -include("tensor/CUTENSOR.jl") - -include("deprecated.jl") - -include("nnlib.jl") - -## initialization - -const __initialized__ = Ref(false) -functional() = __initialized__[] - -export has_cudnn, has_cutensor -has_cudnn() = Libdl.dlopen_e(CUDNN.libcudnn[]) !== C_NULL -has_cutensor() = 
Libdl.dlopen_e(CUTENSOR.libcutensor[]) !== C_NULL - -function __init__() - precompiling = ccall(:jl_generating_output, Cint, ()) != 0 - silent = parse(Bool, get(ENV, "JULIA_CUDA_SILENT", "false")) || precompiling - verbose = parse(Bool, get(ENV, "JULIA_CUDA_VERBOSE", "false")) - - # if any dependent GPU package failed, expect it to have logged an error and bail out - if !CUDAdrv.functional() || !CUDAnative.functional() - verbose && @warn "CuArrays.jl did not initialize because CUDAdrv.jl or CUDAnative.jl failed to" - return - end - - try - __init_bindeps__(silent=silent, verbose=verbose) - - # package integrations - @require ForwardDiff="f6369f11-7733-5829-9624-2563aa707210" include("forwarddiff.jl") - - __init_memory__() - - __initialized__[] = true - catch ex - # don't actually fail to keep the package loadable - if !silent - if verbose - @error "CuArrays.jl failed to initialize" exception=(ex, catch_backtrace()) - else - @info "CuArrays.jl failed to initialize and will be unavailable (set JULIA_CUDA_SILENT or JULIA_CUDA_VERBOSE to silence or expand this message)" - end - end - end -end - -end # module diff --git a/src/CuArrays.jl.save.1 b/src/CuArrays.jl.save.1 deleted file mode 100644 index 7a5f77ad..00000000 --- a/src/CuArrays.jl.save.1 +++ /dev/null @@ -1,91 +0,0 @@ -module CuArrays - -using CUDAapi, CUDAdrv, CUDAnative - -using GPUArrays - -export CuArray, CuVector, CuMatrix, CuVecOrMat, cu -export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND, CUDNN, CUTENSOR - -import LinearAlgebra - -using Adapt - -using Libdl - -using Requires - - -## source code includes - -include("bindeps.jl") - -# core array functionality -include("memory.jl") -include("array.jl") -include("gpuarrays.jl") -include("subarray.jl") -include("utils.jl") - -# integrations and specialized functionality -include("indexing.jl") -include("broadcast.jl") -include("mapreduce.jl") -include("accumulate.jl") -include("linalg.jl") - -# vendor libraries -include("blas/CUBLAS.jl") -include("sparse/CUSPARSE.jl") -include("solver/CUSOLVER.jl") -include("fft/CUFFT.jl") -include("rand/CURAND.jl") -include("dnn/CUDNN.jl") -include("tensor/CUTENSOR.jl") - -include("deprecated.jl") - -include("nnlib.jl") - -## initialization - -const __initialized__ = Ref(false) -functional() = __initialized__[] - -export has_cudnn, has_cutensor -has_cudnn() = Libdl.dlopen_e(CUDNN.libcudnn[]) !== C_NULL -has_cutensor() = Libdl.dlopen_e(CUTENSOR.libcutensor[]) !== C_NULL - -function __init__() - precompiling = ccall(:jl_generating_output, Cint, ()) != 0 - silent = parse(Bool, get(ENV, "JULIA_CUDA_SILENT", "false")) || precompiling - verbose = parse(Bool, get(ENV, "JULIA_CUDA_VERBOSE", "false")) - - # if any dependent GPU package failed, expect it to have logged an error and bail out - if !CUDAdrv.functional() || !CUDAnative.functional() - verbose && @warn "CuArrays.jl did not initialize because CUDAdrv.jl or CUDAnative.jl failed to" - return - end - - try - __init_bindeps__(silent=silent, verbose=verbose) - - # package integrations - @require ForwardDiff="f6369f11-7733-5829-9624-2563aa707210" include("forwarddiff.jl") - - __init_memory__() - - __initialized__[] = true - catch ex - # don't actually fail to keep the package loadable - if !silent - if verbose - @error "CuArrays.jl failed to initialize" exception=(ex, catch_backtrace()) - else - @info "CuArrays.jl failed to initialize and will be unavailable (set JULIA_CUDA_SILENT or JULIA_CUDA_VERBOSE to silence or expand this message)" - end - end - end -end - -end # module From 
0c6f5c68b1cad82ad264ae5f132d46072f92223e Mon Sep 17 00:00:00 2001 From: Michael Abbott Date: Sat, 4 Apr 2020 00:22:59 +0200 Subject: [PATCH 5/7] use _batched_gemm and storage_type from https://github.com/FluxML/NNlib.jl/pull/191 --- src/nnlib.jl | 105 ++------------------------------------------------ test/nnlib.jl | 23 +++++------ 2 files changed, 14 insertions(+), 114 deletions(-) diff --git a/src/nnlib.jl b/src/nnlib.jl index 63f661c0..d6cf1269 100644 --- a/src/nnlib.jl +++ b/src/nnlib.jl @@ -32,109 +32,12 @@ end # Batched matrix multiplication +# Using storage_type from https://github.com/FluxML/NNlib.jl/pull/191 -# This method has a slightly tighter signature than the one in NNlib, all same eltype. -function NNlib.batched_mul!(C::AbstractArray{T,3}, A::AbstractArray{T,3}, B::AbstractArray{T,3}) where {T<:CUBLAS.CublasFloat} - if is_strided_cu(A) && is_strided_cu(B) && is_strided_cu(C) - # Data is on GPU, and it's safe to call strides(A). gemm_strided_batched may be legal. - batched_try_gemm!(C, A, B) - - elseif is_strided_cu(A) || is_strided_cu(B) || is_strided_cu(C) - # This is hopeless, but best option is the fallback - @debug "weird mix of CPU + GPU?" - NNlib.batched_mul_generic!(C, A, B) - - else - # All cases for CPU gemm! will come through here, is_strided_cu(A) compiles away: - NNlib.batched_mul_cpu!(C, A, B) - end -end - -const batched_gemm_args = [ - (:(AbstractArray{T, 3}), 'N', :identity), - (:(NNlib.BatchedTranspose{T}), 'T', :batched_transpose), - (:(NNlib.BatchedAdjoint{T}), 'C', :batched_adjoint) -] - -using NNlib: batched_mul!, BatchedTranspose, BatchedAdjoint, batched_transpose, batched_adjoint -using NNlib: _unbatch, _perm12 - -for (TA, transA, fA) in batched_gemm_args, (TB, transB, fB) in batched_gemm_args - @eval function batched_try_gemm!(C::AbstractArray{T, 3}, A::$TA, B::$TB) where {T<:CUBLAS.CublasFloat} - - Abase, Bbase = _unbatch(A), _unbatch(B) - - # Best case, we can call batched_gemm! immediately: - if Base.stride(Abase,1) == Base.stride(Bbase,1) == Base.stride(C,1) == 1 - CuArrays.CUBLAS.gemm_strided_batched!($transA, $transB, one(T), Abase, Bbase, zero(T), C) - - # Second-best, can we fix it by Perm.ing the base, and adjusing 'T' label? - # But only if we won't produce BatchedTranspose(BatchedAdjoint(complex array)). - elseif Base.stride(Abase,2) == 1 && !(T<:Complex && $TA<:BatchedAdjoint) - newAbase = batched_transpose(_perm12(Abase)) - return batched_try_gemm!(C, $fA(newAbase), B) - - elseif Base.stride(Bbase,2) == 1 && !(T<:Complex && $TB<:BatchedAdjoint) - newBbase = batched_transpose(_perm12(Bbase)) - return batched_try_gemm!(C, A, $fB(newBbase)) - - # Fallback, e.g when Base.stride(A,3)==1 - else - @debug "couldn't re-arrange strides for CUBLAS.gemm_strided_batched!" strides(A) strides(B) strides(C) - NNlib.batched_mul_generic!(C, A, B) - end - C - end -end - - -# This is obviously the wrong place for this! Not sure where it should go. -# Recursive version, will handle e.g. 
NamedDimsArray -function Base.unsafe_convert(::Type{CUDAdrv.CuPtr{T}}, A::AbstractArray) where {T} - if A === parent(A) - throw(MethodError(Base.unsafe_convert, Tuple{CUDAdrv.CuPtr{T}, typeof(A)})) - else - return Base.unsafe_convert(CUDAdrv.CuPtr{T}, parent(A)) - end -end - +NNlib._batched_gemm!(::Type{<:CuArray}, transA::Char, transB::Char, α::Number, A, B, β::Number, C) = + CuArrays.CUBLAS.gemm_strided_batched!(transA, transB, α, A, B, β, C) # This is https://github.com/JuliaLang/julia/pull/35304, here just for testing now: Base.similar(A::PermutedDimsArray, T::Type, dims::Base.Dims) = similar(parent(A), T, dims) +# @which Base.similar(PermutedDimsArray(rand(2,2), (2,1)), Int, Base.Dims{2}((3,3))) - -# Also the wong place for this, surely. -""" - is_strided_cu(A) - -This should return `true` for `A::CuArray`, and also for: -* Any `view(::CuArray)` or `reshape(::CuArray)` etc. which remains a `StridedArray` -* Any other wrapper for which `is_strided_cu(parent(A))` -* Except that `Adjoint(A)` is only unwrapped for real numbers. - -Such wrappers include `PermutedDimsArray(::CuArray, ...)`, -but also those defined elsewhere (such as `NamedDimsArray`s) -which are assumed not to break strided-ness. - -`Transpose` and `Adjoint` don't currently define `strides`, so for now they return `false`. -""" -is_strided_cu(A::CuArray) = true -is_strided_cu(A) = false -function is_strided_cu(A::AbstractArray) - M = parentmodule(typeof(A)) - if parent(A) === A # Array, SparseMatrix, StaticArray - false - elseif M === Base || M === Core || M ===LinearAlgebra - A isa StridedArray && is_strided_cu(parent(A)) - else - is_strided_cu(parent(A)) # PermutedDimsArray, NamedDimsArray - end -end - -if hasmethod(Base.strides, Tuple{LinearAlgebra.Transpose}) - is_strided_cu(A::LinearAlgebra.Transpose) = is_strided(parent(A)) - is_strided_cu(A::LinearAlgebra.Adjoint) = eltype(A) <: Real && is_strided(parent(A)) -else - is_strided_cu(A::LinearAlgebra.Transpose) = false - is_strided_cu(A::LinearAlgebra.Adjoint) = false -end diff --git a/test/nnlib.jl b/test/nnlib.jl index cfa1d8a7..12ec19d7 100644 --- a/test/nnlib.jl +++ b/test/nnlib.jl @@ -16,25 +16,22 @@ @test cu(Ca) ≈ batched_mul(cu(A), batched_adjoint(cu(B))) end -using CuArrays: is_strided_cu +using NNlib: is_strided, are_strided, storage_type using LinearAlgebra -@testset "is_strided_cu" begin +@testset "NNlib storage_type etc." 
begin M = cu(ones(10,10)) - @test is_strided_cu(M) - @test is_strided_cu(view(M, 1:2:5,:)) - @test is_strided_cu(PermutedDimsArray(M, (2,1))) + @test is_strided(M) + @test is_strided(view(M, 1:2:5,:)) + @test is_strided(PermutedDimsArray(M, (2,1))) - @test !is_strided_cu(reshape(view(M, 1:2:10,:), 10,:)) - @test !is_strided_cu((M.+im)') - @test !is_strided_cu(ones(10,10)) - @test !is_strided_cu(Diagonal(ones(3))) + @test !is_strided(reshape(view(M, 1:2:10,:), 10,:)) + @test !is_strided((M.+im)') + @test !is_strided(Diagonal(cu(ones(3)))) - #= - using NamedDims - @test is_strided(NamedDimsArray(M,(:a, :b))) # and 0.029 ns, 0 allocations - =# + @test storage_type(M) == CuArray{Float32,2,Nothing} + @test storage_type(reshape(view(M, 1:2:10,:), 10,:)) == CuArray{Float32,2,Nothing} end From 4d8c279e1744a6d579c5b4e5f273f4855c636774 Mon Sep 17 00:00:00 2001 From: Michael Abbott Date: Sat, 4 Apr 2020 10:16:30 +0200 Subject: [PATCH 6/7] allow size(A,3)==1 with size(B,3)==size(C,3) --- src/blas/wrappers.jl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/blas/wrappers.jl b/src/blas/wrappers.jl index d0a58f42..765e40cf 100644 --- a/src/blas/wrappers.jl +++ b/src/blas/wrappers.jl @@ -945,7 +945,8 @@ for (fname, elty) in k = size(A, transA == 'N' ? 2 : 1) n = size(B, transB == 'N' ? 2 : 1) - @assert size(A, 3) == size(B, 3) == size(C, 3) "Batch size mismatch" + @assert size(A, 3) == size(C, 3) || size(A, 3) == 1 "batch size mismatch: A != C" + @assert size(B, 3) == size(C, 3) || size(B, 3) == 1 "batch size mismatch: B != C" if m != size(C,1) || n != size(C,2) || k != size(B, transB == 'N' ? 1 : 2) throw(DimensionMismatch("")) @@ -956,10 +957,10 @@ for (fname, elty) in ldb = max(1,stride(B,2)) ldc = max(1,stride(C,2)) - strideA = stride(A, 3) - strideB = stride(B, 3) + strideA = size(A, 3) == 1 ? 0 : stride(A, 3) + strideB = size(B, 3) == 1 ? 0 : stride(B, 3) strideC = stride(C, 3) - batchCount = size(A, 3) + batchCount = size(C, 3) $fname(handle(), cutransA,cutransB, m, n, k, [alpha], A, lda, strideA, B, ldb, strideB, [beta], C, ldc, strideC, batchCount) C @@ -969,7 +970,7 @@ for (fname, elty) in alpha::($elty), A::AbstractArray{$elty, 3}, B::AbstractArray{$elty, 3}) - C = similar(B, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 2 : 1), size(A, 3))) + C = similar(B, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 
2 : 1), max(size(A, 3), size(B, 3)))) gemm_strided_batched!(transA, transB, alpha, A, B, zero($elty), C ) end function gemm_strided_batched(transA::Char, From 73e2ef4c8a66e3260607df67306d225ef722a056 Mon Sep 17 00:00:00 2001 From: Michael Abbott Date: Thu, 30 Apr 2020 23:29:19 +0200 Subject: [PATCH 7/7] use Compat 3.9 for similar(PermutedDimsArray) --- Project.toml | 2 ++ src/nnlib.jl | 4 ---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index af474927..e9c3df61 100644 --- a/Project.toml +++ b/Project.toml @@ -9,6 +9,7 @@ CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" +Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" @@ -28,6 +29,7 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" AbstractFFTs = "0.4, 0.5" Adapt = "1.0" CEnum = "0.2" +Compat = "3.9" CUDAapi = "3.0, 4.0" CUDAdrv = "6.0.1" CUDAnative = "3.0" diff --git a/src/nnlib.jl b/src/nnlib.jl index d6cf1269..842ac175 100644 --- a/src/nnlib.jl +++ b/src/nnlib.jl @@ -37,7 +37,3 @@ end NNlib._batched_gemm!(::Type{<:CuArray}, transA::Char, transB::Char, α::Number, A, B, β::Number, C) = CuArrays.CUBLAS.gemm_strided_batched!(transA, transB, α, A, B, β, C) -# This is https://github.com/JuliaLang/julia/pull/35304, here just for testing now: -Base.similar(A::PermutedDimsArray, T::Type, dims::Base.Dims) = similar(parent(A), T, dims) -# @which Base.similar(PermutedDimsArray(rand(2,2), (2,1)), Int, Base.Dims{2}((3,3))) -
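
A minimal usage sketch, appended after the patch series rather than inside it: it shows how batched multiplication is expected to flow once CuArrays carries these changes and NNlib includes https://github.com/FluxML/NNlib.jl/pull/191. The sizes, `cu(rand(...))` inputs, and variable names are invented for illustration; the functions exercised (`batched_mul` and `batched_transpose` from NNlib, and `CUBLAS.gemm_strided_batched` as extended above) are the real entry points, but the exact dispatch path is an assumption about the combined state of both packages.

    using CuArrays, NNlib

    A = cu(rand(Float32, 5, 7, 16))           # batch of 16 matrices on the GPU
    B = cu(rand(Float32, 7, 3, 16))

    # Plain strided CuArrays: NNlib should route through _batched_gemm! to
    # CUBLAS.gemm_strided_batched! (patch 7 state).
    C = batched_mul(A, B)                      # 5×3×16 CuArray

    # A PermutedDimsArray of a CuArray is still strided, so with the relaxed
    # AbstractArray signatures it should also reach gemm_strided_batched!
    # (after the stride / 'T'-label rearrangement) instead of falling back
    # to batched_mul_generic!.
    At = PermutedDimsArray(A, (2, 1, 3))       # 7×5×16 view, no copy
    C2 = batched_mul(batched_transpose(At), B)
    Array(C) ≈ Array(C2)                       # expected: true

    # Patch 6: an input with batch size 1 is broadcast over the batch by
    # passing stride 0 for that argument.
    A1 = cu(rand(Float32, 5, 7, 1))
    C3 = CuArrays.CUBLAS.gemm_strided_batched('N', 'N', A1, B)   # 5×3×16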