From 47c59b7cb3691d0535935a0e235d4e0e7e1e90fc Mon Sep 17 00:00:00 2001
From: Tim Holy <tim.holy@gmail.com>
Date: Wed, 22 Sep 2021 17:01:18 -0500
Subject: [PATCH] WIP: use LoopVectorization

Preliminary tests suggest ~4x speedups, not too shabby
---
 Project.toml          |   6 +-
 src/ImageFiltering.jl |   2 +
 src/imfilter.jl       | 171 ++++++++++++++++++++++++++----------------
 src/kernelfactors.jl  |   3 +
 src/utils.jl          |  24 ++++++
 5 files changed, 142 insertions(+), 64 deletions(-)

diff --git a/Project.toml b/Project.toml
index 99ffc84..3645b42 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,6 +4,7 @@ author = ["Tim Holy <tim.holy@gmail.com>", "Jan Weidner <jw3126@gmail.com>"]
 version = "0.7.0"
 
 [deps]
+ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
 CatIndices = "aafaddc9-749c-510e-ac4f-586e18779b91"
 ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
@@ -11,6 +12,7 @@ FFTViews = "4f61f5a4-77b1-5117-aa51-3ab5ef4ef0cd"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
@@ -19,16 +21,18 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 TiledIteration = "06e1c1a7-607b-532d-9fad-de7d9aa2abac"
 
 [compat]
+ArrayInterface = "3"
 CatIndices = "0.2"
 ComputationalResources = "0.3"
 DataStructures = "0.17.7, 0.18"
 FFTViews = "0.3"
 FFTW = "0.3, 1"
 ImageCore = "0.9"
+LoopVectorization = "0.12.75"
 OffsetArrays = "1.9"
 Reexport = "1.1"
 StaticArrays = "0.10, 0.11, 0.12, 1.0"
-TiledIteration = "0.2, 0.3"
+TiledIteration = "0.4"
 julia = "1"
 
 [extras]
diff --git a/src/ImageFiltering.jl b/src/ImageFiltering.jl
index 58df01f..1b46868 100644
--- a/src/ImageFiltering.jl
+++ b/src/ImageFiltering.jl
@@ -10,6 +10,8 @@ using Base: Indices, tail, fill_to_length, @pure, depwarn, @propagate_inbounds
 using OffsetArrays: IdentityUnitRange   # using the one in OffsetArrays makes this work with multiple Julia versions
 using SparseArrays   # only needed to fix an ambiguity in borderarray
 using Reexport
+using LoopVectorization
+using LoopVectorization.VectorizationBase
 
 @reexport using OffsetArrays: centered # this method once lived here
 
diff --git a/src/imfilter.jl b/src/imfilter.jl
index 7faec92..04c0a8d 100644
--- a/src/imfilter.jl
+++ b/src/imfilter.jl
@@ -774,7 +774,10 @@ end
 function imfilter!(r::AbstractCPU{FIRTiled{N}}, out::AbstractArray{S,N}, A::AbstractArray{T,N}, kernel::Tuple{Any,Any,Vararg{Any}}, border::NoPad, inds::Indices=axes(out)) where {S,T,N}
     kern = kernel[1]
     iscopy(kern) && return imfilter!(r, out, A, tail(kernel), border, inds)
-    tmp = tile_allocate(filter_type(A, kernel), r.settings.tilesize, kernel)
+    TTile, f = native_eltype(filter_type(A, kernel))
+    @assert f == 1
+    # @show r.settings.tilesize
+    tmp = tile_allocate(TTile, r.settings.tilesize, kernel)
     _imfilter_tiled!(r, out, A, kernel, border, tmp, inds)
     out
 end
@@ -834,6 +837,7 @@ end
 
 # Single-threaded, pair of kernels (with only one temporary buffer required)
 function _imfilter_tiled!(r::CPU1, out, A, kernel::Tuple{Any,Any}, border::NoPad, tiles::Vector{AA}, indsout) where AA<:AbstractArray
+    out, A, kernel = maybe_reinterpret(out, A, kernel)
     k1, k2 = kernel
     tile = tiles[1]
     indsk2, indstile = axes(k2), axes(tile)
@@ -850,6 +854,7 @@ end
 
 # Multithreaded, pair of kernels
 function _imfilter_tiled!(r::CPUThreads, out, A, kernel::Tuple{Any,Any}, border::NoPad, tiles::Vector{AA}, indsout) where AA<:AbstractArray
+    out, A, kernel = maybe_reinterpret(out, A, kernel)
     k1, k2 = kernel
     tile = tiles[1]
     indsk2, indstile = axes(k2), axes(tile)
@@ -908,10 +913,8 @@ end
     out
 end
 
-# The first of the pair in `tmp` has the current data. We also make
-# the second a plain array so there's no doubt about who's holding the
-# proper indices.
-function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, border, tmp::Tuple{TileBuffer,Array})
+# The first of the pair in `tmp` has the current data.
+function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, border, tmp::Tuple{TileBuffer,AbstractArray})
     tileb1, tile2 = tmp
     k1, kt = kernel[1], tail(kernel)
     parentinds = axes(tileb1)
@@ -922,7 +925,7 @@ function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, borde
 end
 
 # on the last call we write to `out` instead of one of the buffers
-function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any}, border, tmp::Tuple{TileBuffer,Array})
+function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any}, border, tmp::Tuple{TileBuffer,AbstractArray})
     tileb1 = tmp[1]
     k1 = kernel[1]
     parentinds = axes(tileb1)
@@ -1014,26 +1017,26 @@ end
 # This is unfortunate, but specializing this saves an add in the inner
 # loop and results in a modest performance improvement. It would be
 # nice if LLVM did this automatically. (@polly?)
-function __imfilter_inbounds!(r, out, A::OffsetArray, kern::OffsetArray, border, R, z)
-    off, k = CartesianIndex(kern.offsets), parent(kern)
-    o, O = safehead(off), safetail(off)
-    Rnew = CartesianIndices(map((x,y)->x.+y, R.indices, Tuple(off)))
-    Rk = CartesianIndices(axes(k))
-    offA, pA = CartesianIndex(A.offsets), parent(A)
-    oA, OA = safehead(offA), safetail(offA)
-    for I in safetail(Rnew)
-        IA = I-OA
-        for i in safehead(Rnew)
-            tmp = z
-            iA = i-oA
-            @inbounds for J in safetail(Rk), j in safehead(Rk)
-                tmp += safe_for_prod(pA[iA+j,IA+J], tmp)*k[j,J]
-            end
-            @inbounds out[i-o,I-O] = tmp
-        end
-    end
-    out
-end
+# function __imfilter_inbounds!(r, out, A::OffsetArray, kern::OffsetArray, border, R, z)
+#     off, k = CartesianIndex(kern.offsets), parent(kern)
+#     o, O = safehead(off), safetail(off)
+#     Rnew = CartesianIndices(map((x,y)->x.+y, R.indices, Tuple(off)))
+#     Rk = CartesianIndices(axes(k))
+#     offA, pA = CartesianIndex(A.offsets), parent(A)
+#     oA, OA = safehead(offA), safetail(offA)
+#     for I in safetail(Rnew)
+#         IA = I-OA
+#         for i in safehead(Rnew)
+#             tmp = z
+#             iA = i-oA
+#             @inbounds for J in safetail(Rk), j in safehead(Rk)
+#                 tmp += safe_for_prod(pA[iA+j,IA+J], tmp)*k[j,J]
+#             end
+#             @inbounds out[i-o,I-O] = tmp
+#         end
+#     end
+#     out
+# end
 
 function _imfilter_inbounds!(r::AbstractResource, out, A::AbstractArray, kern::ReshapedOneD, border::NoPad, inds)
     Rpre, ind, Rpost = iterdims(inds, kern)
@@ -1043,68 +1046,110 @@ function _imfilter_inbounds!(r::AbstractResource, out, A::AbstractArray, kern::R
         return out
     end
     p = accumfilter(A[first(R)+first(Rk)], first(k))
-    z = zero(typeof(p+p))
+    z = float(zero(eltype(A)))
+    # z = zero(typeof(p+p))
     _imfilter_inbounds!(r, z, out, A, k, Rpre, ind, Rpost)
 end
 
-# Many of the following are unfortunate specializations
-function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
-    _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, k.offsets[1])
-end
+# # Many of the following are unfortunate specializations
+# function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
+#     _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, k.offsets[1])
+# end
 
-function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, koffset=0)
+# LoopVectorization.check_type(::Type{T}) where T<:ColorVectorSpace.MathTypes = true
+# @generated function VectorizationBase.zero_vecunroll(::StaticInt{N}, ::StaticInt{W}, ::Type{Gray{T}}, ::StaticInt{RS}) where {N,W,T,RS}
+#     Expr(:block, Expr(:meta, :inline), :(VectorizationBase._vzero(VecUnroll{$(N-1),$W,$T,Vec{$W,$T}}, StaticInt{$RS}())))
+# end
+# function VectorizationBase._vload_unroll(
+#     sptr::AbstractStridedPointer{Gray{T},N,C,B}, u::Unroll{AU,F,UN,AV,W,M,UX,I}, ::A, ::StaticInt{RS}, ::StaticInt{X}
+#     ) where {T<:NativeTypes,N,C,B,AU,F,UN,AV,W,M,UX,I<:IndexNoUnroll,A<:StaticBool,RS,X}
+# VectorizationBase.VecUnroll{N,1,T,T}(x::S) where {N,T<:VectorizationBase.NativeTypes,S<:FixedPoint{T}} =
+    # VectorizationBase.VecUnroll{N,1,T,T}(reinterpret(x))
+# VectorizationBase.VecUnroll(x::FixedPoint) = VectorizationBase.VecUnroll(reinterpret(x))
+# VectorizationBase.VecUnroll(x::AbstractGray) = VectorizationBase.VecUnroll(gray(x))
+# VectorizationBase.VecUnroll(x::Gray) where {N,T<:VectorizationBase.NativeTypes} =
+#     VectorizationBase.VecUnroll{N,1,T,T}(reinterpret(x))
+
+const LVTypes = Union{VectorizationBase.NativeTypes, SVector{N,<:VectorizationBase.NativeTypes} where N}
+
+const args = Ref{Any}()
+function _imfilter_inbounds!(r::AbstractResource, z, out::AbstractArray{<:LVTypes}, A::AbstractArray{<:LVTypes}, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
+    if !LoopVectorization.check_args(out, A)
+        @show summary(out) summary(A)
+        args[] = (deepcopy(out), deepcopy(A))
+        error("this should have worked")
+    end
     indsk = axes(k, 1)
+    zout = convert(eltype(out), z)
     for Ipost in Rpost
         for i in ind
-            ik = i+koffset
-            for Ipre in Rpre
-                tmp = z
+            @turbo for Ipre in Rpre
+                tmp = zout
                 for j in indsk
-                    @inbounds tmp += safe_for_prod(A[Ipre,ik+j,Ipost], tmp)*k[j]
+                    tmp += safe_for_prod(A[Ipre,i+j,Ipost], z)*k[j]
                 end
-                @inbounds out[Ipre,i,Ipost] = tmp
+                out[Ipre,i,Ipost] = tmp
             end
         end
     end
     out
 end
 
-function _imfilter_inbounds!(r::AbstractResource, out, A::OffsetArray, kern::ReshapedVector, border::NoPad, inds)
-    Rpre, ind, Rpost = iterdims(inds, kern)
-    k = kern.data
-    R, Rk = CartesianIndices(inds), CartesianIndices(axes(kern))
-    if isempty(R) || isempty(Rk)
-        return out
-    end
-    p = accumfilter(A[first(R)+first(Rk)], first(k))
-    z = zero(typeof(p+p))
-    Opre, o, Opost = KernelFactors.indexsplit(CartesianIndex(A.offsets), kern)
-    _imfilter_inbounds!(r, z, out, parent(A), k, Rpre, ind, Rpost, Opre, o, Opost)
-end
-
-function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost)
-    _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, Opre, o, Opost, k.offsets[1])
-end
-
-function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost, koffset=0)
+# No @turbo version
+function _imfilter_inbounds!(r::AbstractResource, z, out::AbstractArray{<:LVTypes}, A::AbstractArray{<:LVTypes}, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
     indsk = axes(k, 1)
+    zout = convert(eltype(out), z)
     for Ipost in Rpost
-        IOpost = Ipost - Opost
         for i in ind
-            io = i-o+koffset
-            for Ipre in Rpre
-                IOpre = Ipre - Opre
-                tmp = z
+            @inbounds for Ipre in Rpre
+                tmp = zout
                 for j in indsk
-                    @inbounds tmp += safe_for_prod(A[IOpre,io+j,IOpost], tmp)*k[j]
+                    tmp += safe_for_prod(A[Ipre,i+j,Ipost], z)*k[j]
                 end
-                @inbounds out[Ipre,i,Ipost] = tmp
+                out[Ipre,i,Ipost] = tmp
             end
         end
     end
     out
 end
-# end unfortunate specializations
+
+# function _imfilter_inbounds!(r::AbstractResource, out, A::OffsetArray, kern::ReshapedVector, border::NoPad, inds)
+#     Rpre, ind, Rpost = iterdims(inds, kern)
+#     k = kern.data
+#     R, Rk = CartesianIndices(inds), CartesianIndices(axes(kern))
+#     if isempty(R) || isempty(Rk)
+#         return out
+#     end
+#     p = accumfilter(A[first(R)+first(Rk)], first(k))
+#     z = zero(typeof(p+p))
+#     Opre, o, Opost = KernelFactors.indexsplit(CartesianIndex(A.offsets), kern)
+#     _imfilter_inbounds!(r, z, out, parent(A), k, Rpre, ind, Rpost, Opre, o, Opost)
+# end
+
+# function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost)
+#     _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, Opre, o, Opost, k.offsets[1])
+# end
+
+# function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost)
+#     indsk = axes(k, 1)
+#     zout = convert(eltype(out), z)
+#     for Ipost in Rpost
+#         IOpost = Ipost - Opost
+#         for i in ind
+#             io = i-o+koffset
+#             @turbo for Ipre in Rpre
+#                 IOpre = Ipre - Opre
+#                 tmp = zout
+#                 for j in indsk
+#                     tmp += safe_for_prod(A[IOpre,io+j,IOpost], z)*k[j]
+#                 end
+#                 @inbounds out[Ipre,i,Ipost] = tmp
+#             end
+#         end
+#     end
+#     out
+# end
+# # end unfortunate specializations
 
 ## commented out because "virtual padding" is commented out
 # function _imfilter_iter!(r::AbstractResource, out, padded, kernel::AbstractArray, iter)
diff --git a/src/kernelfactors.jl b/src/kernelfactors.jl
index 8e8f6e3..39b8fc1 100644
--- a/src/kernelfactors.jl
+++ b/src/kernelfactors.jl
@@ -111,6 +111,9 @@ for op in (:+, :-, :*, :/)
 end
 Base.BroadcastStyle(::Type{R}) where {R<:ReshapedOneD{T,N}} where {T,N} = Broadcast.DefaultArrayStyle{N}()
 
+Base.:/(A::ReshapedOneD{T,N,Npre}, x::Real) where {T,N,Npre} = ReshapedOneD{N,Npre}(A.data / x)
+Base.:*(A::ReshapedOneD{T,N,Npre}, x::Real) where {T,N,Npre} = ReshapedOneD{N,Npre}(A.data * x)
+
 _reshape(A::ReshapedOneD{T,N}, ::Val{N}) where {T,N} = A
 _vec(A::ReshapedOneD) = A.data
 Base.vec(A::ReshapedOneD) = A.data  # is this OK? (note indices won't nec. start with 1)
diff --git a/src/utils.jl b/src/utils.jl
index 3f74fa1..b00c712 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -131,3 +131,27 @@ accumfilter(pixelval::Colorant{N0f8}, filterval::N0f8) = float32(c)*Float32(filt
 # In theory, the following might need to be specialized. For safety, make it a
 # standalone function call.
 safe_for_prod(x, ref) = oftype(ref, x)
+
+# For LoopVectorization, the easiest path is to convert to native types
+native_eltype(::Type{T}) where T<:VectorizationBase.NativeTypes = T, 1
+native_eltype(::Type{T}) where T<:FixedPoint = FixedPointNumbers.rawtype(T), FixedPointNumbers.rawone(T)
+native_eltype(::Type{Gray{T}}) where T = native_eltype(T)
+function native_eltype(::Type{RGB{T}}) where T
+    T′, f = native_eltype(T)
+    return SVector{3,T′}, f
+end
+native_eltype(::Type{T}) where T = T, 1
+
+function maybe_reinterpret(out::AbstractArray, A::AbstractArray, kernel::Tuple{Any,Vararg{Any}})
+    Tout = eltype(out)
+    Tout′, outf = native_eltype(Tout)
+    TA = eltype(A)
+    TA′, Af = native_eltype(TA)
+    if Tout′ !== Tout || TA′ !== TA
+        kernel′ = ((kernel[1]/Af)*outf, Base.tail(kernel)...)
+        return (Tout === Tout′ ? out : reinterpret(reshape, Tout′, out),
+                TA === TA′ ? A : reinterpret(reshape, TA′, A),
+                kernel′)
+    end
+    return out, A, kernel
+end