Skip to content

Commit

Permalink
Breaking change: dropped the old @vectorize macro to reduce the amoun…
Browse files Browse the repository at this point in the history
…t of code. Also added
  • Loading branch information
chriselrod committed Jan 6, 2020
1 parent d3a23f5 commit e8f621a
Show file tree
Hide file tree
Showing 9 changed files with 310 additions and 889 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LoopVectorization"
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
authors = ["Chris Elrod <[email protected]>"]
version = "0.2.4"
version = "0.3.0"

[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Expand Down
137 changes: 121 additions & 16 deletions benchmarks/driver.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,15 @@ function Base.getindex(br::SizedResults, row, col)
col == 1 ? string(br.sizes[row]) : string(br.results[col - 1, row])
end
Base.setindex!(br::BenchmarkResult, v, i...) = br.sizedresults.results[i...] = v

const HIGHLIGHT_BEST = Highlighter(
(br,i,j) -> (j > 1 && maximum(@view(br.results[:, i])) == br.results[j-1,i]),
foreground = :green
);
function Base.show(io::IO, br::BenchmarkResult)
pretty_table(io, br.sizedresults, br.tests)
pretty_table(
io, br.sizedresults, br.tests, crop = :none, highlighters = (HIGHLIGHT_BEST,)
)
end

using VegaLite, IndexedTables
Expand All @@ -57,22 +64,17 @@ function plot(br::BenchmarkResult)
)
end

function alloc_matrices(s::NTuple{3,Int})
M, K, N = s
C = Matrix{Float64}(undef, M, N)
A = rand(M, K)
B = rand(K, N)
C, A, B
end
alloc_matrices(s::Int) = alloc_matrices((s,s,s))
gflop(s::Int) = s^3 * 2e-9
gflop(s::NTuple{3,Int}) = prod(s) * 2e-9
tothreetuple(i::Int) = (i,i,i)
tothreetuple(i::NTuple{3,Int}) = i
function benchmark_gemm(sizes)
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "GFort-intrinsic", "LoopVectorization"]
br = BenchmarkResult(tests, sizes)
for (i,s) enumerate(sizes)
C, A, B = alloc_matrices(s)
n_gflop = gflop(s)
M, K, N = tothreetuple(s)
C = Matrix{Float64}(undef, M, N)
A = rand(M, K)
B = rand(K, N)
n_gflop = M*K*N*2e-9
br[1,i] = n_gflop / @belapsed mul!($C, $A, $B)
Cblas = copy(C)
br[2,i] = n_gflop / @belapsed jgemm_nkm!($C, $A, $B)
Expand All @@ -85,9 +87,42 @@ function benchmark_gemm(sizes)
@assert C Cblas "Fort intrinsic gemm wrong?"
br[6,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
@assert C Cblas "LoopVec gemm wrong?"
if i % 10 == 0
percent_complete = round(100i/ length(sizes), sigdigits = 4)
@show percent_complete
end
end
br
end
function benchmark_AtmulB(sizes)
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "GFort-intrinsic", "LoopVectorization"]
br = BenchmarkResult(tests, sizes)
for (i,s) enumerate(sizes)
M, K, N = tothreetuple(s)
C = Matrix{Float64}(undef, M, N)
At = rand(K, M)
B = rand(K, N)
n_gflop = M*K*N*2e-9
br[1,i] = n_gflop / @belapsed mul!($C, $At', $B)
Cblas = copy(C)
br[2,i] = n_gflop / @belapsed jAtmulB!($C, $At, $B)
@assert C Cblas "Julia gemm wrong?"
br[3,i] = n_gflop / @belapsed cAtmulB!($C, $At, $B)
@assert C Cblas "Polly gemm wrong?"
br[4,i] = n_gflop / @belapsed fAtmulB!($C, $At, $B)
@assert C Cblas "Fort gemm wrong?"
br[5,i] = n_gflop / @belapsed fAtmulB_builtin!($C, $At, $B)
@assert C Cblas "Fort intrinsic gemm wrong?"
br[6,i] = n_gflop / @belapsed jAtmulBavx!($C, $At, $B)
@assert C Cblas "LoopVec gemm wrong?"
if i % 10 == 0
percent_complete = round(100i/ length(sizes), sigdigits = 4)
@show percent_complete
end
end
br
end

function benchmark_dot(sizes)
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"]
br = BenchmarkResult(tests, sizes)
Expand All @@ -104,6 +139,10 @@ function benchmark_dot(sizes)
@assert fdot(a,b) dotblas "Fort dot wrong?"
br[5,i] = n_gflop / @belapsed jdotavx($a, $b)
@assert jdotavx(a,b) dotblas "LoopVec dot wrong?"
if i % 10 == 0
percent_complete = round(100i/ length(sizes), sigdigits = 4)
@show percent_complete
end
end
br
end
Expand All @@ -123,11 +162,63 @@ function benchmark_selfdot(sizes)
@assert fselfdot(a) dotblas "Fort dot wrong?"
br[5,i] = n_gflop / @belapsed jselfdotavx($a)
@assert jselfdotavx(a) dotblas "LoopVec dot wrong?"
if i % 10 == 0
percent_complete = round(100i/ length(sizes), sigdigits = 4)
@show percent_complete
end
end
br
end
totwotuple(i::Int) = (i,i)
totwotuple(i::Tuple{Int,Int}) = i
function benchmark_gemv(sizes)
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"]
br = BenchmarkResult(tests, sizes)
for (i,s) enumerate(sizes)
M, N = totwotuple(s)
x = Vector{Float64}(undef, M); A = rand(M, N); y = rand(N);
n_gflop = M*N * 2e-9
br[1,i] = n_gflop / @belapsed mul!($x, $A, $y)
xblas = copy(x)
br[2,i] = n_gflop / @belapsed jgemv!($x, $A, $y)
@assert x xblas "Julia wrong?"
br[3,i] = n_gflop / @belapsed cgemv!($x, $A, $y)
@assert x xblas "Polly wrong?"
br[4,i] = n_gflop / @belapsed fgemv!($x, $A, $y)
@assert x xblas "Fort wrong?"
br[5,i] = n_gflop / @belapsed jgemvavx!($x, $A, $y)
@assert x xblas "LoopVec wrong?"
if i % 10 == 0
percent_complete = round(100i/ length(sizes), sigdigits = 4)
@show percent_complete
end
end
br
end
function benchmark_dot3(sizes)
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"]
br = BenchmarkResult(tests, sizes)
for (i,s) enumerate(sizes)
M, N = totwotuple(s)
x = rand(M); A = rand(M, N); y = rand(N);
n_gflop = M*N * 3e-9
br[1,i] = n_gflop / @belapsed dot($x, $A, $y)
dotblas = dot(x, A, y)
br[2,i] = n_gflop / @belapsed jdot3($x, $A, $y)
@assert jdot3(x, A, y) dotblas "Julia dot wrong?"
br[3,i] = n_gflop / @belapsed cdot3($x, $A, $y)
@assert cdot3(x, A, y) dotblas "Polly dot wrong?"
br[4,i] = n_gflop / @belapsed fdot3($x, $A, $y)
@assert fdot3(x, A, y) dotblas "Fort dot wrong?"
br[5,i] = n_gflop / @belapsed jdot3avx($x, $A, $y)
@assert jdot3avx(x, A, y) dotblas "LoopVec dot wrong?"
if i % 10 == 0
percent_complete = round(100i/ length(sizes), sigdigits = 4)
@show percent_complete
end
end
br
end
function sse!(Xβ, y, X, β)
mul!(copyto!(Xβ, y), X, β, 1.0, -1.0)
dot(Xβ, Xβ)
Expand All @@ -151,22 +242,32 @@ function benchmark_sse(sizes)
@assert fOLSlp(y, X, β) lpblas "Fort wrong?"
br[5,i] = n_gflop / @belapsed jOLSlp_avx($y, $X, $β)
@assert jOLSlp_avx(y, X, β) lpblas "LoopVec wrong?"
if i % 10 == 0
percent_complete = round(100i/ length(sizes), sigdigits = 4)
@show percent_complete
end
end
br
end

function benchmark_exp(sizes)
tests = ["Julia", "GFort-loops", "LoopVectorization"]
tests = ["Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"]
br = BenchmarkResult(tests, sizes)
for (i,s) enumerate(sizes)
a = rand(s); b = similar(a)
n_gflop = 1e-9*s # not really gflops
br[1,i] = n_gflop / @belapsed @. $b = exp($a)
baseb = copy(b)
br[2,i] = n_gflop / @belapsed fvexp!($b, $a)
br[2,i] = n_gflop / @belapsed cvexp!($b, $a)
@assert b baseb "Clang wrong?"
br[3,i] = n_gflop / @belapsed fvexp!($b, $a)
@assert b baseb "Fort wrong?"
br[3,i] = n_gflop / @belapsed @avx @. $b = exp($a)
br[4,i] = n_gflop / @belapsed @avx @. $b = exp($a)
@assert b baseb "LoopVec wrong?"
if i % 10 == 0
percent_complete = round(100i/ length(sizes), sigdigits = 4)
@show percent_complete
end
end
br
end
Expand All @@ -187,6 +288,10 @@ function benchmark_aplusBc(sizes)
@assert D Dcopy "Fort wrong?"
br[4,i] = n_gflop / @belapsed @avx @. $D = $a + $B * $c′
@assert D Dcopy "LoopVec wrong?"
if i % 10 == 0
percent_complete = round(100i/ length(sizes), sigdigits = 4)
@show percent_complete
end
end
br
end
Expand Down
61 changes: 56 additions & 5 deletions benchmarks/loadsharedlibs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@ const LIBCTEST = joinpath(LOOPVECBENCHDIR, "libctests.so")
const LIBFTEST = joinpath(LOOPVECBENCHDIR, "libftests.so")

# requires Clang with polly to build
if !isfile(LIBCTEST)
cfile = joinpath(LOOPVECBENCHDIR, "looptests.c")
run(`clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -mllvm -polly -mllvm -polly-vectorizer=stripmine -shared -fPIC $cfile -o $LIBCTEST`)
cfile = joinpath(LOOPVECBENCHDIR, "looptests.c")
if !isfile(LIBCTEST) || mtime(cfile) > mtime(LIBCTEST)
run(`clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -mllvm -polly -mllvm -polly-vectorizer=stripmine -shared -fPIC $cfile -o $LIBCTEST`)
end
if !isfile(LIBFTEST)
ffile = joinpath(LOOPVECBENCHDIR, "looptests.f90") # --param max-unroll-times defaults to ≥8, which is generally excessive
ffile = joinpath(LOOPVECBENCHDIR, "looptests.f90")
if !isfile(LIBFTEST) || mtime(ffile) > mtime(LIBFTEST)
# --param max-unroll-times defaults to ≥8, which is generally excessive
run(`gfortran -Ofast -march=native -funroll-loops --param max-unroll-times=4 -floop-nest-optimize -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $ffile -o $LIBFTEST`)
end

Expand Down Expand Up @@ -46,6 +47,30 @@ function fgemm_builtin!(C, A, B)
C, A, B, Ref(M), Ref(K), Ref(N)
)
end
function cAtmulB!(C, A, B)
M, N = size(C); K = size(B, 1)
ccall(
(:AtmulB, LIBCTEST), Cvoid,
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Clong, Clong, Clong),
C, A, B, M, K, N
)
end
function fAtmulB!(C, A, B)
M, N = size(C); K = size(B, 1)
ccall(
(:AtmulB, LIBFTEST), Cvoid,
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
C, A, B, Ref(M), Ref(K), Ref(N)
)
end
function fAtmulB_builtin!(C, A, B)
M, N = size(C); K = size(B, 1)
ccall(
(:AtmulBbuiltin, LIBFTEST), Cvoid,
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
C, A, B, Ref(M), Ref(K), Ref(N)
)
end

function cdot(a, b)
N = length(a)
Expand Down Expand Up @@ -83,6 +108,24 @@ function fselfdot(a)
)
d[]
end
function cdot3(x, A, y)
M, N = size(A)
ccall(
(:dot3, LIBCTEST), Float64,
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Clong, Clong),
x, A, y, M, N
)
end
function fdot3(x, A, y)
M, N = size(A)
d = Ref{Float64}()
ccall(
(:dot3, LIBFTEST), Cvoid,
(Ref{Float64}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}),
d, x, A, y, Ref(M), Ref(N)
)
d[]
end

function cgemv!(y, A, x)
M, K = size(A)
Expand Down Expand Up @@ -143,6 +186,14 @@ function fOLSlp(y, X, β)
)
lp[]
end
function cvexp!(b, a)
N = length(b)
ccall(
(:vexp, LIBCTEST), Cvoid,
(Ptr{Float64}, Ptr{Float64}, Clong),
b, a, N
)
end
function fvexp!(b, a)
N = length(b)
ccall(
Expand Down
40 changes: 36 additions & 4 deletions benchmarks/looptests.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@

#include<math.h>

void gemm_mnk(double* restrict C, double* restrict A, double* restrict B, long M, long K, long N){
for (long i = 0; i < M*N; i++){
Expand Down Expand Up @@ -78,6 +78,19 @@ void gemm_knm(double* restrict C, double* restrict A, double* restrict B, long M
}
return;
}
void AtmulB(double* restrict C, double* restrict At, double* restrict B, long M, long K, long N){
for (long i = 0; i < M*N; i++){
C[i] = 0.0;
}
for (long n = 0; n < N; n++){
for (long m = 0; m < M; m++){
for (long k = 0; k < K; k++){
C[m + n*M] += At[k + m*K] * B[k + n*K];
}
}
}
return;
}
double dot(double* restrict a, double* restrict b, long N){
double s = 0.0;
for (long n = 0; n < N; n++){
Expand All @@ -92,7 +105,15 @@ double selfdot(double* restrict a, long N){
}
return s;
}

double dot3(double* restrict x, double* restrict A, double* restrict y, long M, long N){
double s = 0.0;
for (long n = 0; n < N; n++){
for (long m = 0; m < M; m++){
s += x[m] * A[m + n*M] * y[n];
}
}
return s;
}
void gemv(double* restrict y, double* restrict A, double* restrict x, long M, long K){
for (long m = 0; m < M; m++){
y[m] = 0.0;
Expand All @@ -104,7 +125,19 @@ void gemv(double* restrict y, double* restrict A, double* restrict x, long M, l
}
return;
}

double svexp(double* restrict a, long N){
double s = 0.0;
for (long n = 0; n < N; n++){
s += exp(a[n]);
}
return s;
}
void vexp(double* restrict b, double* restrict a, long N){
for (long n = 0; n < N; n++){
b[n] = exp(a[n]);
}
return;
}
void unscaledvar(double* restrict s, double* restrict A, double* restrict xb, long M, long N){
for (long m = 0; m < M; m++){
s[m] = 0.0;
Expand All @@ -117,7 +150,6 @@ void unscaledvar(double* restrict s, double* restrict A, double* restrict xb, lo
}
return;
}

void aplusBc(double* restrict D, double* restrict a, double* restrict B, double* restrict c, long M, long N){
for (long n = 0; n < N; n++){
for (long m = 0; m < M; m++){
Expand Down
Loading

2 comments on commit e8f621a

@chriselrod
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/7572

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if Julia TagBot is installed, or can be done manually through the github interface, or via:

git tag -a v0.3.0 -m "<description of version>" e8f621a07bc4c6f3db664747f443039bec8e8e2c
git push origin v0.3.0

Please sign in to comment.