Skip to content

Commit

Permalink
Fix to lowering to clean up generated code slightly when reductions a…
Browse files Browse the repository at this point in the history
…ren't vectorized.
  • Loading branch information
chriselrod committed Jan 17, 2020
1 parent ee89d70 commit 81b41c9
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 7 deletions.
30 changes: 30 additions & 0 deletions benchmark/benchmarkflops.jl
Original file line number Diff line number Diff line change
Expand Up @@ -308,3 +308,33 @@ function benchmark_aplusBc(sizes)
br
end

"""
    benchmark_AplusAt(sizes)

Benchmark `B = A + A'` on square random matrices for every side length in `sizes`.

For each size `s`, a fresh `A = rand(s, s)` is created and the sum is timed with
`@belapsed` for every implementation listed in `tests`, in that order. The output of
the plain Julia broadcast is copied as the reference result, and each subsequent
implementation is compared against it with `≈` immediately after being timed.

Entry `[k, i]` of the returned `BenchmarkResult` is `1e-9 * s^2` (one addition per
output element) divided by the elapsed seconds of implementation `k` at the `i`-th size.
"""
function benchmark_AplusAt(sizes)
    tests = ["Julia", "Clang-Polly", "GFortran", "GFortran-builtin", "icc", "ifort", "ifort-builtin", "LoopVectorization"]
    br = BenchmarkResult(tests, sizes)
    for (i,s) ∈ enumerate(sizes)
        A = rand(s,s); B = similar(A)
        n_gflop = 1e-9*s^2
        br[1,i] = n_gflop / @belapsed @. $B = $A + $A'
        # B now holds the reference answer; every kernel below overwrites B in place.
        baseB = copy(B)
        br[2,i] = n_gflop / @belapsed cAplusAt!($B, $A)
        @assert B ≈ baseB "Clang wrong?"
        br[3,i] = n_gflop / @belapsed fAplusAt!($B, $A)
        @assert B ≈ baseB "Fort wrong?"
        br[4,i] = n_gflop / @belapsed fAplusAtbuiltin!($B, $A)
        @assert B ≈ baseB "Fort-builtin wrong?"
        br[5,i] = n_gflop / @belapsed icAplusAt!($B, $A)
        @assert B ≈ baseB "icc wrong?"
        br[6,i] = n_gflop / @belapsed ifAplusAt!($B, $A)
        @assert B ≈ baseB "ifort wrong?"
        br[7,i] = n_gflop / @belapsed ifAplusAtbuiltin!($B, $A)
        @assert B ≈ baseB "ifort-builtin wrong?"
        br[8,i] = n_gflop / @belapsed @avx @. $B = $A + $A'
        @assert B ≈ baseB "LoopVec wrong?"
        # if i % 10 == 0
        #     percent_complete = round(100i/ length(sizes), sigdigits = 4)
        #     @show percent_complete
        # end
    end
    br
end

9 changes: 6 additions & 3 deletions benchmark/driver.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ include(joinpath(LOOPVECBENCHDIR, "plotbenchmarks.jl"))

using Distributed

# Start the worker processes that the benchmarks are later dispatched to via `@spawnat`.
# NOTE(review): both the 9-worker and the 10-worker call appear here; this looks like the
# old and new line of a diff rather than two live calls — confirm which one is intended.
addprocs(9);
addprocs(10);

@everywhere begin
pkgdir(pkg::String) = abspath(joinpath(dirname(Base.find_package(pkg)), ".."))
Expand All @@ -28,16 +28,18 @@ dot3_future = @spawnat 7 benchmark_dot3(2:256);
# Launch one benchmark per worker over sizes 2:256; each `@spawnat` yields a future
# that is fetched further down.
sse_future = @spawnat 8 benchmark_sse(2:256);
exp_future = @spawnat 9 benchmark_exp(2:256);
aplusBc_future = @spawnat 10 benchmark_aplusBc(2:256);
# Targets worker id 11 — presumably requires ten added workers (ids 2..11); confirm
# against the `addprocs` call.
AplusAt_future = @spawnat 11 benchmark_AplusAt(2:256);

# Wait for each remote benchmark and bring its result back to this process.
# NOTE(review): `gemm_bench` and `AtmulB_bench` are fetched both at the top and at the
# bottom of this run — presumably the old and new positions from the diff; confirm
# which pair is live.
gemm_bench = fetch(gemm_future)
AtmulB_bench = fetch(AtmulB_future)
dot_bench = fetch(dot_future)
selfdot_bench = fetch(selfdot_future)
AplusAt_bench = fetch(AplusAt_future)
gemv_bench = fetch(gemv_future)
dot3_bench = fetch(dot3_future)
sse_bench = fetch(sse_future)
exp_bench = fetch(exp_future)
aplusBc_bench = fetch(aplusBc_future)
gemm_bench = fetch(gemm_future)
AtmulB_bench = fetch(AtmulB_future)


plot(gemm_bench)
Expand All @@ -49,6 +51,7 @@ plot(dot3_bench)
# Build a plot for each fetched benchmark result (`plot` is defined outside this view).
plot(sse_bench)
plot(exp_bench)
plot(aplusBc_bench)
plot(AplusAt_bench)

# Write the plots as PNG files into the user's Pictures directory.
# `~` is shell shorthand that Julia path functions leave untouched, so it is expanded
# explicitly; otherwise the path would point at a literal "~" directory under the cwd.
save(joinpath(expanduser("~/Pictures"), "bench_gemm_v3.png"), plot(gemm_bench));
save(joinpath(expanduser("~/Pictures"), "bench_AtmulB_v3.png"), plot(AtmulB_bench));
Expand Down
28 changes: 26 additions & 2 deletions benchmark/loadsharedlibs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@ if !isfile(LIBCTEST) || mtime(cfile) > mtime(LIBCTEST)
run(`clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -mllvm -polly -mllvm -polly-vectorizer=stripmine -shared -fPIC $cfile -o $LIBCTEST`)
end
# Rebuild the icc shared library when it does not exist or is older than the C source.
if !isfile(LIBICTEST) || mtime(cfile) > mtime(LIBICTEST)
# NOTE(review): the two `run` lines differ only in `-qopt-matmul`; presumably the old
# and new line of a diff — confirm that only the variant without the flag is live.
run(`icc -fast -qopt-zmm-usage=high -qopt-matmul -shared -fPIC $cfile -o $LIBICTEST`)
run(`icc -fast -qopt-zmm-usage=high -shared -fPIC $cfile -o $LIBICTEST`)
end
# Path of the Fortran source holding the benchmark kernels.
ffile = joinpath(LOOPVECBENCHDIR, "looptests.f90")
# Rebuild the gfortran shared library when it does not exist or is older than the source.
if !isfile(LIBFTEST) || mtime(ffile) > mtime(LIBFTEST)
# --param max-unroll-times defaults to ≥8, which is generally excessive
run(`gfortran -Ofast -march=native -funroll-loops --param max-unroll-times=4 -floop-nest-optimize -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $ffile -o $LIBFTEST`)
end
# Rebuild the ifort shared library when it does not exist or is older than the Fortran source.
if !isfile(LIBIFTEST) || mtime(ffile) > mtime(LIBIFTEST)
# NOTE(review): the two `run` lines differ only in `-qopt-matmul`; presumably the old
# and new line of a diff — confirm that only the variant without the flag is live.
run(`ifort -fast -qopt-zmm-usage=high -qopt-matmul -shared -fPIC $ffile -o $LIBIFTEST`)
run(`ifort -fast -qopt-zmm-usage=high -shared -fPIC $ffile -o $LIBIFTEST`)
end

for (prefix,Cshared,Fshared) ∈ ((Symbol(""),LIBCTEST,LIBFTEST), (:i,LIBICTEST,LIBIFTEST))
Expand Down Expand Up @@ -223,4 +223,28 @@ for (prefix,Cshared,Fshared) ∈ ((Symbol(""),LIBCTEST,LIBFTEST), (:i,LIBICTEST,
s[]
end

# Defines `<prefix>fAplusAt!(B, A)`: forwards to the `AplusAt` symbol of the Fortran
# shared library `Fshared`, handing over `B`, `A` and the first dimension of `B`.
@eval function $(Symbol(prefix,:fAplusAt!))(B, A)
    # The size is declared as `Ref{Clong}`, i.e. it is passed by reference.
    nrows = Ref(size(B,1))
    ccall((:AplusAt, $Fshared), Cvoid,
          (Ptr{Float64}, Ptr{Float64}, Ref{Clong}),
          B, A, nrows)
end
# Defines `<prefix>fAplusAtbuiltin!(B, A)`: forwards to the `AplusAtbuiltin` symbol of the
# Fortran shared library `Fshared`, handing over `B`, `A` and the first dimension of `B`.
@eval function $(Symbol(prefix,:fAplusAtbuiltin!))(B, A)
    # The size is declared as `Ref{Clong}`, i.e. it is passed by reference.
    nrows = Ref(size(B,1))
    ccall((:AplusAtbuiltin, $Fshared), Cvoid,
          (Ptr{Float64}, Ptr{Float64}, Ref{Clong}),
          B, A, nrows)
end
# Defines `<prefix>cAplusAt!(B, A)`: forwards to the `AplusAt` symbol of the C shared
# library `Cshared`. The size is declared as a plain `Clong`, i.e. passed by value.
@eval function $(Symbol(prefix,:cAplusAt!))(B, A)
    ccall((:AplusAt, $Cshared), Cvoid,
          (Ptr{Float64}, Ptr{Float64}, Clong),
          B, A, size(B,1))
end
end
7 changes: 7 additions & 0 deletions benchmark/looptests.c
Original file line number Diff line number Diff line change
Expand Up @@ -171,4 +171,11 @@ double OLSlp(double* restrict y, double* restrict X, double* restrict b, long N,
return lp;
}

/*
 * Store A + A^T into B.
 *
 * A and B are N-by-N matrices held in contiguous storage of N*N doubles.
 * For every index pair the element of A and its mirror across the diagonal
 * are added and written to the matching position of B.
 */
void AplusAt(double* restrict B, double* restrict A, long N){
    for (long outer = 0; outer < N; outer++){
        /* Offset of the first element of the current contiguous run. */
        const long base = outer*N;
        for (long inner = 0; inner < N; inner++){
            B[base + inner] = A[base + inner] + A[outer + inner*N];
        }
    }
}

19 changes: 18 additions & 1 deletion benchmark/looptests.f90
Original file line number Diff line number Diff line change
Expand Up @@ -236,4 +236,21 @@ subroutine OLSlp(lp, y, X, b, N, P) BIND(C, name="OLSlp")
lp = lp + d*d
end do
end subroutine OLSlp
end module looptests
! Store A + transpose(A) into B using an explicit loop nest.
!   B : N-by-N output matrix
!   A : N-by-N input matrix
!   N : side length of both matrices
! Exported under the C symbol name "AplusAt".
subroutine AplusAt(B, A, N) BIND(C, name="AplusAt")
  integer(C_long), intent(in) :: N
  real(C_double), dimension(N,N), intent(in) :: A
  real(C_double), dimension(N,N), intent(out) :: B
  integer(C_long) :: col, row
  do concurrent(col = 1:N)
    ! The inner loop walks the first array index.
    do concurrent(row = 1:N)
      B(row,col) = A(row,col) + A(col,row)
    end do
  end do
end subroutine AplusAt
! Store A + transpose(A) into B using the intrinsic `transpose` and a whole-array
! expression instead of an explicit loop nest.
!   B : N-by-N output matrix
!   A : N-by-N input matrix
!   N : side length of both matrices
! Exported under the C symbol name "AplusAtbuiltin".
subroutine AplusAtbuiltin(B, A, N) BIND(C, name="AplusAtbuiltin")
integer(C_long), intent(in) :: N
real(C_double), dimension(N,N), intent(out) :: B
real(C_double), dimension(N,N), intent(in) :: A
B = A + transpose(A)
end subroutine AplusAtbuiltin
end module looptests
2 changes: 1 addition & 1 deletion src/lowering.jl
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ function lower_compute!(
# making BitArrays inefficient.
# parentsyms = [opp.variable for opp ∈ parents(op)]
Uiter = opunrolled ? U - 1 : 0
maskreduct = mask !== nothing && isreduction(op) && any(opp -> opp.variable === var, parents_op)
maskreduct = mask !== nothing && isreduction(op) && vectorized ∈ reduceddependencies(op) #any(opp -> opp.variable === var, parents_op)
# if a parent is not unrolled, the compiler should handle broadcasting CSE.
# because unrolled/tiled parents result in an unrolled/tiled dependency,
# we handle both the tiled and untiled case here.
Expand Down

2 comments on commit 81b41c9

@chriselrod
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/8072

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if Julia TagBot is installed, or can be done manually through the github interface, or via:

git tag -a v0.3.7 -m "<description of version>" 81b41c959f48322cc778e20ccacb6a53db36986a
git push origin v0.3.7

Please sign in to comment.