From da1161b9806d630d989ecb8c2162cf3522b69ee9 Mon Sep 17 00:00:00 2001 From: Chris Elrod Date: Tue, 10 Jan 2023 15:04:41 -0500 Subject: [PATCH] format --- .JuliaFormatter.toml | 8 +- benchmark/benchmarkflops.jl | 7 +- benchmark/benchmarks.jl | 3 +- benchmark/driver.jl | 38 +- benchmark/loadsharedlibs.jl | 303 +++++++---- benchmark/looptests.jl | 55 +- benchmark/plotbenchmarks.jl | 42 +- docs/make.jl | 10 +- src/LoopVectorization.jl | 31 +- src/broadcast.jl | 264 +++++++--- src/codegen/line_number_nodes.jl | 15 +- src/codegen/loopstartstopmanager.jl | 240 ++++++--- src/codegen/lower_compute.jl | 168 ++++-- src/codegen/lower_constant.jl | 68 ++- src/codegen/lower_load.jl | 161 ++++-- src/codegen/lower_memory_common.jl | 217 ++++++-- src/codegen/lower_store.jl | 86 +++- src/codegen/lower_threads.jl | 339 ++++++++---- src/codegen/lowering.jl | 393 ++++++++++---- src/codegen/operation_evaluation_order.jl | 53 +- src/codegen/split_loops.jl | 76 ++- src/condense_loopset.jl | 223 +++++--- src/constructors.jl | 131 ++++- src/getconstindexes.jl | 2 +- src/modeling/costs.jl | 96 ++-- src/modeling/determinestrategy.jl | 223 +++++--- src/modeling/graphs.jl | 485 ++++++++++++------ src/modeling/operations.jl | 104 ++-- src/parse/add_compute.jl | 164 ++++-- src/parse/add_constants.jl | 44 +- src/parse/add_ifelse.jl | 89 +++- src/parse/add_loads.jl | 36 +- src/parse/add_stores.jl | 36 +- src/parse/memory_ops_common.jl | 263 +++++++--- src/predicates.jl | 3 +- src/reconstruct_loopset.jl | 371 ++++++++++---- src/simdfunctionals/filter.jl | 25 +- src/simdfunctionals/map.jl | 145 ++++-- src/simdfunctionals/mapreduce.jl | 45 +- src/simdfunctionals/vmap_grad_forwarddiff.jl | 2 +- src/simdfunctionals/vmap_grad_rrule.jl | 17 +- src/transforms.jl | 38 +- src/user_api_conveniences.jl | 40 +- src/vectorizationbase_compat/contract_pass.jl | 56 +- src/vectorizationbase_compat/subsetview.jl | 31 +- test/manyarrayrefs.jl | 30 +- utils/generate_costs.jl | 36 +- utils/generate_precompiles.jl | 16 +- 48 files changed, 3698 insertions(+), 1630 deletions(-) diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml index 5c2cf2f06..b74ff2c91 100644 --- a/.JuliaFormatter.toml +++ b/.JuliaFormatter.toml @@ -1 +1,7 @@ -indent = 2 \ No newline at end of file +indent = 2 +margin = 80 +remove_extra_newlines = true +long_to_short_function_def = true +format_docstrings = true +trailing_comma = false +separate_kwargs_with_semicolon = true diff --git a/benchmark/benchmarkflops.jl b/benchmark/benchmarkflops.jl index cbaf4fde2..8831931b3 100644 --- a/benchmark/benchmarkflops.jl +++ b/benchmark/benchmarkflops.jl @@ -28,8 +28,8 @@ function Base.vcat(br1::BenchmarkResult, br2::BenchmarkResult) br1.tests, SizedResults( hcat(br1.sizedresults.results, br2.sizedresults.results), - vcat(br1.sizedresults.sizes, br2.sizedresults.sizes), - ), + vcat(br1.sizedresults.sizes, br2.sizedresults.sizes) + ) ) end @@ -119,7 +119,6 @@ function At_mul_Bt_bench!(br, s, i) matmul_bench!(br, C, A, B, i) end - function dot_bench!(br, s, i) a = rand(s) b = rand(s) @@ -440,7 +439,6 @@ function logdettriangle_bench!(br, s, i) br[5+2INTEL_BENCH, i] = n_gflop / @belapsed logdet($U) end - function filter2d_bench_run!(br, s, i, K) A = rand(s + 2, s + 2) B = OffsetArray(similar(A, (s, s)), 1, 1) @@ -463,7 +461,6 @@ function filter2d_bench_run!(br, s, i, K) end end - function filter2dunrolled_bench_run!(br, s, i, K) A = rand(s + 2, s + 2) B = OffsetArray(similar(A, (s, s)), 1, 1) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 4cd95d96f..715703739 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -14,7 +14,8 @@ for n ∈ 1:64 B = rand(n, n) C = Matrix{Float64}(undef, n, n) SUITE["linalg"]["matmul"]["AmulB", n] = @benchmarkable gemmavx!($C, $A, $B) - SUITE["linalg"]["matmul"]["A′mulB", n] = @benchmarkable jAtmulBavx!($C, $A′, $B) + SUITE["linalg"]["matmul"]["A′mulB", n] = + @benchmarkable jAtmulBavx!($C, $A′, $B) x = rand(n) y = rand(n) SUITE["linalg"]["dot"]["dot", n] = @benchmarkable jdotavx($x, $y) diff --git a/benchmark/driver.jl b/benchmark/driver.jl index ad605982d..cbcaa8286 100644 --- a/benchmark/driver.jl +++ b/benchmark/driver.jl @@ -7,21 +7,26 @@ const LOOPVECBENCHDIR = joinpath(pkgdir(LoopVectorization), "benchmark") include(joinpath(LOOPVECBENCHDIR, "benchmarkflops.jl")) include(joinpath(LOOPVECBENCHDIR, "plotbenchmarks.jl")) - nprocs_to_add() = ((Sys.CPU_THREADS)::Int >> 1) # nprocs_to_add() = ((Sys.CPU_THREADS)::Int >> 1) - 1 -start_worker(wid) = remotecall(include, wid, joinpath(LOOPVECBENCHDIR, "setup_worker.jl")) +start_worker(wid) = + remotecall(include, wid, joinpath(LOOPVECBENCHDIR, "setup_worker.jl")) function start_workers(nprocs = nprocs_to_add()) - addprocs(nprocs, exeflags = "--project=$(Base.active_project())") + addprocs(nprocs; exeflags = "--project=$(Base.active_project())") foreach(wait, map(start_worker, workers())) end stop_workers() = rmprocs(workers()) - function blastests() tests = ["LoopVectorization", "Julia", "Clang", "GFortran"] INTEL_BENCH && push!(tests, "icc", "ifort") - push!(tests, "g++ & Eigen-3", "clang++ & Eigen-3", "GFortran-builtin", "OpenBLAS") + push!( + tests, + "g++ & Eigen-3", + "clang++ & Eigen-3", + "GFortran-builtin", + "OpenBLAS" + ) INTEL_BENCH && push!(tests, "ifort-builtin") MKL_BENCH && push!(tests, "MKL") tests @@ -166,7 +171,10 @@ function benchmark_random_access(sizes) INTEL_BENCH && push!(tests, "icc", "ifort") start_workers() sm = SharedMatrix(Matrix{Float64}(undef, length(tests), length(sizes))) - @showprogress pmap(is -> randomaccess_bench!(sm, is[2], is[1]), enumerate(sizes)) + @showprogress pmap( + is -> randomaccess_bench!(sm, is[2], is[1]), + enumerate(sizes) + ) br = BenchmarkResult(Matrix(sm), tests, sizes) stop_workers() br @@ -178,7 +186,10 @@ function benchmark_logdettriangle(sizes) push!(tests, "LinearAlgebra") start_workers() sm = SharedMatrix(Matrix{Float64}(undef, length(tests), length(sizes))) - @showprogress pmap(is -> logdettriangle_bench!(sm, is[2], is[1]), enumerate(sizes)) + @showprogress pmap( + is -> logdettriangle_bench!(sm, is[2], is[1]), + enumerate(sizes) + ) br = BenchmarkResult(Matrix(sm), tests, sizes) stop_workers() br @@ -188,7 +199,10 @@ function benchmark_filter2d(sizes, K) INTEL_BENCH && push!(tests, "icc", "ifort") start_workers() sm = SharedMatrix(Matrix{Float64}(undef, length(tests), length(sizes))) - @showprogress pmap(is -> filter2d_bench_run!(sm, is[2], is[1], K), enumerate(sizes)) + @showprogress pmap( + is -> filter2d_bench_run!(sm, is[2], is[1], K), + enumerate(sizes) + ) br = BenchmarkResult(Matrix(sm), tests, sizes) stop_workers() br @@ -209,15 +223,13 @@ function benchmark_filter2dunrolled(sizes) K = SizedOffsetMatrix{Float64,-1,1,-1,1}(rand(3, 3)) @showprogress pmap( is -> filter2dunrolled_bench_run!(sm, is[2], is[1], K), - enumerate(sizes), + enumerate(sizes) ) br = BenchmarkResult(Matrix(sm), tests, sizes) stop_workers() br end - - # sizes = 23:23 sizes = 256:-1:2 longsizes = 1024:-1:2 @@ -287,7 +299,8 @@ const v = 2 # using Cairo, Fontconfig const PICTURES = joinpath(pkgdir(LoopVectorization), "docs", "src", "assets") # saveplot(f, br) = draw(PNG(joinpath(PICTURES, f * "$v.png"), 12inch, 8inch), plot(br)) -saveplot(f, br) = draw(SVG(joinpath(PICTURES, f * "$v.svg"), 12inch, 8inch), plot(br)) +saveplot(f, br) = + draw(SVG(joinpath(PICTURES, f * "$v.svg"), 12inch, 8inch), plot(br)) # If only rerunning a few, remove them from load. # @load "benchmarkresults.jld2" logdettriangle_bench filter2d_dynamic_bench filter2d_3x3_bench filter2d_unrolled_bench dot_bench selfdot_bench dot3_bench sse_bench aplusBc_bench AplusAt_bench vexp_bench randomaccess_bench AmulB_bench AmulBt_bench AtmulB_bench AtmulBt_bench Amulvb_bench Atmulvb_bench @@ -305,7 +318,6 @@ saveplot("bench_AtmulBt_v", AtmulBt_bench); saveplot("bench_Amulvb_v", Amulvb_bench); saveplot("bench_Atmulvb_v", Atmulvb_bench); - saveplot("bench_logdettriangle_v", logdettriangle_bench); saveplot("bench_filter2d_dynamic_v", filter2d_dynamic_bench); saveplot("bench_filter2d_3x3_v", filter2d_3x3_bench); diff --git a/benchmark/loadsharedlibs.jl b/benchmark/loadsharedlibs.jl index e54bd29eb..65656b7df 100644 --- a/benchmark/loadsharedlibs.jl +++ b/benchmark/loadsharedlibs.jl @@ -12,17 +12,16 @@ const LIBIFTEST = joinpath(LOOPVECBENCHDIR, "libiftests.so") const LIBEIGENTEST = joinpath(LOOPVECBENCHDIR, "libetest.so") const LIBIEIGENTEST = joinpath(LOOPVECBENCHDIR, "libietest.so") - # requires Clang with polly to build cfile = joinpath(LOOPVECBENCHDIR, "looptests.c") if !isfile(LIBCTEST) || mtime(cfile) > mtime(LIBCTEST) if (Sys.ARCH === :aarch64) && Sys.isapple() # assume no `-march=native` support run( - `clang -Ofast -mprefer-vector-width=$(8REGISTER_SIZE) -lm -shared -fPIC $cfile -o $LIBCTEST`, + `clang -Ofast -mprefer-vector-width=$(8REGISTER_SIZE) -lm -shared -fPIC $cfile -o $LIBCTEST` ) else run( - `clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -shared -fPIC $cfile -o $LIBCTEST`, + `clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -shared -fPIC $cfile -o $LIBCTEST` ) end @@ -33,11 +32,11 @@ if !isfile(LIBFTEST) || mtime(ffile) > mtime(LIBFTEST) # --param max-unroll-times defaults to ≥8, which is generally excessive if (Sys.ARCH === :x86_64) run( - `gfortran -Ofast -march=native -funroll-loops -mprefer-vector-width=$(8REGISTER_SIZE) -fvariable-expansion-in-unroller --param max-variable-expansions-in-unroller=4 -shared -fPIC $ffile -o $LIBFTEST`, + `gfortran -Ofast -march=native -funroll-loops -mprefer-vector-width=$(8REGISTER_SIZE) -fvariable-expansion-in-unroller --param max-variable-expansions-in-unroller=4 -shared -fPIC $ffile -o $LIBFTEST` ) else run( - `gfortran -Ofast -march=native -funroll-loops -fvariable-expansion-in-unroller --param max-variable-expansions-in-unroller=4 -shared -fPIC $ffile -o $LIBFTEST`, + `gfortran -Ofast -march=native -funroll-loops -fvariable-expansion-in-unroller --param max-variable-expansions-in-unroller=4 -shared -fPIC $ffile -o $LIBFTEST` ) end # run(`gfortran -Ofast -march=native -funroll-loops -floop-nest-optimize -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $ffile -o $LIBFTEST`) @@ -46,12 +45,12 @@ end const INTEL_BENCH = try if !isfile(LIBIFTEST) || mtime(ffile) > mtime(LIBIFTEST) run( - `ifort -fast -qopt-zmm-usage=high -qoverride-limits -shared -fPIC $ffile -o $LIBIFTEST`, + `ifort -fast -qopt-zmm-usage=high -qoverride-limits -shared -fPIC $ffile -o $LIBIFTEST` ) end if !isfile(LIBICTEST) || mtime(cfile) > mtime(LIBICTEST) run( - `icc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -shared -fPIC $cfile -o $LIBICTEST`, + `icc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -shared -fPIC $cfile -o $LIBICTEST` ) end true @@ -65,15 +64,15 @@ if !isfile(LIBEIGENTEST) || mtime(eigenfile) > mtime(LIBEIGENTEST) # Clang seems to have trouble finding includes if Bool(LoopVectorization.VectorizationBase.has_feature(Val(:x86_64_avx512f))) run( - `g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`, + `g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST` ) elseif (Sys.ARCH === :aarch64) && Sys.isapple() # assume homebrew run( - `g++-10 -O3 -march=native -I/opt/homebrew/Cellar/eigen/3.3.9/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`, + `g++-10 -O3 -march=native -I/opt/homebrew/Cellar/eigen/3.3.9/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST` ) else run( - `g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`, + `g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST` ) end end @@ -81,15 +80,15 @@ if !isfile(LIBIEIGENTEST) || mtime(eigenfile) > mtime(LIBIEIGENTEST) # run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/c++/9 -I/usr/include/c++/9/x86_64-generic-linux -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`) if Bool(LoopVectorization.VectorizationBase.has_feature(Val(:x86_64_avx512f))) run( - `clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`, + `clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST` ) elseif (Sys.ARCH === :aarch64) && Sys.isapple() # assume homebrew and no `-march=native` run( - `clang++ -Ofast -I/opt/homebrew/Cellar/eigen/3.3.9/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`, + `clang++ -Ofast -I/opt/homebrew/Cellar/eigen/3.3.9/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST` ) else run( - `clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`, + `clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST` ) end # run(`icpc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`) @@ -104,7 +103,6 @@ end # # run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`) # end - randa(::Type{T}, dim...) where {T} = rand(T, dim...) randa(::Type{T}, dim...) where {T<:Signed} = rand(T(-100):T(200), dim...) @@ -127,7 +125,8 @@ const libOpenBLAS = Libdl.dlopen(OpenBLAS_jll.libopenblas) const DGEMM_OpenBLAS = Libdl.dlsym(libOpenBLAS, :dgemm_64_) const SGEMM_OpenBLAS = Libdl.dlsym(libOpenBLAS, :sgemm_64_) const DGEMV_OpenBLAS = Libdl.dlsym(libOpenBLAS, :dgemv_64_) -const OPENBLAS_SET_NUM_THREADS = Libdl.dlsym(libOpenBLAS, :openblas_set_num_threads64_) +const OPENBLAS_SET_NUM_THREADS = + Libdl.dlsym(libOpenBLAS, :openblas_set_num_threads64_) istransposed(x) = 'N' istransposed(x::Adjoint{<:Real}) = 'T' @@ -137,7 +136,11 @@ for (lib, f) ∈ [(:GEMM_MKL, :gemmmkl!), (:GEMM_OpenBLAS, :gemmopenblas!)] for (T, prefix) ∈ [(Float32, :S), (Float64, :D)] fm = Symbol(prefix, lib) @eval begin - function $f(C::AbstractMatrix{$T}, A::AbstractMatrix{$T}, B::AbstractMatrix{$T}) + function $f( + C::AbstractMatrix{$T}, + A::AbstractMatrix{$T}, + B::AbstractMatrix{$T} + ) transA = istransposed(A) transB = istransposed(B) M, N = size(C) @@ -165,7 +168,7 @@ for (lib, f) ∈ [(:GEMM_MKL, :gemmmkl!), (:GEMM_OpenBLAS, :gemmopenblas!)] Ref{Int64}, Ref{$T}, Ref{$T}, - Ref{Int64}, + Ref{Int64} ), transA, transB, @@ -179,17 +182,19 @@ for (lib, f) ∈ [(:GEMM_MKL, :gemmmkl!), (:GEMM_OpenBLAS, :gemmopenblas!)] ldB, β, C, - ldC, + ldC ) end end end end if MKL_BENCH - mkl_set_num_threads(N::Integer) = ccall(MKL_SET_NUM_THREADS, Cvoid, (Int32,), N % Int32) + mkl_set_num_threads(N::Integer) = + ccall(MKL_SET_NUM_THREADS, Cvoid, (Int32,), N % Int32) mkl_set_num_threads(1) end -openblas_set_num_threads(N::Integer) = ccall(OPENBLAS_SET_NUM_THREADS, Cvoid, (Int64,), N) +openblas_set_num_threads(N::Integer) = + ccall(OPENBLAS_SET_NUM_THREADS, Cvoid, (Int64,), N) openblas_set_num_threads(1) function dgemvmkl!( @@ -197,7 +202,7 @@ function dgemvmkl!( A::AbstractMatrix{Float64}, x::AbstractVector{Float64}, α = 1.0, - β = 0.0, + β = 0.0 ) transA = istransposed(A) pA = parent(A) @@ -219,7 +224,7 @@ function dgemvmkl!( Ref{Int64}, Ref{Float64}, Ref{Float64}, - Ref{Int64}, + Ref{Int64} ), transA, M, @@ -231,13 +236,13 @@ function dgemvmkl!( incx, β, y, - incy, + incy ) end function dgemvopenblas!( y::AbstractVector{Float64}, A::AbstractMatrix{Float64}, - x::AbstractVector{Float64}, + x::AbstractVector{Float64} ) transA = istransposed(A) pA = parent(A) @@ -261,7 +266,7 @@ function dgemvopenblas!( Ref{Int64}, Ref{Float64}, Ref{Float64}, - Ref{Int64}, + Ref{Int64} ), transA, M, @@ -273,11 +278,10 @@ function dgemvopenblas!( incx, β, y, - incy, + incy ) end - for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST)) @eval function $(Symbol(prefix, :egemm!))(C, A, B) M, N = size(C) @@ -291,7 +295,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST)) B, M, K, - N, + N ) end let (p, s) = (:e, Eshared) @@ -307,7 +311,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST)) B, M, K, - N, + N ) end @eval function $(Symbol(prefix, p, :gemm!))(C, A, B::Adjoint) @@ -322,7 +326,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST)) parent(B), M, K, - N, + N ) end @eval function $(Symbol(prefix, p, :gemm!))(C, A::Adjoint, B::Adjoint) @@ -337,7 +341,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST)) parent(B), M, K, - N, + N ) end @eval function $(Symbol(prefix, p, :dot))(a, b) @@ -354,7 +358,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST)) A, x, M, - K, + K ) end @eval function $(Symbol(prefix, p, :selfdot))(a) @@ -371,7 +375,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST)) A, y, M, - N, + N ) end @eval function $(Symbol(prefix, p, :gemv!))(y, A::Adjoint, x) @@ -384,7 +388,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST)) parent(A), x, M, - K, + K ) end @eval function $(Symbol(prefix, p, :aplusBc!))(D, a, B, c) @@ -398,7 +402,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST)) B, c, M, - K, + K ) end @eval function $(Symbol(prefix, p, :OLSlp))(y, X, β) @@ -411,7 +415,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST)) X, β, N, - P, + P ) end @eval function $(Symbol(prefix, p, :AplusAt!))(B, A) @@ -419,7 +423,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST)) ccall((:AplusAt, $s), Cvoid, (Ptr{Float64}, Ptr{Float64}, Clong), B, A, N) end @eval function $(Symbol(prefix, p, :logdettriangle))( - T::Union{LowerTriangular,UpperTriangular}, + T::Union{LowerTriangular,UpperTriangular} ) N = size(T, 1) Tp = parent(T) @@ -447,7 +451,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define B, M, K, - N, + N ) end @eval function $(Symbol(prefix, :f, gemm, :!))(C, A, B) @@ -456,31 +460,47 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define ccall( ($(QuoteNode(gemm)), $Fshared), Cvoid, - (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}), + ( + Ptr{Float64}, + Ptr{Float64}, + Ptr{Float64}, + Ref{Clong}, + Ref{Clong}, + Ref{Clong} + ), C, A, B, Ref(M), Ref(K), - Ref(N), + Ref(N) ) end end - @eval $(Symbol(prefix, :cgemm!))(C, A, B) = $(Symbol(prefix, :cgemm_nkm!))(C, A, B) - @eval $(Symbol(prefix, :fgemm!))(C, A, B) = $(Symbol(prefix, :fgemm_nkm!))(C, A, B) + @eval $(Symbol(prefix, :cgemm!))(C, A, B) = + $(Symbol(prefix, :cgemm_nkm!))(C, A, B) + @eval $(Symbol(prefix, :fgemm!))(C, A, B) = + $(Symbol(prefix, :fgemm_nkm!))(C, A, B) @eval function $(Symbol(prefix, :fgemm_builtin!))(C, A, B) M, N = size(C) K = size(B, 1) ccall( (:gemmbuiltin, $Fshared), Cvoid, - (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}), + ( + Ptr{Float64}, + Ptr{Float64}, + Ptr{Float64}, + Ref{Clong}, + Ref{Clong}, + Ref{Clong} + ), C, A, B, Ref(M), Ref(K), - Ref(N), + Ref(N) ) end let (p, s) = (:c, Cshared) @@ -496,7 +516,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define B, M, K, - N, + N ) end end @@ -506,13 +526,20 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define ccall( (:AtmulB, $Fshared), Cvoid, - (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}), + ( + Ptr{Float64}, + Ptr{Float64}, + Ptr{Float64}, + Ref{Clong}, + Ref{Clong}, + Ref{Clong} + ), C, parent(A), B, Ref(M), Ref(K), - Ref(N), + Ref(N) ) end @eval function $(Symbol(prefix, :fgemm_builtin!))(C, A::Adjoint, B) @@ -521,13 +548,20 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define ccall( (:AtmulBbuiltin, $Fshared), Cvoid, - (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}), + ( + Ptr{Float64}, + Ptr{Float64}, + Ptr{Float64}, + Ref{Clong}, + Ref{Clong}, + Ref{Clong} + ), C, parent(A), B, Ref(M), Ref(K), - Ref(N), + Ref(N) ) end let (p, s) = (:c, Cshared)# (:e,Eshared)] @@ -543,7 +577,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define parent(B), M, K, - N, + N ) end end @@ -553,13 +587,20 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define ccall( (:AmulBt, $Fshared), Cvoid, - (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}), + ( + Ptr{Float64}, + Ptr{Float64}, + Ptr{Float64}, + Ref{Clong}, + Ref{Clong}, + Ref{Clong} + ), C, A, parent(B), Ref(M), Ref(K), - Ref(N), + Ref(N) ) end @eval function $(Symbol(prefix, :fgemm_builtin!))(C, A, B::Adjoint) @@ -568,13 +609,20 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define ccall( (:AmulBtbuiltin, $Fshared), Cvoid, - (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}), + ( + Ptr{Float64}, + Ptr{Float64}, + Ptr{Float64}, + Ref{Clong}, + Ref{Clong}, + Ref{Clong} + ), C, A, parent(B), Ref(M), Ref(K), - Ref(N), + Ref(N) ) end @eval function $(Symbol(prefix, :fgemm!))(C, A::Adjoint, B::Adjoint) @@ -583,13 +631,20 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define ccall( (:AtmulBt, $Fshared), Cvoid, - (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}), + ( + Ptr{Float64}, + Ptr{Float64}, + Ptr{Float64}, + Ref{Clong}, + Ref{Clong}, + Ref{Clong} + ), C, parent(A), parent(B), Ref(M), Ref(K), - Ref(N), + Ref(N) ) end @eval function $(Symbol(prefix, :fgemm_builtin!))(C, A::Adjoint, B::Adjoint) @@ -598,18 +653,32 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define ccall( (:AtmulBtbuiltin, $Fshared), Cvoid, - (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}), + ( + Ptr{Float64}, + Ptr{Float64}, + Ptr{Float64}, + Ref{Clong}, + Ref{Clong}, + Ref{Clong} + ), C, parent(A), parent(B), Ref(M), Ref(K), - Ref(N), + Ref(N) ) end @eval function $(Symbol(prefix, :fdot))(a, b) N = length(a) - ccall((:dot, $Fshared), Float64, (Ptr{Float64}, Ptr{Float64}, Ref{Clong}), a, b, Ref(N)) + ccall( + (:dot, $Fshared), + Float64, + (Ptr{Float64}, Ptr{Float64}, Ref{Clong}), + a, + b, + Ref(N) + ) end @eval function $(Symbol(prefix, :fselfdot))(a) @@ -626,7 +695,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define A, y, Ref(M), - Ref(N), + Ref(N) ) end @eval function $(Symbol(prefix, :fgemv!))(y, A, x) @@ -639,7 +708,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define A, x, Ref(M), - Ref(K), + Ref(K) ) end @eval function $(Symbol(prefix, :fgemv_builtin!))(y, A, x) @@ -652,7 +721,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define A, x, Ref(M), - Ref(K), + Ref(K) ) end @eval function $(Symbol(prefix, :fgemv!))(y, A::Adjoint, x) @@ -665,7 +734,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define parent(A), x, Ref(M), - Ref(K), + Ref(K) ) end @eval function $(Symbol(prefix, :fgemv_builtin!))(y, A::Adjoint, x) @@ -678,7 +747,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define parent(A), x, Ref(M), - Ref(K), + Ref(K) ) end let (p, s) = (:c, Cshared)# (:e,Eshared)] @@ -694,7 +763,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define parent(B), M, K, - N, + N ) end @eval function $(Symbol(prefix, p, :dot))(a, b) @@ -711,7 +780,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define A, x, M, - K, + K ) end @eval function $(Symbol(prefix, p, :selfdot))(a) @@ -728,7 +797,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define A, y, M, - N, + N ) end @eval function $(Symbol(prefix, p, :gemv!))(y, A::Adjoint, x) @@ -741,7 +810,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define parent(A), x, M, - K, + K ) end @eval function $(Symbol(prefix, p, :aplusBc!))(D, a, B, c) @@ -755,7 +824,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define B, c, M, - K, + K ) end @eval function $(Symbol(prefix, p, :OLSlp))(y, X, β) @@ -768,7 +837,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define X, β, N, - P, + P ) end @eval function $(Symbol(prefix, p, :AplusAt!))(B, A) @@ -776,7 +845,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define ccall((:AplusAt, $s), Cvoid, (Ptr{Float64}, Ptr{Float64}, Clong), B, A, N) end @eval function $(Symbol(prefix, p, :logdettriangle))( - T::Union{LowerTriangular,UpperTriangular}, + T::Union{LowerTriangular,UpperTriangular} ) N = size(T, 1) Tp = parent(T) @@ -788,13 +857,20 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define ccall( (:aplusBc, $Fshared), Cvoid, - (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}), + ( + Ptr{Float64}, + Ptr{Float64}, + Ptr{Float64}, + Ptr{Float64}, + Ref{Clong}, + Ref{Clong} + ), D, a, B, c, Ref(M), - Ref(K), + Ref(K) ) end @eval function $(Symbol(prefix, :fOLSlp))(y, X, β) @@ -807,21 +883,42 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define X, β, Ref(N), - Ref(P), + Ref(P) ) end @eval function $(Symbol(prefix, :cvexp!))(b, a) N = length(b) - ccall((:vexp, $Cshared), Cvoid, (Ptr{Float64}, Ptr{Float64}, Clong), b, a, N) + ccall( + (:vexp, $Cshared), + Cvoid, + (Ptr{Float64}, Ptr{Float64}, Clong), + b, + a, + N + ) end @eval function $(Symbol(prefix, :fvexp!))(b, a) N = length(b) - ccall((:vexp, $Fshared), Cvoid, (Ptr{Float64}, Ptr{Float64}, Ref{Clong}), b, a, Ref(N)) + ccall( + (:vexp, $Fshared), + Cvoid, + (Ptr{Float64}, Ptr{Float64}, Ref{Clong}), + b, + a, + Ref(N) + ) end @eval function $(Symbol(prefix, :fvexpsum))(a) N = length(a) s = Ref{Float64}() - ccall((:svexp, $Fshared), Cvoid, (Ref{Float64}, Ptr{Float64}, Ref{Clong}), s, a, Ref(N)) + ccall( + (:svexp, $Fshared), + Cvoid, + (Ref{Float64}, Ptr{Float64}, Ref{Clong}), + s, + a, + Ref(N) + ) s[] end @eval function $(Symbol(prefix, :fAplusAt!))(B, A) @@ -832,7 +929,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define (Ptr{Float64}, Ptr{Float64}, Ref{Clong}), B, A, - Ref(N), + Ref(N) ) end @eval function $(Symbol(prefix, :fAplusAt_builtin!))(B, A) @@ -843,7 +940,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define (Ptr{Float64}, Ptr{Float64}, Ref{Clong}), B, A, - Ref(N), + Ref(N) ) end @@ -857,7 +954,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define basis, coefs, A, - C, + C ) end @eval function $(Symbol(prefix, :frandomaccess))(P, basis, coefs) @@ -870,20 +967,26 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define basis, coefs, Ref(A), - Ref(C), + Ref(C) ) end @eval function $(Symbol(prefix, :flogdettriangle))( - T::Union{LowerTriangular,UpperTriangular}, + T::Union{LowerTriangular,UpperTriangular} ) N = size(T, 1) Tp = parent(T) - ccall((:logdettriangle, $Fshared), Float64, (Ptr{Float64}, Ref{Clong}), Tp, Ref(N)) + ccall( + (:logdettriangle, $Fshared), + Float64, + (Ptr{Float64}, Ref{Clong}), + Tp, + Ref(N) + ) end @eval function $(Symbol(prefix, :cfilter2d!))( B::OffsetArray, A::AbstractArray, - K::OffsetArray, + K::OffsetArray ) Ma, Na = size(A) offset = first(B.offsets) @@ -896,32 +999,39 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define parent(K), Ma, Na, - offset, + offset ) end @eval function $(Symbol(prefix, :ffilter2d!))( B::OffsetArray, A::AbstractArray, - K::OffsetArray, + K::OffsetArray ) Ma, Na = size(A) offset = first(B.offsets) ccall( (:filter2d, $Fshared), Cvoid, - (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}), + ( + Ptr{Float64}, + Ptr{Float64}, + Ptr{Float64}, + Ref{Clong}, + Ref{Clong}, + Ref{Clong} + ), parent(B), A, parent(K), Ref(Ma), Ref(Na), - Ref(offset), + Ref(offset) ) end @eval function $(Symbol(prefix, :cfilter2d!))( B::OffsetArray, A::AbstractArray, - K::SizedOffsetMatrix{Float64,-1,1,-1,1}, + K::SizedOffsetMatrix{Float64,-1,1,-1,1} ) Ma, Na = size(A) ccall( @@ -932,13 +1042,13 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define A, K, Ma, - Na, + Na ) end @eval function $(Symbol(prefix, :ffilter2d!))( B::OffsetArray, A::AbstractArray, - K::SizedOffsetMatrix{Float64,-1,1,-1,1}, + K::SizedOffsetMatrix{Float64,-1,1,-1,1} ) Ma, Na = size(A) ccall( @@ -949,13 +1059,13 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define A, K, Ref(Ma), - Ref(Na), + Ref(Na) ) end @eval function $(Symbol(prefix, :cfilter2dunrolled!))( B::OffsetArray, A::AbstractArray, - K::SizedOffsetMatrix{Float64,-1,1,-1,1}, + K::SizedOffsetMatrix{Float64,-1,1,-1,1} ) Ma, Na = size(A) ccall( @@ -966,13 +1076,13 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define A, K, Ma, - Na, + Na ) end @eval function $(Symbol(prefix, :ffilter2dunrolled!))( B::OffsetArray, A::AbstractArray, - K::SizedOffsetMatrix{Float64,-1,1,-1,1}, + K::SizedOffsetMatrix{Float64,-1,1,-1,1} ) Ma, Na = size(A) ccall( @@ -983,8 +1093,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define A, K, Ref(Ma), - Ref(Na), + Ref(Na) ) end - end diff --git a/benchmark/looptests.jl b/benchmark/looptests.jl index 3c8d4223b..a5f417afb 100644 --- a/benchmark/looptests.jl +++ b/benchmark/looptests.jl @@ -1,7 +1,6 @@ using LoopVectorization, LinearAlgebra, OffsetArrays, ArrayInterface BLAS.set_num_threads(1) - using LoopVectorization: Static # TODO: remove this once this PR merges: https://github.com/JuliaArrays/OffsetArrays.jl/pull/170 @inline Base.unsafe_convert(::Type{Ptr{T}}, A::OffsetArray{T}) where {T} = @@ -15,20 +14,24 @@ Base.size(::SizedOffsetMatrix{<:Any,LR,UR,LC,UC}) where {LR,UR,LC,UC} = Base.axes(::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (StaticInt{LR}():StaticInt{UR}(), StaticInt{LC}():StaticInt{UC}()) Base.parent(A::SizedOffsetMatrix) = A.data -Base.unsafe_convert(::Type{Ptr{T}}, A::SizedOffsetMatrix{T}) where {T} = pointer(A.data) +Base.unsafe_convert(::Type{Ptr{T}}, A::SizedOffsetMatrix{T}) where {T} = + pointer(A.data) ArrayInterface.contiguous_axis(::Type{<:SizedOffsetMatrix}) = StaticInt(1) ArrayInterface.contiguous_batch_size(::Type{<:SizedOffsetMatrix}) = StaticInt(0) -ArrayInterface.stride_rank(::Type{<:SizedOffsetMatrix}) = (StaticInt(1), StaticInt(2)) -function ArrayInterface.strides(A::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} +ArrayInterface.stride_rank(::Type{<:SizedOffsetMatrix}) = + (StaticInt(1), StaticInt(2)) +function ArrayInterface.strides( + A::SizedOffsetMatrix{T,LR,UR,LC,UC} +) where {T,LR,UR,LC,UC} (StaticInt{1}(), (StaticInt{UR}() - StaticInt{LR}() + StaticInt{1}())) end -ArrayInterface.offsets(A::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = - (StaticInt{LR}(), StaticInt{LC}()) +ArrayInterface.offsets( + A::SizedOffsetMatrix{T,LR,UR,LC,UC} +) where {T,LR,UR,LC,UC} = (StaticInt{LR}(), StaticInt{LC}()) ArrayInterface.parent_type(::Type{<:SizedOffsetMatrix{T}}) where {T} = Matrix{T} Base.getindex(A::SizedOffsetMatrix, i, j) = LoopVectorization.vload(LoopVectorization.stridedpointer(A), (i, j)) - function jgemm!(𝐂, 𝐀, 𝐁) 𝐂 .= 0 M, N = size(𝐂) @@ -72,19 +75,17 @@ function jgemm!(𝐂, 𝐀ᵀ::Adjoint, 𝐁ᵀ::Adjoint) end end end -function gemmavx!(𝐂, 𝐀, 𝐁) - @turbo for m ∈ indices((𝐀, 𝐂), 1), n ∈ indices((𝐁, 𝐂), 2) +gemmavx!(𝐂, 𝐀, 𝐁) = @turbo for m ∈ indices((𝐀, 𝐂), 1), n ∈ indices((𝐁, 𝐂), 2) 𝐂ₘₙ = zero(eltype(𝐂)) for k ∈ indices((𝐀, 𝐁), (2, 1)) 𝐂ₘₙ += 𝐀[m, k] * 𝐁[k, n] end 𝐂[m, n] = 𝐂ₘₙ end -end function gemmavx!( Cc::AbstractMatrix{Complex{T}}, Ac::AbstractMatrix{Complex{T}}, - Bc::AbstractMatrix{Complex{T}}, + Bc::AbstractMatrix{Complex{T}} ) where {T} A = reinterpret(reshape, T, Ac) B = reinterpret(reshape, T, Bc) @@ -100,19 +101,17 @@ function gemmavx!( C[2, m, n] = Cim end end -function gemmavxt!(𝐂, 𝐀, 𝐁) - @tturbo for m ∈ indices((𝐀, 𝐂), 1), n ∈ indices((𝐁, 𝐂), 2) +gemmavxt!(𝐂, 𝐀, 𝐁) = @tturbo for m ∈ indices((𝐀, 𝐂), 1), n ∈ indices((𝐁, 𝐂), 2) 𝐂ₘₙ = zero(eltype(𝐂)) for k ∈ indices((𝐀, 𝐁), (2, 1)) 𝐂ₘₙ += 𝐀[m, k] * 𝐁[k, n] end 𝐂[m, n] = 𝐂ₘₙ end -end function gemmavxt!( Cc::AbstractMatrix{Complex{T}}, Ac::AbstractMatrix{Complex{T}}, - Bc::AbstractMatrix{Complex{T}}, + Bc::AbstractMatrix{Complex{T}} ) where {T} A = reinterpret(reshape, T, Ac) B = reinterpret(reshape, T, Bc) @@ -204,16 +203,12 @@ function jdot3avx(x, A, y) end s end -function jvexp!(b, a) - @inbounds for i ∈ eachindex(a) +jvexp!(b, a) = @inbounds for i ∈ eachindex(a) b[i] = exp(a[i]) end -end -function jvexpavx!(b, a) - @turbo for i ∈ eachindex(a) +jvexpavx!(b, a) = @turbo for i ∈ eachindex(a) b[i] = exp(a[i]) end -end function jsvexp(a) s = zero(eltype(a)) @inbounds for i ∈ eachindex(a) @@ -246,15 +241,13 @@ function jgemv!(𝐲, 𝐀ᵀ::Adjoint, 𝐱) 𝐲[i] = 𝐲ᵢ end end -function jgemvavx!(𝐲, 𝐀, 𝐱) - @turbo for i ∈ eachindex(𝐲) +jgemvavx!(𝐲, 𝐀, 𝐱) = @turbo for i ∈ eachindex(𝐲) 𝐲ᵢ = zero(eltype(𝐲)) for j ∈ eachindex(𝐱) 𝐲ᵢ += 𝐀[i, j] * 𝐱[j] end 𝐲[i] = 𝐲ᵢ end -end function jvar!(𝐬², 𝐀, x̄) @. s² = zero(eltype(𝐬²)) @inbounds @fastmath for i ∈ 1:size(𝐀, 2) @@ -264,8 +257,7 @@ function jvar!(𝐬², 𝐀, x̄) end end end -function jvaravx!(𝐬², 𝐀, x̄) - @turbo for j ∈ eachindex(𝐬²) +jvaravx!(𝐬², 𝐀, x̄) = @turbo for j ∈ eachindex(𝐬²) 𝐬²ⱼ = zero(eltype(𝐬²)) x̄ⱼ = x̄[j] for i ∈ 1:size(𝐀, 2) @@ -274,7 +266,6 @@ function jvaravx!(𝐬², 𝐀, x̄) end 𝐬²[j] = 𝐬²ⱼ end -end japlucBc!(D, a, B, c) = @. D = a + B * c'; japlucBcavx!(D, a, B, c) = @turbo @. D = a + B * c'; @@ -342,9 +333,6 @@ function jlogdettriangleavx(B::Union{LowerTriangular,UpperTriangular}) ld end - - - function filter2d!(out::AbstractMatrix, A::AbstractMatrix, kern) @inbounds @fastmath for J in CartesianIndices(out) tmp = zero(eltype(out)) @@ -369,7 +357,7 @@ end function filter2dunrolled!( out::AbstractMatrix, A::AbstractMatrix, - kern::SizedOffsetMatrix{T,-1,1,-1,1}, + kern::SizedOffsetMatrix{T,-1,1,-1,1} ) where {T} rng1, rng2 = axes(out) Base.Cartesian.@nexprs 3 jk -> @@ -380,7 +368,7 @@ function filter2dunrolled!( Base.Cartesian.@nexprs 3 jk -> Base.Cartesian.@nexprs 3 ik -> tmp_{ik + (jk - 1) * 3} = Base.FastMath.add_fast( Base.FastMath.mul_fast(A[i+(ik-2), j+(jk-2)], kern_ik_jk), - tmp_{ik + (jk - 1) * 3 - 1}, + tmp_{ik + (jk - 1) * 3 - 1} ) out[i, j] = tmp_9 end @@ -390,7 +378,7 @@ end function filter2dunrolledavx!( out::AbstractMatrix, A::AbstractMatrix, - kern::SizedOffsetMatrix{T,-1,1,-1,1}, + kern::SizedOffsetMatrix{T,-1,1,-1,1} ) where {T} rng1, rng2 = axes(out) Base.Cartesian.@nexprs 3 jk -> @@ -405,7 +393,6 @@ function filter2dunrolledavx!( out end - # function smooth_line!(sl,nrm1,j,i1,rl,ih2,denom) # @fastmath @inbounds @simd ivdep for i=i1:2:nrm1 # sl[i,j]=denom*(rl[i,j]+ih2*(sl[i,j-1]+sl[i-1,j]+sl[i+1,j]+sl[i,j+1])) diff --git a/benchmark/plotbenchmarks.jl b/benchmark/plotbenchmarks.jl index f505b89ed..e9f984505 100644 --- a/benchmark/plotbenchmarks.jl +++ b/benchmark/plotbenchmarks.jl @@ -2,17 +2,22 @@ using PrettyTables function Base.show(io::IO, br::BenchmarkResult) hb = Highlighter( - (br, i, j) -> (j > 1 && maximum(@view(br.results[:, i])) == br.results[j-1, i]), - foreground = :green, + (br, i, j) -> + (j > 1 && maximum(@view(br.results[:, i])) == br.results[j-1, i]); + foreground = :green + ) + pretty_table( + io, + br.sizedresults, + br.tests; + crop = :none, + highlighters = (hb,) ) - pretty_table(io, br.sizedresults, br.tests, crop = :none, highlighters = (hb,)) end - if (Sys.ARCH === :aarch64) && Sys.isapple() nothing else - using Colors, ColorSchemes, Gadfly const COLORS = [RGB(0.0, 0.0, 0.0), RGB(1.0, 0.0, 0.0)] # const COLORS = [RGB(0.0,0.0,0.0),RGB(0.0,1.0,0.0)] @@ -24,11 +29,9 @@ else # const COLOR_MAP = Dict{String,RGB{Float64}}() # const COLOR_MAP = Dict{String,RGB{Colors.N0f8}}() const COLOR_MAP64 = Dict{String,RGB{Float64}}() - function getcolor(s::String) - get!(COLOR_MAP64, s) do + getcolor(s::String) = get!(COLOR_MAP64, s) do COLORS[length(COLOR_MAP64)+1] end - end replace_and(str) = replace(str, '&' => "with") function Gadfly.plot(br::BenchmarkResult) @@ -46,7 +49,7 @@ else maxtick = 10round(Int, 0.1maxres) yt = if iszero(maxtick) maxtick = 10round(0.1maxres) - range(0, maxres, length = 20) + range(0, maxres; length = 20) elseif maxtick < 10 0:1:maxtick elseif maxtick < 20 @@ -60,24 +63,31 @@ else Gadfly.Guide.manual_color_key("Methods", tests, colors), Guide.xlabel("Size"), Guide.ylabel("GFLOPS"), - Guide.xticks(ticks = collect(xt)), - Guide.yticks(ticks = collect(yt)), + Guide.xticks(; ticks = collect(xt)), + Guide.yticks(; ticks = collect(yt)) ) for i ∈ eachindex(tests) - push!(p, layer(x = sizes, y = res[i, :], Geom.line, Theme(default_color = colors[i]))) + push!( + p, + layer(; + x = sizes, + y = res[i, :], + Geom.line, + Theme(; default_color = colors[i]) + ) + ) end addlabel && push!( p, - layer( + layer(; x = fill(maxxtick - 10, length(tests)), y = res[:, maxxind], label = tests, - Geom.label(position = :centered), - ), + Geom.label(; position = :centered) + ) ) p end - end # using VegaLite, IndexedTables # function plot(br::BenchmarkResult) diff --git a/docs/make.jl b/docs/make.jl index 9168c0022..b537d66f5 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -2,7 +2,7 @@ using Documenter, LoopVectorization makedocs(; modules = [LoopVectorization], - format = Documenter.HTML(prettyurls = get(ENV, "CI", nothing) == "true"), + format = Documenter.HTML(; prettyurls = get(ENV, "CI", nothing) == "true"), pages = [ "Home" => "index.md", "Getting Started" => "getting_started.md", @@ -15,7 +15,7 @@ makedocs(; "examples/datetime_arrays.md", "examples/special_functions.md", "examples/sum_of_squared_error.md", - "examples/filtering.md", + "examples/filtering.md" ], "Vectorized Convenience Functions" => "vectorized_convenience_functions.md", "Future Work" => "future_work.md", @@ -26,12 +26,12 @@ makedocs(; "devdocs/constructing_loopsets.md", "devdocs/evaluating_loops.md", "devdocs/lowering.md", - "devdocs/reference.md", - ], + "devdocs/reference.md" + ] ], # repo="https://github.com/JuliaSIMD/LoopVectorization.jl/blob/{commit}{path}#L{line}", sitename = "LoopVectorization.jl", - authors = "Chris Elrod", + authors = "Chris Elrod" # assets=[], ) diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl index 570c0a576..cc103bd7f 100644 --- a/src/LoopVectorization.jl +++ b/src/LoopVectorization.jl @@ -3,7 +3,11 @@ module LoopVectorization using ArrayInterfaceCore: UpTri, LoTri using Static: StaticInt, gt, static, Zero, One, reduce_tup using VectorizationBase, - SLEEFPirates, UnPack, OffsetArrays, ArrayInterfaceOffsetArrays, ArrayInterfaceStaticArrays + SLEEFPirates, + UnPack, + OffsetArrays, + ArrayInterfaceOffsetArrays, + ArrayInterfaceStaticArrays using LayoutPointers: AbstractStridedPointer, StridedPointer, @@ -113,7 +117,6 @@ using HostCPUFeatures: get_cpu_name using CPUSummary: num_cores, cache_linesize, cache_size - using IfElse: ifelse using ThreadingUtilities, PolyesterWeave @@ -138,7 +141,14 @@ using Base.FastMath: pow_fast, sqrt_fast using SLEEFPirates: - log_fast, log2_fast, log10_fast, pow, sin_fast, cos_fast, sincos_fast, tan_fast + log_fast, + log2_fast, + log10_fast, + pow, + sin_fast, + cos_fast, + sincos_fast, + tan_fast using ArrayInterface using ArrayInterface: @@ -190,7 +200,6 @@ export LowDimArray, const VECTORWIDTHSYMBOL, ELTYPESYMBOL, MASKSYMBOL = Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##"), Symbol("##mask##") - include("vectorizationbase_compat/contract_pass.jl") include("vectorizationbase_compat/subsetview.jl") include("getconstindexes.jl") @@ -230,13 +239,13 @@ include("broadcast.jl") LoopVectorization provides macros and functions that combine SIMD vectorization and loop-reordering so as to improve performance: -- [`@turbo`](@ref): transform `for`-loops and broadcasting -- [`vmapreduce`](@ref): vectorized version of `mapreduce` -- [`vreduce`](@ref): vectorized version of `reduce` -- [`vmap`](@ref) and `vmap!`: vectorized version of `map` and `map!` -- [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!` -- [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!` -- [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!` + - [`@turbo`](@ref): transform `for`-loops and broadcasting + - [`vmapreduce`](@ref): vectorized version of `mapreduce` + - [`vreduce`](@ref): vectorized version of `reduce` + - [`vmap`](@ref) and `vmap!`: vectorized version of `map` and `map!` + - [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!` + - [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!` + - [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!` """ LoopVectorization diff --git a/src/broadcast.jl b/src/broadcast.jl index e94ce9e62..28d110112 100644 --- a/src/broadcast.jl +++ b/src/broadcast.jl @@ -19,13 +19,13 @@ struct LowDimArray{D,T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N} end end function LowDimArray{D0}( - data::LowDimArray{D1,T,N,A}, + data::LowDimArray{D1,T,N,A} ) where {D0,T,N,D1,A<:AbstractArray{T,N}} LowDimArray{map(|, D0, D1),T,N,A}(parent(data)) end Base.@propagate_inbounds Base.getindex( A::LowDimArray, - i::Vararg{Union{StaticInt,Integer,CartesianIndex},K}, + i::Vararg{Union{StaticInt,Integer,CartesianIndex},K} ) where {K} = getindex(A.data, i...) @inline Base.size(A::LowDimArray) = Base.size(A.data) @inline Base.size(A::LowDimArray, i) = Base.size(A.data, i) @@ -36,10 +36,14 @@ Base.@propagate_inbounds Base.getindex( l = _pick_lowdim_known(Base.tail(b), Base.tail(x)) (f, l...) end -@inline function ArrayInterface.known_size(::Type{LowDimArray{D,T,N,A}}) where {D,T,N,A} +@inline function ArrayInterface.known_size( + ::Type{LowDimArray{D,T,N,A}} +) where {D,T,N,A} _pick_lowdim_known(D, ArrayInterface.known_size(A)) end -@inline ArrayInterface.parent_type(::Type{LowDimArray{D,T,N,A}}) where {T,D,N,A} = A +@inline ArrayInterface.parent_type( + ::Type{LowDimArray{D,T,N,A}} +) where {T,D,N,A} = A @inline Base.strides(A::LowDimArray) = map(Int, strides(A)) @inline ArrayInterface.device(::LowDimArray) = ArrayInterface.CPUPointer() @generated function ArrayInterface.size(A::LowDimArray{D,T,N}) where {D,T,N} @@ -54,14 +58,20 @@ end Expr(:block, Expr(:meta, :inline), :(s = ArrayInterface.size(parent(A))), t) end Base.parent(A::LowDimArray) = getfield(A, :data) -Base.unsafe_convert(::Type{Ptr{T}}, A::LowDimArray{D,T}) where {D,T} = pointer(parent(A)) -ArrayInterface.contiguous_axis(A::LowDimArray) = ArrayInterface.contiguous_axis(parent(A)) +Base.unsafe_convert(::Type{Ptr{T}}, A::LowDimArray{D,T}) where {D,T} = + pointer(parent(A)) +ArrayInterface.contiguous_axis(A::LowDimArray) = + ArrayInterface.contiguous_axis(parent(A)) ArrayInterface.contiguous_batch_size(A::LowDimArray) = ArrayInterface.contiguous_batch_size(parent(A)) -ArrayInterface.stride_rank(A::LowDimArray) = ArrayInterface.stride_rank(parent(A)) +ArrayInterface.stride_rank(A::LowDimArray) = + ArrayInterface.stride_rank(parent(A)) ArrayInterface.offsets(A::LowDimArray) = ArrayInterface.offsets(parent(A)) -@generated function _lowdimfilter(::Val{D}, tup::Tuple{Vararg{Any,N}}) where {D,N} +@generated function _lowdimfilter( + ::Val{D}, + tup::Tuple{Vararg{Any,N}} +) where {D,N} t = Expr(:tuple) for n ∈ 1:N if n > length(D) || D[n] @@ -75,14 +85,20 @@ struct ForBroadcast{T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N} data::A end @inline Base.parent(fb::ForBroadcast) = getfield(fb, :data) -@inline ArrayInterface.parent_type(::Type{ForBroadcast{T,N,A}}) where {T,N,A} = A -Base.@propagate_inbounds Base.getindex(A::ForBroadcast, i::Vararg{Any,K}) where {K} = - parent(A)[i...] +@inline ArrayInterface.parent_type(::Type{ForBroadcast{T,N,A}}) where {T,N,A} = + A +Base.@propagate_inbounds Base.getindex( + A::ForBroadcast, + i::Vararg{Any,K} +) where {K} = parent(A)[i...] const LowDimArrayForBroadcast{D,T,N,A} = ForBroadcast{T,N,LowDimArray{D,T,N,A}} @inline function VectorizationBase.contiguous_axis( - fb::LowDimArrayForBroadcast{D,T,N,A}, + fb::LowDimArrayForBroadcast{D,T,N,A} ) where {D,T,N,A} - _contiguous_axis(Val{D}(), VectorizationBase.contiguous_axis(parent(parent(fb)))) + _contiguous_axis( + Val{D}(), + VectorizationBase.contiguous_axis(parent(parent(fb))) + ) end @inline forbroadcast(A::AbstractArray) = ForBroadcast(A) @inline forbroadcast(A::AbstractRange) = A @@ -95,11 +111,10 @@ end size(A), strides(B), VectorizationBase.val_stride_rank(B), - VectorizationBase.val_dense_dims(B), + VectorizationBase.val_dense_dims(B) ) end - # @inline function VectorizationBase.contiguous_batch_size(fb::LowDimArrayForBroadcast{D,T,N,A}) where {D,T,N,A} # _contiguous_axis(Val{D}(), VectorizationBase.contiguous_batch_size(parent(parent(fb)))) # end @@ -116,30 +131,44 @@ end Expr(:block, Expr(:meta, :inline), staticexpr(Cnew)) end function ArrayInterface.contiguous_axis( - ::Type{LowDimArrayForBroadcast{D,T,N,A}}, + ::Type{LowDimArrayForBroadcast{D,T,N,A}} ) where {D,T,N,A} ArrayInterface.contiguous_axis(A) end @inline function ArrayInterface.stride_rank( - ::Type{LowDimArrayForBroadcast{D,T,N,A}}, + ::Type{LowDimArrayForBroadcast{D,T,N,A}} ) where {D,T,N,A} _lowdimfilter(Val(D), ArrayInterface.stride_rank(A)) end @inline function ArrayInterface.dense_dims( - ::Type{LowDimArrayForBroadcast{D,T,N,A}}, + ::Type{LowDimArrayForBroadcast{D,T,N,A}} ) where {D,T,N,A} _lowdimfilter(Val(D), ArrayInterface.dense_dims(A)) end -@inline function ArrayInterface.strides(fb::LowDimArrayForBroadcast{D}) where {D} +@inline function ArrayInterface.strides( + fb::LowDimArrayForBroadcast{D} +) where {D} _lowdimfilter(Val(D), strides(parent(fb))) end -@inline function ArrayInterface.offsets(fb::LowDimArrayForBroadcast{D}) where {D} +@inline function ArrayInterface.offsets( + fb::LowDimArrayForBroadcast{D} +) where {D} _lowdimfilter(Val(D), ArrayInterface.offsets(parent(parent(fb)))) end -@inline function ArrayInterface.StrideIndex(a::A) where {A<:LowDimArrayForBroadcast} - _stride_index(ArrayInterface.stride_rank(A), ArrayInterface.contiguous_axis(A), a) +@inline function ArrayInterface.StrideIndex( + a::A +) where {A<:LowDimArrayForBroadcast} + _stride_index( + ArrayInterface.stride_rank(A), + ArrayInterface.contiguous_axis(A), + a + ) end -@inline function _stride_index(r::Tuple{Vararg{StaticInt,N}}, ::StaticInt{C}, A) where {N,C} +@inline function _stride_index( + r::Tuple{Vararg{StaticInt,N}}, + ::StaticInt{C}, + A +) where {N,C} StrideIndex{N,ArrayInterface.known(r),C}(A) end @@ -147,7 +176,7 @@ for f ∈ [ # groupedstridedpointer support :(ArrayInterface.contiguous_axis), :(ArrayInterface.contiguous_batch_size), :(ArrayInterface.device), - :(ArrayInterface.stride_rank), + :(ArrayInterface.stride_rank) ] @eval @inline $f(::Type{ForBroadcast{T,N,A}}) where {T,N,A} = $f(A) end @@ -159,7 +188,7 @@ for f ∈ [ # groupedstridedpointer support :(ArrayInterface.stride_rank), :(VectorizationBase.val_dense_dims), :(ArrayInterface.offsets), - :(Base.size),#, :(ArrayInterface.strides) + :(Base.size)#, :(ArrayInterface.strides) ] @eval @inline $f(fb::ForBroadcast) = $f(getfield(fb, :data)) end @@ -172,7 +201,12 @@ function is_column_major(x) end is_row_major(x) = is_column_major(reverse(x)) # @inline _bytestrides(s,paren) = VectorizationBase.bytestrides(paren) -function _strides_expr(@nospecialize(s), @nospecialize(x), R::Vector{Int}, D::Vector{Bool}) +function _strides_expr( + @nospecialize(s), + @nospecialize(x), + R::Vector{Int}, + D::Vector{Bool} +) N = length(R) q = Expr(:block, Expr(:meta, :inline)) strd_tup = Expr(:tuple) @@ -214,7 +248,7 @@ function _strides_expr(@nospecialize(s), @nospecialize(x), R::Vector{Int}, D::Ve else push!( strd_tup.args, - :($ifel(isone($getfield(s, $n)), zero($xₙ_type), $getfield(x, $n))), + :($ifel(isone($getfield(s, $n)), zero($xₙ_type), $getfield(x, $n))) ) end end @@ -242,7 +276,7 @@ end s::Tuple{Vararg{Union{StaticInt,Integer},N}}, x::Tuple{Vararg{Union{StaticInt,Integer},N}}, ::Val{R}, - ::Val{D}, + ::Val{D} ) where {N,R,D} Rv = Vector{Int}(undef, N) Dv = Vector{Bool}(undef, N) @@ -272,16 +306,20 @@ end @inline Base.ndims(::Type{Product{A,B}}) where {A,B} = numdims(B) # This numdims nonsense is a hack to avoid type piracy in defining: @inline numdims( - ::Type{B}, -) where {N,S<:Base.Broadcast.AbstractArrayStyle{N},B<:Base.Broadcast.Broadcasted{S}} = N + ::Type{B} +) where { + N, + S<:Base.Broadcast.AbstractArrayStyle{N}, + B<:Base.Broadcast.Broadcasted{S} +} = N Base.Broadcast._broadcast_getindex_eltype( - ::Product{A,B}, + ::Product{A,B} ) where {T,A<:AbstractVecOrMat{T},B<:AbstractVecOrMat{T}} = T function Base.Broadcast._broadcast_getindex_eltype(p::Product) promote_type( Base.Broadcast._broadcast_getindex_eltype(p.a), - Base.Broadcast._broadcast_getindex_eltype(p.b), + Base.Broadcast._broadcast_getindex_eltype(p.b) ) end @@ -316,7 +354,7 @@ function add_broadcast!( loopsyms::Vector{Symbol}, @nospecialize(prod::Type{<:Product}), dontbc, - elementbytes::Int, + elementbytes::Int ) A, B = prod.parameters Krange = gensym!(ls, "K") @@ -325,7 +363,10 @@ function add_broadcast!( mB = gensym!(ls, "Bₖₙ") pushprepreamble!(ls, Expr(:(=), mA, Expr(:(.), bcname, QuoteNode(:a)))) pushprepreamble!(ls, Expr(:(=), mB, Expr(:(.), bcname, QuoteNode(:b)))) - pushprepreamble!(ls, Expr(:(=), Klen, Expr(:call, getfield, Expr(:call, :size, mB), 1))) + pushprepreamble!( + ls, + Expr(:(=), Klen, Expr(:call, getfield, Expr(:call, :size, mB), 1)) + ) pushpreamble!(ls, Expr(:(=), Krange, Expr(:call, :(:), staticexpr(1), Klen))) k = gensym!(ls, "k") add_loop!(ls, Loop(k, 1, Klen, 1, Krange, Klen), k) @@ -346,18 +387,46 @@ function add_broadcast!( end # load A # loadA = add_load!(ls, gensym!(ls, :A), productref(A, mA, m, k), elementbytes) - loadA = add_broadcast!(ls, gensym!(ls, "A"), mA, Symbol[m, k], A, dontbc[1], elementbytes) + loadA = add_broadcast!( + ls, + gensym!(ls, "A"), + mA, + Symbol[m, k], + A, + dontbc[1], + elementbytes + ) # load B - loadB = add_broadcast!(ls, gensym!(ls, "B"), mB, bloopsyms, B, dontbc[2], elementbytes) + loadB = add_broadcast!( + ls, + gensym!(ls, "B"), + mB, + bloopsyms, + B, + dontbc[2], + elementbytes + ) # set Cₘₙ = 0 # setC = add_constant!(ls, zero(promote_type(recursive_eltype(A), recursive_eltype(B))), cloopsyms, mC, elementbytes) # targetC will be used for reduce_to_add mCt = gensym!(ls, mC) - targetC = - add_constant!(ls, gensym!(ls, "zero"), cloopsyms, mCt, elementbytes, :numericconstant) + targetC = add_constant!( + ls, + gensym!(ls, "zero"), + cloopsyms, + mCt, + elementbytes, + :numericconstant + ) push!(ls.preamble_zeros, (identifier(targetC), IntOrFloat)) - setC = - add_constant!(ls, gensym!(ls, "zero"), cloopsyms, mC, elementbytes, :numericconstant) + setC = add_constant!( + ls, + gensym!(ls, "zero"), + cloopsyms, + mC, + elementbytes, + :numericconstant + ) push!(ls.preamble_zeros, (identifier(setC), IntOrFloat)) setC.reduced_children = kvec # compute Cₘₙ += Aₘₖ * Bₖₙ @@ -370,7 +439,7 @@ function add_broadcast!( compute, reductdeps, kvec, - Operation[loadA, loadB, setC], + Operation[loadA, loadB, setC] ) reductop = pushop!(ls, reductop, mC) reductfinal = Operation( @@ -381,12 +450,17 @@ function add_broadcast!( compute, cloopsyms, kvec, - Operation[reductop, targetC], + Operation[reductop, targetC] ) pushop!(ls, reductfinal, mCt) end -function extract_all_1_array!(ls::LoopSet, bcname::Symbol, N::Int, elementbytes::Int) +function extract_all_1_array!( + ls::LoopSet, + bcname::Symbol, + N::Int, + elementbytes::Int +) refextract = gensym!(ls, bcname) ref = Expr(:ref, bcname) for _ ∈ 1:N @@ -408,7 +482,7 @@ function add_broadcast!( loopsyms::Vector{Symbol}, @nospecialize(_::Type{<:AbstractArray{T,N}}), @nospecialize(dontbc::NTuple{N,Bool}), - elementbytes::Int, + elementbytes::Int ) where {T,N} any(dontbc) || return extract_all_1_array!(ls, bcname, N, elementbytes) bcname2 = gensym!(ls, bcname) @@ -437,7 +511,7 @@ function add_broadcast!( loopsyms::Vector{Symbol}, @nospecialize(_::Type{T}), @nospecialize(__), - elementbytes::Int, + elementbytes::Int ) where {T<:Number} add_constant!(ls, bcname, elementbytes) # or replace elementbytes with sizeof(T) ? u end @@ -448,13 +522,14 @@ function add_broadcast!( loopsyms::Vector{Symbol}, @nospecialize(_::Type{Base.RefValue{T}}), @nospecialize(__), - elementbytes::Int, + elementbytes::Int ) where {T} refextract = gensym!(ls, bcname) pushprepreamble!(ls, Expr(:(=), refextract, Expr(:ref, bcname))) add_constant!(ls, refextract, elementbytes) # or replace elementbytes with sizeof(T) ? u end -const BroadcastedArray{S<:Broadcast.AbstractArrayStyle,F,A} = Broadcasted{S,Nothing,F,A} +const BroadcastedArray{S<:Broadcast.AbstractArrayStyle,F,A} = + Broadcasted{S,Nothing,F,A} function add_broadcast!( ls::LoopSet, destname::Symbol, @@ -462,7 +537,7 @@ function add_broadcast!( loopsyms::Vector{Symbol}, @nospecialize(B::Type{<:BroadcastedArray}), @nospecialize(dontbc), - elementbytes::Int, + elementbytes::Int ) S, _, F, A = B.parameters instr = get(FUNCTIONSYMBOLS, F) do @@ -488,7 +563,7 @@ function add_broadcast!( loopsyms, arg, dontbc[i], - elementbytes, + elementbytes )::Operation push!(parents, parent) mergesetdiffv!(deps, loopdependencies(parent), reduceddependencies(parent)) @@ -501,12 +576,16 @@ function add_broadcast!( compute, deps, NODEPENDENCY, - parents, + parents ) pushop!(ls, op, destname) end -function add_broadcast_loops!(ls::LoopSet, loopsyms::Vector{Symbol}, destsym::Symbol) +function add_broadcast_loops!( + ls::LoopSet, + loopsyms::Vector{Symbol}, + destsym::Symbol +) axes_tuple = Expr(:tuple) pushpreamble!(ls, Expr(:(=), axes_tuple, Expr(:call, :axes, destsym))) for itersym ∈ loopsyms @@ -516,11 +595,21 @@ function add_broadcast_loops!(ls::LoopSet, loopsyms::Vector{Symbol}, destsym::Sy Nlen = gensym!(ls, "N") add_loop!(ls, Loop(itersym, Nlower, Nupper, 1, Nrange, Nlen), itersym) push!(axes_tuple.args, Nrange) - pushpreamble!(ls, Expr(:(=), Nlower, Expr(:call, lv(:maybestaticfirst), Nrange))) - pushpreamble!(ls, Expr(:(=), Nupper, Expr(:call, lv(:maybestaticlast), Nrange))) pushpreamble!( ls, - Expr(:(=), Nlen, Expr(:call, GlobalRef(ArrayInterface, :static_length), Nrange)), + Expr(:(=), Nlower, Expr(:call, lv(:maybestaticfirst), Nrange)) + ) + pushpreamble!( + ls, + Expr(:(=), Nupper, Expr(:call, lv(:maybestaticlast), Nrange)) + ) + pushpreamble!( + ls, + Expr( + :(=), + Nlen, + Expr(:call, GlobalRef(ArrayInterface, :static_length), Nrange) + ) ) end end @@ -532,24 +621,30 @@ function vmaterialize_fun( Mod, UNROLL, dontbc, - transpose::Bool, + transpose::Bool ) where {BC} # 2 + 1 # we have an N dimensional loop. # need to construct the LoopSet ls = LoopSet(Mod) - inline, u₁, u₂, v, isbroadcast, _, rs, rc, cls, threads, warncheckarg, safe = UNROLL + inline, u₁, u₂, v, isbroadcast, _, rs, rc, cls, threads, warncheckarg, safe = + UNROLL set_hw!(ls, rs, rc, cls) ls.isbroadcast = isbroadcast # maybe set `false` in a DiffEq-like `@..` macro loopsyms = [gensym!(ls, "n") for _ ∈ 1:N] - transpose && pushprepreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′))) + transpose && + pushprepreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′))) ret = transpose ? :dest′ : :dest add_broadcast_loops!(ls, loopsyms, ret) elementbytes = sizeofT add_broadcast!(ls, :destination, :bc, loopsyms, BC, dontbc, elementbytes) transpose && reverse!(loopsyms) - storeop = - add_simple_store!(ls, :destination, ArrayReference(:dest, loopsyms), elementbytes) + storeop = add_simple_store!( + ls, + :destination, + ArrayReference(:dest, loopsyms), + elementbytes + ) doaddref!(ls, storeop) resize!(ls.loop_order, num_loops(ls)) # num_loops may be greater than N, eg Product # return ls @@ -564,7 +659,7 @@ function vmaterialize_fun( v, threads % Int, warncheckarg, - safe, + safe ) Expr(:block, Expr(:meta, :inline), sc, ret) end @@ -576,7 +671,7 @@ end bc::BC, ::Val{Mod}, ::Val{UNROLL}, - ::Val{dontbc}, + ::Val{dontbc} ) where {T<:NativeTypes,N,BC<:Union{Broadcasted,Product},Mod,UNROLL,dontbc} vmaterialize_fun(sizeof(T), N, BC, Mod, UNROLL, dontbc, false) end @@ -585,7 +680,7 @@ end bc::BC, ::Val{Mod}, ::Val{UNROLL}, - ::Val{dontbc}, + ::Val{dontbc} ) where { T<:NativeTypes, N, @@ -593,25 +688,31 @@ end BC<:Union{Broadcasted,Product}, Mod, UNROLL, - dontbc, + dontbc } vmaterialize_fun(sizeof(T), N, BC, Mod, UNROLL, dontbc, true) end # these are marked `@inline` so the `@turbo` itself can choose whether or not to inline. @generated function vmaterialize!( dest::AbstractArray{T,N}, - bc::Broadcasted{Base.Broadcast.DefaultArrayStyle{0},Nothing,typeof(identity),Tuple{T2}}, + bc::Broadcasted{ + Base.Broadcast.DefaultArrayStyle{0}, + Nothing, + typeof(identity), + Tuple{T2} + }, ::Val{Mod}, ::Val{UNROLL}, - ::Val{dontbc}, + ::Val{dontbc} ) where {T<:NativeTypes,N,T2<:Number,Mod,UNROLL,dontbc} - inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, threads, warncheckarg, safe = UNROLL + inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, threads, warncheckarg, safe = + UNROLL quote $(Expr(:meta, :inline)) arg = T(first(bc.args)) @turbo inline = $inline unroll = ($u₁, $u₂) thread = $threads vectorize = $v for i ∈ eachindex( - dest, + dest ) dest[i] = arg end @@ -620,37 +721,52 @@ end end @generated function vmaterialize!( dest′::Union{Adjoint{T,A},Transpose{T,A}}, - bc::Broadcasted{Base.Broadcast.DefaultArrayStyle{0},Nothing,typeof(identity),Tuple{T2}}, + bc::Broadcasted{ + Base.Broadcast.DefaultArrayStyle{0}, + Nothing, + typeof(identity), + Tuple{T2} + }, ::Val{Mod}, ::Val{UNROLL}, - ::Val{dontbc}, + ::Val{dontbc} ) where {T<:NativeTypes,N,A<:AbstractArray{T,N},T2<:Number,Mod,UNROLL,dontbc} - inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, threads, warncheckarg, safe = UNROLL + inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, threads, warncheckarg, safe = + UNROLL quote $(Expr(:meta, :inline)) arg = T(first(bc.args)) dest = parent(dest′) @turbo inline = $inline unroll = ($u₁, $u₂) thread = $threads vectorize = $v for i ∈ eachindex( - dest, + dest ) dest[i] = arg end dest′ end end -@inline function vmaterialize!(dest, bc, ::Val{Mod}, ::Val{UNROLL}) where {Mod,UNROLL} +@inline function vmaterialize!( + dest, + bc, + ::Val{Mod}, + ::Val{UNROLL} +) where {Mod,UNROLL} vmaterialize!(dest, bc, Val{Mod}(), Val{UNROLL}(), Val(_dontbc(bc))) end @inline function vmaterialize( bc::Broadcasted{Base.Broadcast.DefaultArrayStyle{0}}, ::Val{Mod}, - ::Val{UNROLL}, + ::Val{UNROLL} ) where {Mod,UNROLL} Base.materialize(bc) end -@inline function vmaterialize(bc::Broadcasted, ::Val{Mod}, ::Val{UNROLL}) where {Mod,UNROLL} +@inline function vmaterialize( + bc::Broadcasted, + ::Val{Mod}, + ::Val{UNROLL} +) where {Mod,UNROLL} ElType = Base.Broadcast.combine_eltypes(bc.f, bc.args) dest = similar(bc, ElType) vmaterialize!(dest, bc, Val{Mod}(), Val{UNROLL}(), Val(_dontbc(bc))) diff --git a/src/codegen/line_number_nodes.jl b/src/codegen/line_number_nodes.jl index 81d9533f8..9d9485546 100644 --- a/src/codegen/line_number_nodes.jl +++ b/src/codegen/line_number_nodes.jl @@ -3,7 +3,10 @@ function extract_all_lnns(x) return extract_all_lnns!(lnns, x) end -function extract_all_lnns!(lnns::AbstractVector{<:LineNumberNode}, lnn::LineNumberNode) +function extract_all_lnns!( + lnns::AbstractVector{<:LineNumberNode}, + lnn::LineNumberNode +) push!(lnns, lnn) return lnns end @@ -13,14 +16,16 @@ function extract_all_lnns!(lnns::AbstractVector{<:LineNumberNode}, ex::Expr) end return lnns end -function extract_all_lnns!(lnns::AbstractVector{<:LineNumberNode}, ::Any) - return lnns -end +extract_all_lnns!(lnns::AbstractVector{<:LineNumberNode}, ::Any) = lnns function prepend_lnns!(ex::Expr, lnns::AbstractVector{<:LineNumberNode}) return prepend_lnns!(ex, lnns, Val(ex.head)) end -function prepend_lnns!(ex::Expr, lnns::AbstractVector{<:LineNumberNode}, ::Val{:block}) +function prepend_lnns!( + ex::Expr, + lnns::AbstractVector{<:LineNumberNode}, + ::Val{:block} +) for lnn in lnns pushfirst!(ex.args, Expr(:block, lnn, :(nothing))) end diff --git a/src/codegen/loopstartstopmanager.jl b/src/codegen/loopstartstopmanager.jl index 7b36d035a..c41de4d4b 100644 --- a/src/codegen/loopstartstopmanager.jl +++ b/src/codegen/loopstartstopmanager.jl @@ -22,7 +22,10 @@ function uniquearrayrefs_csesummary(ls::LoopSet) if unique # matching name, no matching ref push!(uniquerefs, arrayref) push!(namev, length(uniquerefs)) - push!(unique_to_name_and_op_map, Tuple{Int,Int,Int}[(j, length(namev), i)]) + push!( + unique_to_name_and_op_map, + Tuple{Int,Int,Int}[(j, length(namev), i)] + ) unique = false end break @@ -32,7 +35,7 @@ function uniquearrayrefs_csesummary(ls::LoopSet) push!(name_to_array_map, Int[length(uniquerefs)]) push!( unique_to_name_and_op_map, - Tuple{Int,Int,Int}[(length(name_to_array_map), 1, i)], + Tuple{Int,Int,Int}[(length(name_to_array_map), 1, i)] ) end end @@ -65,12 +68,15 @@ function uniquearrayrefs(ls::LoopSet) end otherindexunrolled(loopsym::Symbol, ind::Symbol, loopdeps::Vector{Symbol}) = - ((loopsym ≢ ind) & (loopsym ≢ Symbol("##undefined##"))) && (loopsym ∈ loopdeps) + ((loopsym ≢ ind) & (loopsym ≢ Symbol("##undefined##"))) && + (loopsym ∈ loopdeps) function otherindexunrolled(ls::LoopSet, ind::Symbol, ref::ArrayReferenceMeta) us = ls.unrollspecification @unpack u₁loopnum, u₂loopnum, u₁, u₂ = us u₁sym = u₁ > 1 ? names(ls)[u₁loopnum] : Symbol("##undefined##") - u₂sym = ((u₂ > 1) & (u₂loopnum > 0)) ? names(ls)[u₂loopnum] : Symbol("##undefined##") + u₂sym = + ((u₂ > 1) & (u₂loopnum > 0)) ? names(ls)[u₂loopnum] : + Symbol("##undefined##") # u₁sym = names(ls)[u₁loopnum] # u₂sym = ((u₂loopnum > 0)) ? names(ls)[u₂loopnum] : Symbol("##undefined##") otherindexunrolled(u₁sym, ind, loopdependencies(ref)) || @@ -87,7 +93,10 @@ function multiple_with_name(n::Symbol, v::Vector{ArrayReferenceMeta}) end # multiple_with_name(n::Symbol, v::Vector{ArrayReferenceMeta}) = sum(ref -> n === vptr(ref), v) > 1 # TODO: DRY between indices_calculated_by_pointer_offsets and use_loop_induct_var -function indices_calculated_by_pointer_offsets(ls::LoopSet, ar::ArrayReferenceMeta) +function indices_calculated_by_pointer_offsets( + ls::LoopSet, + ar::ArrayReferenceMeta +) indices = getindices(ar) ls.isbroadcast && return fill(false, length(indices)) looporder = names(ls) @@ -156,7 +165,7 @@ function set_ref_loopedindex_and_ind!( i::Int, ii::Int, li::Bool, - ind::Symbol, + ind::Symbol ) ref.loopedindex[i] = li getindices(ref)[ii] = ind @@ -168,12 +177,18 @@ function set_all_to_constant_index!( indop::Operation, allarrayrefs::Vector{ArrayReferenceMeta}, array_refs_with_same_name::Vector{Int}, - arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}, + arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}} ) ops = operations(ls) for j ∈ array_refs_with_same_name arrayref_to_name_op = arrayref_to_name_op_collection[j] - set_ref_loopedindex_and_ind!(allarrayrefs[j], i, ii, true, CONSTANTZEROINDEX) + set_ref_loopedindex_and_ind!( + allarrayrefs[j], + i, + ii, + true, + CONSTANTZEROINDEX + ) for (_, __, opid) ∈ arrayref_to_name_op op = ops[opid] set_ref_loopedindex_and_ind!(op.ref, i, ii, true, CONSTANTZEROINDEX) @@ -188,7 +203,8 @@ function set_all_to_constant_index!( end end end -maybeloopvaluename(op::Operation) = isloopvalue(op) ? instruction(op).instr : name(op) +maybeloopvaluename(op::Operation) = + isloopvalue(op) ? instruction(op).instr : name(op) function substitute_ops_all!( ls::LoopSet, i::Int, @@ -197,7 +213,7 @@ function substitute_ops_all!( new_parent::Operation, allarrayrefs::Vector{ArrayReferenceMeta}, array_refs_with_same_name::Vector{Int}, - arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}, + arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}} ) newindsym = maybeloopvaluename(new_parent) isloopval = isloopvalue(new_parent) @@ -303,7 +319,7 @@ end function isloopvalue( ls::LoopSet, ind::Symbol, - isrooted::Union{Nothing,Vector{Bool}} = nothing, + isrooted::Union{Nothing,Vector{Bool}} = nothing ) for (i, op) ∈ enumerate(operations(ls)) if (isrooted ≢ nothing) @@ -324,12 +340,13 @@ function cse_constant_offsets!( allarrayrefs::Vector{ArrayReferenceMeta}, allarrayrefsind::Int, name_to_array_map::Vector{Vector{Int}}, - arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}, + arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}} ) ar = allarrayrefs[allarrayrefsind] # vptrar = vptr(ar) arrayref_to_name_op = arrayref_to_name_op_collection[allarrayrefsind] - array_refs_with_same_name = name_to_array_map[first(first(arrayref_to_name_op))] + array_refs_with_same_name = + name_to_array_map[first(first(arrayref_to_name_op))] li = ar.loopedindex indices = getindices(ar) offset = first(indices) === DISCONTIGUOUS @@ -350,7 +367,9 @@ function cse_constant_offsets!( ref = allarrayrefs[j] refinds = getindices(ref) # refinds === indices && continue # fast check, should be covered by `j == position_in_array_refs_with_same_name` - if !((refinds[ii] === ind) & (getstrides(ar)[i] == getstrides(ref)[i])) + if !( + (refinds[ii] === ind) & (getstrides(ar)[i] == getstrides(ref)[i]) + ) # For now, we'll only bother with `licm` if all share the same indices # This is so that we can apply the same `licm` to each and keep the same array name. # Otherwise, we'll rely on LLVM to optimize indexing. @@ -415,7 +434,7 @@ function cse_constant_offsets!( indop, allarrayrefs, array_refs_with_same_name, - arrayref_to_name_op_collection, + arrayref_to_name_op_collection ) else # new_parent is a new parent to replace `indop` ind = maybeloopvaluename(new_parent) @@ -427,7 +446,7 @@ function cse_constant_offsets!( new_parent, allarrayrefs, array_refs_with_same_name, - arrayref_to_name_op_collection, + arrayref_to_name_op_collection ) end end @@ -445,7 +464,7 @@ function cse_constant_offsets!( :call, GlobalRef(Base, :(-)), Expr(:call, GlobalRef(Base, :(+)), gespsymbol, name(op1)), - name(op2), + name(op2) ) end gespsymbol = gensym!(ls, "#gespsym#") @@ -458,7 +477,7 @@ function cse_constant_offsets!( indop, allarrayrefs, array_refs_with_same_name, - arrayref_to_name_op_collection, + arrayref_to_name_op_collection ) else# op1const, op2dynamic # won't bother with this for now @@ -481,7 +500,7 @@ function cse_constant_offsets!( op1, allarrayrefs, array_refs_with_same_name, - arrayref_to_name_op_collection, + arrayref_to_name_op_collection ) else licmoffset = false @@ -516,12 +535,11 @@ end # return nothing # end - function adjust_offsets!( ls::LoopSet, i::Int, array_refs_with_same_name::Vector{Int}, - arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}, + arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}} ) ops = operations(ls) if length(ops) ≤ 256 @@ -532,7 +550,7 @@ function adjust_offsets!( i, poffsets, array_refs_with_same_name, - arrayref_to_name_op_collection, + arrayref_to_name_op_collection ) else offsetsv = similar(ops, Int8) @@ -542,7 +560,7 @@ function adjust_offsets!( i, poffsets, array_refs_with_same_name, - arrayref_to_name_op_collection, + arrayref_to_name_op_collection ) end end @@ -551,7 +569,7 @@ function adjust_offsets!( i::Int, poffsets::Ptr{Int8}, array_refs_with_same_name::Vector{Int}, - arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}, + arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}} ) ops = operations(ls) minoffset = typemax(Int8) @@ -595,7 +613,7 @@ function calcgespinds( gespindsummary::Vector{Symbol}, shouldindbyind::Vector{Bool}, array_refs_with_same_name::Vector{Int}, - arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}, + arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}} ) gespinds = Expr(:tuple) li = ar.loopedindex @@ -614,8 +632,12 @@ function calcgespinds( # end # end # constoffset ≠ 0 && - constoffset = - adjust_offsets!(ls, i, array_refs_with_same_name, arrayref_to_name_op_collection) + constoffset = adjust_offsets!( + ls, + i, + array_refs_with_same_name, + arrayref_to_name_op_collection + ) index_by_index = isli ? check_shouldindbyind(ls, ind, shouldindbyind) : true # (stridesunequal & isli) && (@assert isknown(first(getloop(ls, ind)))) @@ -642,7 +664,7 @@ function calcgespinds( ind, isli, index_by_index, - true, + true ) end gespinds @@ -657,7 +679,7 @@ function pushgespind!( ind::Symbol, isli::Bool, index_by_index::Bool, - fromgsp::Bool, + fromgsp::Bool ) if isli if ind === CONSTANTZEROINDEX @@ -668,7 +690,12 @@ function pushgespind!( else push!( gespinds.args, - Expr(:call, GlobalRef(Base, :(+)), gespsymbol, staticexpr(constoffset)), + Expr( + :call, + GlobalRef(Base, :(+)), + gespsymbol, + staticexpr(constoffset) + ) ) end else @@ -701,7 +728,10 @@ function pushgespind!( if gespsymbol === Symbol("") if isknown(first(loop)) # @show constoffset, gethint(first(loop)) - push!(gespinds.args, staticexpr(constoffset + stride * gethint(first(loop)))) + push!( + gespinds.args, + staticexpr(constoffset + stride * gethint(first(loop))) + ) elseif constoffset == 0 if stride == 1 push!(gespinds.args, getsym(first(loop))) @@ -711,7 +741,10 @@ function pushgespind!( elseif stride == 1 push!(gespinds.args, addexpr(getsym(first(loop)), constoffset)) else - push!(gespinds.args, addexpr(mulexpr(getsym(first(loop)), stride), constoffset)) + push!( + gespinds.args, + addexpr(mulexpr(getsym(first(loop)), stride), constoffset) + ) end elseif isknown(first(loop)) loopfirst = gethint(first(loop)) * stride + constoffset @@ -720,7 +753,12 @@ function pushgespind!( else push!( gespinds.args, - Expr(:call, GlobalRef(Base, :(+)), gespsymbol, staticexpr(loopfirst)), + Expr( + :call, + GlobalRef(Base, :(+)), + gespsymbol, + staticexpr(loopfirst) + ) ) end else @@ -731,7 +769,7 @@ function pushgespind!( :call, GlobalRef(Base, :(+)), mulexpr(stride, gespsymbol), - getsym(first(loop)), + getsym(first(loop)) ) end if constoffset == 0 @@ -739,7 +777,12 @@ function pushgespind!( else push!( gespinds.args, - Expr(:call, GlobalRef(Base, :(+)), addedstarts, staticexpr(constoffset)), + Expr( + :call, + GlobalRef(Base, :(+)), + addedstarts, + staticexpr(constoffset) + ) ) end end @@ -758,8 +801,8 @@ function pushgespind!( else isconstantop(op) || throw( LoopError( - "Please file an issue with LoopVectorization.jl with a reproducer; tried to eliminate a non-constant operation.", - ), + "Please file an issue with LoopVectorization.jl with a reproducer; tried to eliminate a non-constant operation." + ) ) rangesym = name(op) end @@ -791,7 +834,7 @@ function pushsimdims!( gespinds::Expr, rangesym::Symbol, gespsymbol::Symbol, - constoffset::Int, + constoffset::Int ) simdimscall = Expr(:call, lv(:similardims), rangesym) pushgespsym!(simdimscall, gespsymbol, constoffset) @@ -811,7 +854,7 @@ function use_loop_induct_var!( q::Expr, ar::ArrayReferenceMeta, allarrayrefs::Vector{ArrayReferenceMeta}, - includeinlet::Bool, + includeinlet::Bool # array_refs_with_same_name::Vector{Int}, arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}} )::Vector{Int} us = ls.unrollspecification @@ -846,28 +889,33 @@ function use_loop_induct_var!( if !li[i] # if it wasn't set uliv[i] = 0 push!(offsetprecalc_descript.args, 0) - Wisz || pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, true, false) + Wisz || + pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, true, false) elseif ind === CONSTANTZEROINDEX uliv[i] = 0 push!(offsetprecalc_descript.args, 0) - Wisz || pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, true, false) + Wisz || + pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, true, false) elseif isbroadcast || ( (isone(ii) && (last(looporder) === ind)) && - !(otherindexunrolled(ls, ind, ar)) || multiple_with_name(vptrar, allarrayrefs) + !(otherindexunrolled(ls, ind, ar)) || + multiple_with_name(vptrar, allarrayrefs) ) || (iszero(ls.vector_width) && isstaticloop(getloop(ls, ind))) || (strds[i] ≤ 0) # Not doing normal offset indexing uliv[i] = -findfirst(Base.Fix2(===, ind), looporder)::Int push!(offsetprecalc_descript.args, 0) # not doing offset indexing, so push 0 - Wisz || pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, true, false) + Wisz || + pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, true, false) else uliv[i] = findfirst(Base.Fix2(===, ind), looporder)::Int # loop = getloop(ls, ind) push!(offsetprecalc_descript.args, max(5, us.u₁ + 1, us.u₂ + 1)) use_offsetprecalc = true - Wisz || pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, false, false) + Wisz || + pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, false, false) end # cases for pushgespind! and loopval! # if !isloopval, same as before @@ -881,7 +929,7 @@ function use_loop_induct_var!( :call, lv(:offsetprecalc), vpgesped, - Expr(:call, Expr(:curly, :Val, offsetprecalc_descript)), + Expr(:call, Expr(:curly, :Val, offsetprecalc_descript)) ) end push!(q.args, Expr(:(=), vptrar, vpgesped)) @@ -890,8 +938,8 @@ function use_loop_induct_var!( Expr( :(=), vptr_offset(vptrar), - Expr(:call, GlobalRef(VectorizationBase, :increment_ptr), vptrar), - ), + Expr(:call, GlobalRef(VectorizationBase, :increment_ptr), vptrar) + ) ) end uliv @@ -912,7 +960,8 @@ function add_loop_start_stop_manager!(ls::LoopSet) use_livs = Vector{Vector{Int}}(undef, length(arrayrefs)) # for i ∈ eachindex(name_to_array_map) for i ∈ eachindex(arrayrefs) - use_livs[i] = use_loop_induct_var!(ls, q, arrayrefs[i], arrayrefs, includeinlet[i]) + use_livs[i] = + use_loop_induct_var!(ls, q, arrayrefs[i], arrayrefs, includeinlet[i]) #name_to_array_map[first(first(unique_to_name_and_op_map[i]))], unique_to_name_and_op_map) end # loops, sorted from outer-most to inner-most @@ -958,7 +1007,7 @@ function pointermax( ar::ArrayReferenceMeta, n::Int, sub::Int, - isvectorized::Bool, + isvectorized::Bool )::Expr pointermax(ls, ar, n, sub, isvectorized, getloop(ls, names(ls)[n])) end @@ -968,7 +1017,7 @@ function pointermax( n::Int, sub::Int, isvectorized::Bool, - loop::Loop, + loop::Loop )::Expr start = first(loop) stop = last(loop) @@ -981,7 +1030,7 @@ function pointermax( sub, isvectorized, 1 + gethint(stop) - gethint(start), - incr, + incr ) end looplensym = isone(start) ? getsym(stop) : loop.lensym @@ -994,7 +1043,7 @@ function pointermax_index( sub::Int, isvectorized::Bool, stophint::Int, - incr::MaybeKnown, + incr::MaybeKnown )::Tuple{Expr,Int} # @unpack u₁loopnum, u₂loopnum, vloopnum, u₁, u₂ = us loopsym = names(ls)[n] @@ -1010,7 +1059,12 @@ function pointermax_index( if isone(sub) Expr(:call, lv(:vsub_nsw), staticexpr(stophint), VECTORWIDTHSYMBOL) else - Expr(:call, lv(:vsub_nsw), staticexpr(stophint), mulexpr(VECTORWIDTHSYMBOL, sub)) + Expr( + :call, + lv(:vsub_nsw), + staticexpr(stophint), + mulexpr(VECTORWIDTHSYMBOL, sub) + ) end else staticexpr(stophint - sub) @@ -1041,7 +1095,7 @@ function pointermax_index( sub::Int, isvectorized::Bool, stopsym, - incr::MaybeKnown, + incr::MaybeKnown )::Tuple{Expr,Int} loopsym = names(ls)[n] index = Expr(:tuple) @@ -1087,7 +1141,7 @@ function pointermax( sub::Int, isvectorized::Bool, stopsym, - incr::MaybeKnown, + incr::MaybeKnown )::Expr index = first(pointermax_index(ls, ar, n, sub, isvectorized, stopsym, incr)) vptrar = vptr(ar) @@ -1096,7 +1150,7 @@ function pointermax( GlobalRef(VectorizationBase, :increment_ptr), vptrar, vptr_offset(vptrar), - index, + index ) end @@ -1105,11 +1159,17 @@ function defpointermax( ar::ArrayReferenceMeta, n::Int, sub::Int, - isvectorized::Bool, + isvectorized::Bool )::Expr Expr(:(=), maxsym(vptr(ar), sub), pointermax(ls, ar, n, sub, isvectorized)) end -function offsetindex(dim::Int, ind::Int, scale::Int, isvectorized::Bool, incr::MaybeKnown) +function offsetindex( + dim::Int, + ind::Int, + scale::Int, + isvectorized::Bool, + incr::MaybeKnown +) index = Expr(:tuple) for d ∈ 1:dim if d ≠ ind || iszero(scale) @@ -1134,7 +1194,7 @@ function append_pointer_maxes!( submax::Int, isvectorized::Bool, stopindicator, - incr::MaybeKnown, + incr::MaybeKnown ) vptr_ar = vptr(ar) if submax < 2 @@ -1144,17 +1204,22 @@ function append_pointer_maxes!( Expr( :(=), maxsym(vptr_ar, sub), - pointermax(ls, ar, n, sub, isvectorized, stopindicator, incr), - ), + pointermax(ls, ar, n, sub, isvectorized, stopindicator, incr) + ) ) end else - index, ind = pointermax_index(ls, ar, n, submax, isvectorized, stopindicator, incr) + index, ind = + pointermax_index(ls, ar, n, submax, isvectorized, stopindicator, incr) pointercompbase = maxsym(vptr_ar, submax) ip = GlobalRef(VectorizationBase, :increment_ptr) push!( loopstart.args, - Expr(:(=), pointercompbase, Expr(:call, ip, vptr_ar, vptr_offset(vptr_ar), index)), + Expr( + :(=), + pointercompbase, + Expr(:call, ip, vptr_ar, vptr_offset(vptr_ar), index) + ) ) dim = length(getindicesonly(ar)) # OFFSETPRECALCDEF = true @@ -1166,7 +1231,7 @@ function append_pointer_maxes!( ip, vptr_ar, pointercompbase, - offsetindex(dim, ind, (submax - sub) * strd, isvectorized, incr), + offsetindex(dim, ind, (submax - sub) * strd, isvectorized, incr) ) push!(loopstart.args, Expr(:(=), maxsym(vptr_ar, sub), ptrcmp)) end @@ -1178,7 +1243,7 @@ function append_pointer_maxes!( ar::ArrayReferenceMeta, n::Int, submax::Int, - isvectorized::Bool, + isvectorized::Bool ) loop = getloop(ls, n) @assert loop.itersymbol == names(ls)[n] @@ -1194,11 +1259,20 @@ function append_pointer_maxes!( submax, isvectorized, startstopΔ(loop) + 1, - incr, + incr ) end looplensym = isone(start) ? getsym(stop) : loop.lensym - append_pointer_maxes!(loopstart, ls, ar, n, submax, isvectorized, looplensym, incr) + append_pointer_maxes!( + loopstart, + ls, + ar, + n, + submax, + isvectorized, + looplensym, + incr + ) end function maxunroll(us::UnrollSpecification, n) @@ -1212,8 +1286,12 @@ function maxunroll(us::UnrollSpecification, n) end end - -function startloop(ls::LoopSet, us::UnrollSpecification, n::Int, staticinit::Bool = false) +function startloop( + ls::LoopSet, + us::UnrollSpecification, + n::Int, + staticinit::Bool = false +) @unpack u₁loopnum, u₂loopnum, vloopnum, u₁, u₂ = us lssm = ls.lssm termind = lssm.terminators[n] @@ -1229,7 +1307,14 @@ function startloop(ls::LoopSet, us::UnrollSpecification, n::Int, staticinit::Boo else isvectorized = n == vloopnum submax = maxunroll(us, n) - append_pointer_maxes!(loopstart, ls, ptrdefs[termind], n, submax, isvectorized) + append_pointer_maxes!( + loopstart, + ls, + ptrdefs[termind], + n, + submax, + isvectorized + ) end loopstart end @@ -1240,7 +1325,7 @@ function offset_ptr( n::Int, UF::Int, offsetinds::Vector{Bool}, - loop::Loop, + loop::Loop ) indices = getindices(ar) strides = getstrides(ar) @@ -1258,8 +1343,13 @@ function offset_ptr( # ind == loopsym && break end vpoff = vptr_offset(ar) - call = - Expr(:call, GlobalRef(VectorizationBase, :increment_ptr), vptr(ar), vpoff, gespinds) + call = Expr( + :call, + GlobalRef(VectorizationBase, :increment_ptr), + vptr(ar), + vpoff, + gespinds + ) Expr(:(=), vpoff, call) end function incrementloopcounter!( @@ -1267,7 +1357,7 @@ function incrementloopcounter!( ls::LoopSet, us::UnrollSpecification, n::Int, - UF::Int, + UF::Int ) @unpack u₁loopnum, u₂loopnum, vloopnum, u₁, u₂ = us lssm = ls.lssm @@ -1290,7 +1380,7 @@ function terminatecondition( us::UnrollSpecification, n::Int, inclmask::Bool, - UF::Int, + UF::Int ) lssm = ls.lssm termind = lssm.terminators[n] diff --git a/src/codegen/lower_compute.jl b/src/codegen/lower_compute.jl index 7c1cb7f2b..86a3c062b 100644 --- a/src/codegen/lower_compute.jl +++ b/src/codegen/lower_compute.jl @@ -1,11 +1,10 @@ - function load_constrained( op::Operation, u₁loop::Symbol, u₂loop::Symbol, innermost_loop_or_vloop::Symbol, - forprefetch::Bool = false, + forprefetch::Bool = false ) dependsonu₁ = isu₁unrolled(op) dependsonu₂ = isu₂unrolled(op) @@ -42,10 +41,17 @@ function check_if_remfirst(ls::LoopSet, ua::UnrollArgs) end function sub_fmas(ls::LoopSet, op::Operation, ua::UnrollArgs) @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, u₂max = ua - !(load_constrained(op, u₁loopsym, u₂loopsym, vloopsym) || check_if_remfirst(ls, ua)) + !( + load_constrained(op, u₁loopsym, u₂loopsym, vloopsym) || + check_if_remfirst(ls, ua) + ) end -function parent_unroll_status(op::Operation, u₁loop::Symbol, us::UnrollSpecification) +function parent_unroll_status( + op::Operation, + u₁loop::Symbol, + us::UnrollSpecification +) parentsop = parents(op) u2 = fill(false, length(parentsop)) u1 = similar(u2) @@ -60,7 +66,7 @@ function parent_unroll_status( u₂loop::Symbol, vloop::Symbol, u₂max::Int, - us::UnrollSpecification, + us::UnrollSpecification ) u₂max == -1 && return parent_unroll_status(op, u₁loop, us) vparents = parents(op) @@ -75,7 +81,13 @@ function parent_unroll_status( parents_u₁syms, parents_u₂syms end -function _add_loopvalue!(ex::Expr, loopval::Symbol, vloop::Loop, u::Int, loop::Loop) +function _add_loopvalue!( + ex::Expr, + loopval::Symbol, + vloop::Loop, + u::Int, + loop::Loop +) vloopsym = vloop.itersymbol if loopval === vloopsym if iszero(u) @@ -86,18 +98,30 @@ function _add_loopvalue!(ex::Expr, loopval::Symbol, vloop::Loop, u::Int, loop::L if isone(u) & isone(vstep) push!(ex.args, Expr(:call, lv(:vadd_nsw), VECTORWIDTHSYMBOL, mm)) else - push!(ex.args, Expr(:call, lv(:vadd_nsw), mulexpr(VECTORWIDTHSYMBOL, u, vstep), mm)) + push!( + ex.args, + Expr(:call, lv(:vadd_nsw), mulexpr(VECTORWIDTHSYMBOL, u, vstep), mm) + ) end end elseif u == 0 push!(ex.args, loopval) elseif isknown(step(loop)) - push!(ex.args, Expr(:call, lv(:vadd_nsw), loopval, staticexpr(u * gethint(step(loop))))) + push!( + ex.args, + Expr(:call, lv(:vadd_nsw), loopval, staticexpr(u * gethint(step(loop)))) + ) else push!(ex.args, Expr(:call, lv(:vadd_nsw), loopval, mulexpr(step(loop), u))) end end -function add_loopvalue!(instrcall::Expr, loopval, ua::UnrollArgs, u₁::Int, loop::Loop) +function add_loopvalue!( + instrcall::Expr, + loopval, + ua::UnrollArgs, + u₁::Int, + loop::Loop +) @unpack u₁loopsym, u₂loopsym, vloopsym, vloop, suffix = ua if loopval === u₁loopsym #parentsunrolled[n] if isone(u₁) @@ -120,7 +144,14 @@ end vecunrolllen(::Type{VecUnroll{N,W,T,V}}) where {N,W,T,V} = (N::Int + 1) vecunrolllen(_) = -1 -function ifelselastexpr(hasf::Bool, M::Int, vargtypes, K::Int, S::Int, maskearly::Bool) +function ifelselastexpr( + hasf::Bool, + M::Int, + vargtypes, + K::Int, + S::Int, + maskearly::Bool +) q = Expr(:block, Expr(:meta, :inline)) vargs = Vector{Symbol}(undef, K) for k ∈ 1:K @@ -152,8 +183,8 @@ function ifelselastexpr(hasf::Bool, M::Int, vargtypes, K::Int, S::Int, maskearly else hasf || throw( ArgumentError( - "Argument reduction only supported for `ifelse(last/partial)(f::Function, args...)`", - ), + "Argument reduction only supported for `ifelse(last/partial)(f::Function, args...)`" + ) ) M = maxlen t = q @@ -166,8 +197,8 @@ function ifelselastexpr(hasf::Bool, M::Int, vargtypes, K::Int, S::Int, maskearly end for m ∈ start:M call = if hasf - (maskearly | (m == M)) ? Expr(:call, VectorizationBase.vifelse, :f, :m) : - Expr(:call, :f) + (maskearly | (m == M)) ? + Expr(:call, VectorizationBase.vifelse, :f, :m) : Expr(:call, :f) else# m == M because !hasf Expr(:call, :ifelse, :m) end @@ -202,7 +233,7 @@ end m::AbstractMask{W}, ::StaticInt{M}, ::StaticInt{S}, - vargs::Vararg{Any,K}, + vargs::Vararg{Any,K} ) where {F,W,K,M,S} ifelselastexpr(true, M, vargs, K, S, false) end @@ -211,7 +242,7 @@ end ::StaticInt{M}, ::StaticInt{S}, varg_1::V1, - varg_2::V2, + varg_2::V2 ) where {W,V1,V2,M,S} ifelselastexpr(false, M, (V1, V2), 2, S, false) end @@ -220,7 +251,7 @@ end m::AbstractMask{W}, ::StaticInt{M}, ::StaticInt{S}, - vargs::Vararg{Any,K}, + vargs::Vararg{Any,K} ) where {F,W,K,M,S} ifelselastexpr(true, M, vargs, K, S, true) end @@ -229,21 +260,25 @@ end ::StaticInt{M}, ::StaticInt{S}, varg_1::V1, - varg_2::V2, + varg_2::V2 ) where {W,V1,V2,M,S} ifelselastexpr(false, M, (V1, V2), 2, S, true) end # @inline ifelselast(f::F, m::AbstractMask{W}, ::StaticInt{M}, ::StaticInt{S}, vargs::Vararg{NativeTypes,K}) where {F,W,K,M,S} = f(vargs...) # @inline ifelsepartial(f::F, m::AbstractMask{W}, ::StaticInt{M}, ::StaticInt{S}, vargs::Vararg{NativeTypes,K}) where {F,W,K,M,S} = f(vargs...) -@generated function subset_vec_unroll(vu::VecUnroll{N}, ::StaticInt{S}) where {N,S} +@generated function subset_vec_unroll( + vu::VecUnroll{N}, + ::StaticInt{S} +) where {N,S} (1 ≤ S ≤ N + 1) || throw( ArgumentError( - "`vu` isa `VecUnroll` of `$(N+1)` elements, but trying to subset $S of them.", - ), + "`vu` isa `VecUnroll` of `$(N+1)` elements, but trying to subset $S of them." + ) ) t = Expr(:tuple) gf = GlobalRef(Core, :getfield) - S == 1 && return Expr(:block, Expr(:meta, :inline), :($gf($gf(vu, 1), 1, false))) + S == 1 && + return Expr(:block, Expr(:meta, :inline), :($gf($gf(vu, 1), 1, false))) for s ∈ 1:S push!(t.args, Expr(:call, gf, :vud, s, false)) end @@ -259,7 +294,7 @@ end default::D, ::StaticInt{M}, ::StaticInt{S}, - vargs::Vararg{Any,K}, + vargs::Vararg{Any,K} ) where {F,M,K,D,S} lengths = Vector{Int}(undef, K) q = Expr(:block, Expr(:meta, :inline)) @@ -343,7 +378,7 @@ function parent_op_name!( u₂max, u₂unrolledsym, op, - tiledouterreduction, + tiledouterreduction ) opp = parents_op[n] opisvectorized = isvectorized(op) @@ -367,14 +402,15 @@ function parent_op_name!( if parents_u₂syms[n] if isu₂unrolled(op) # u₂unrolledsym || parent = - isouterreduct ? Symbol(parent, suffix) : Symbol(parent, suffix, '_', '_', u) + isouterreduct ? Symbol(parent, suffix) : + Symbol(parent, suffix, '_', '_', u) elseif u₂max > 1 t = Expr(:tuple) reduction = Expr( :call, GlobalRef(ArrayInterface, :reduce_tup), reduce_to_onevecunroll(opp), - t, + t ) for u₂ ∈ 0:u₂max-1 push!(t.args, Symbol(parent, u₂, '_', '_', u)) @@ -437,8 +473,16 @@ function getu₁forreduct(ls::LoopSet, op::Operation, u₁::Int) end end isidentityop(op::Operation) = - iscompute(op) && (instruction(op).instr === :identity) && (length(parents(op)) == 1) -function reduce_parent!(q::Expr, ls::LoopSet, op::Operation, opp::Operation, parent::Symbol) + iscompute(op) && + (instruction(op).instr === :identity) && + (length(parents(op)) == 1) +function reduce_parent!( + q::Expr, + ls::LoopSet, + op::Operation, + opp::Operation, + parent::Symbol +) isvectorized(op) && return parent if isvectorized(opp) oppt = opp @@ -449,7 +493,8 @@ function reduce_parent!(q::Expr, ls::LoopSet, op::Operation, opp::Operation, par return parent end reduct_class = reduction_instruction_class(oppt.instruction) - if (instruction(op).instr === :mul_fast) & (reduct_class == ADDITIVE_IN_REDUCTIONS) + if (instruction(op).instr === :mul_fast) & + (reduct_class == ADDITIVE_IN_REDUCTIONS) op.vectorized = true return parent end @@ -457,7 +502,11 @@ function reduce_parent!(q::Expr, ls::LoopSet, op::Operation, opp::Operation, par if instruction(op).instr ≢ :ifelse push!( q.args, - Expr(:(=), newp, Expr(:call, lv(reduction_to_scalar(reduct_class)), parent)), + Expr( + :(=), + newp, + Expr(:call, lv(reduction_to_scalar(reduct_class)), parent) + ) )#IfElseReducer else reductexpr = ifelse_reduction(:IfElseReducer, op) do opv @@ -467,7 +516,13 @@ function reduce_parent!(q::Expr, ls::LoopSet, op::Operation, opp::Operation, par end newp end -function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mask::Bool) +function lower_compute!( + q::Expr, + op::Operation, + ls::LoopSet, + ua::UnrollArgs, + mask::Bool +) @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, u₂max, suffix = ua var = name(op) instr = instruction(op) @@ -506,7 +561,7 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas parentop.reduced_deps, parentop.parents, parentop.ref, - parentop.reduced_children, + parentop.reduced_children ) newparentop.vectorized = false newparentop.u₁unrolled = false @@ -519,14 +574,21 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas newparentname = Symbol(newparentname, suffix_) end if isconstant(newparentop) - push!(q.args, Expr(:(=), Symbol(newparentname, '_', 1), Symbol(parentname, '_', 1))) + push!( + q.args, + Expr(:(=), Symbol(newparentname, '_', 1), Symbol(parentname, '_', 1)) + ) else newpname = Symbol(newparentname, '_', u₁) push!(q.args, Expr(:(=), newpname, Symbol(parentname, '_', u₁))) reduce_expr!(q, newparentname, newparentop, u₁, -1, true, false) push!( q.args, - Expr(:(=), Symbol(newparentname, '_', 1), Symbol(newparentname, "##onevec##")), + Expr( + :(=), + Symbol(newparentname, '_', 1), + Symbol(newparentname, "##onevec##") + ) ) end end @@ -544,14 +606,15 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas # instrfid = findfirst(isequal(instr.instr), (:vfmadd, :vfnmadd, :vfmsub, :vfnmsub)) instrfid = findfirst( Base.Fix2(===, instr.instr), - (:vfmadd_fast, :vfnmadd_fast, :vfmsub_fast, :vfnmsub_fast), + (:vfmadd_fast, :vfnmadd_fast, :vfmsub_fast, :vfnmsub_fast) ) # instrfid = findfirst(isequal(instr.instr), (:vfnmadd_fast, :vfmsub_fast, :vfnmsub_fast)) # want to instcombine when parent load's deps are superset # also make sure opp is unrolled if !(instrfid === nothing) && (opunrolled && u₁ > 1) && sub_fmas(ls, op, ua) specific_fmas = - Base.libllvm_version >= v"11.0.0" ? (:vfmadd, :vfnmadd, :vfmsub, :vfnmsub) : + Base.libllvm_version >= v"11.0.0" ? + (:vfmadd, :vfnmadd, :vfmsub, :vfnmsub) : (:vfmadd231, :vfnmadd231, :vfmsub231, :vfnmsub231) # specific_fmas = Base.libllvm_version >= v"11.0.0" ? (:vfnmadd, :vfmsub, :vfnmsub) : (:vfnmadd231, :vfmsub231, :vfnmsub231) # specific_fmas = (:vfmadd231, :vfnmadd231, :vfmsub231, :vfnmsub231) @@ -597,7 +660,8 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas if isreduct #(isanouterreduction(ls, op)) # isouterreduct = true isouterreduct = isanouterreduction(ls, op) - u₁reduct = isouterreduct ? getu₁full(ls, u₁) : getu₁forreduct(ls, op, u₁) + u₁reduct = + isouterreduct ? getu₁full(ls, u₁) : getu₁forreduct(ls, op, u₁) dopartialmap = u₁reduct ≠ u₁ Symbol(mvar, '_', u₁reduct) else @@ -618,7 +682,6 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas if ((isvectorized(opp) && !isvectorized(op))) || (parents_u₁syms[n] != u₁unrolledsym) || (parents_u₂syms[n] != u₂unrolledsym) - selfopname, uₚ = parent_op_name!( q, ls, @@ -632,7 +695,7 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas u₂max, u₂unrolledsym, op, - tiledouterreduction, + tiledouterreduction ) push!(instrcall.args, selfopname) else @@ -665,7 +728,7 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas u₂max, u₂unrolledsym, op, - 0, + 0 ) parent = reduce_parent!(q, ls, op, opp, parent) if (selfdep == 0) && search_tree(parents(opp), name(op)) @@ -678,7 +741,8 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas end end end - selfdepreduce = ifelse(((!u₁unrolledsym) & isu₁unrolled(op)) & (u₁ > 1), selfdep, 0) + selfdepreduce = + ifelse(((!u₁unrolledsym) & isu₁unrolled(op)) & (u₁ > 1), selfdep, 0) if maskreduct ifelsefunc = if us.u₁ == 1 :ifelse # don't need to be fancy @@ -705,8 +769,8 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas Expr( :(=), varsym, - Expr(:call, lv(ifelsefunc), MASKSYMBOL, instrcall, selfopname), - ), + Expr(:call, lv(ifelsefunc), MASKSYMBOL, instrcall, selfopname) + ) ) elseif ((u₁ ≡ 1) | (selfdepreduce ≡ 0)) # if the current unroll is 1, no need to accumulate. Same if there is no selfdepreduce, but there has to be if we're here? @@ -723,16 +787,20 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas staticexpr(u₁), staticexpr(selfdepreduce), instrcall, - selfopname, - ), - ), + selfopname + ) + ) ) else make_partial_map!(instrcall, selfopname, u₁, selfdepreduce) # partialmap accumulates push!( q.args, - Expr(:(=), varsym, Expr(:call, lv(:ifelse), MASKSYMBOL, instrcall, selfopname)), + Expr( + :(=), + varsym, + Expr(:call, lv(:ifelse), MASKSYMBOL, instrcall, selfopname) + ) ) end return @@ -761,7 +829,11 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas elseif identifier(op) ∉ ls.outer_reductions && should_broadcast_op(op) push!( q.args, - Expr(:(=), varsym, Expr(:call, lv(:vbroadcast), VECTORWIDTHSYMBOL, instrcall)), + Expr( + :(=), + varsym, + Expr(:call, lv(:vbroadcast), VECTORWIDTHSYMBOL, instrcall) + ) ) else push!(q.args, Expr(:(=), varsym, instrcall)) diff --git a/src/codegen/lower_constant.jl b/src/codegen/lower_constant.jl index add374880..0429b75dd 100644 --- a/src/codegen/lower_constant.jl +++ b/src/codegen/lower_constant.jl @@ -6,8 +6,9 @@ function should_broadcast_op(op::Operation) true end - -@inline sizeequivalentfloat(::Type{T}) where {T<:Union{Float16,Float32,Float64}} = T +@inline sizeequivalentfloat( + ::Type{T} +) where {T<:Union{Float16,Float32,Float64}} = T @inline sizeequivalentfloat(::Type{T}) where {T<:Union{Int8,UInt8}} = Float32 @inline sizeequivalentfloat(::Type{T}) where {T<:Union{Int16,UInt16}} = Float16 @inline sizeequivalentfloat(::Type{T}) where {T<:Union{Int32,UInt32}} = Float32 @@ -19,8 +20,9 @@ end if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686) @inline widest_supported_integer(::True) = Int64 @inline widest_supported_integer(::False) = Int32 - @inline sizeequivalentint(::Type{Float64}) = - widest_supported_integer(VectorizationBase.has_feature(Val(:x86_64_avx512dq))) + @inline sizeequivalentint(::Type{Float64}) = widest_supported_integer( + VectorizationBase.has_feature(Val(:x86_64_avx512dq)) + ) else @inline sizeequivalentint(::Type{Float64}) = Int end @@ -35,14 +37,14 @@ function typeof_sym(ls::LoopSet, op::Operation, zerotyp::NumberType) newtypeT = gensym(:IntType) pushpreamble!( ls, - Expr(:(=), newtypeT, Expr(:call, lv(:sizeequivalentint), ELTYPESYMBOL)), + Expr(:(=), newtypeT, Expr(:call, lv(:sizeequivalentint), ELTYPESYMBOL)) ) newtypeT elseif zerotyp == HardFloat newtypeT = gensym(:FloatType) pushpreamble!( ls, - Expr(:(=), newtypeT, Expr(:call, lv(:sizeequivalentfloat), ELTYPESYMBOL)), + Expr(:(=), newtypeT, Expr(:call, lv(:sizeequivalentfloat), ELTYPESYMBOL)) ) newtypeT else @@ -55,7 +57,7 @@ function lower_zero!( op::Operation, ls::LoopSet, ua::UnrollArgs, - zerotyp::NumberType = zerotype(ls, op), + zerotyp::NumberType = zerotype(ls, op) ) @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, vloop, u₂max, suffix = ua mvar, opu₁, opu₂ = @@ -79,10 +81,16 @@ function lower_zero!( staticexpr(u₁), VECTORWIDTHSYMBOL, typeT, - staticexpr(reg_size(ls)), + staticexpr(reg_size(ls)) ) else - call = Expr(:call, lv(:_vzero), VECTORWIDTHSYMBOL, typeT, staticexpr(reg_size(ls))) + call = Expr( + :call, + lv(:_vzero), + VECTORWIDTHSYMBOL, + typeT, + staticexpr(reg_size(ls)) + ) end else call = Expr(:call, :zero, typeT) @@ -96,7 +104,10 @@ function lower_zero!( end if (suffix == -1) && opu₂ for u ∈ 0:u₂max-1 - push!(q.args, Expr(:(=), Symbol(mvar, u, "__", Core.ifelse(opu₁, u₁, 1)), call)) + push!( + q.args, + Expr(:(=), Symbol(mvar, u, "__", Core.ifelse(opu₁, u₁, 1)), call) + ) end else mvar = Symbol(mvar, '_', Core.ifelse(opu₁, u₁, 1)) @@ -118,8 +129,11 @@ function getparentsreductzero(ls::LoopSet, op::Operation)::Float64 end throw("Reduct zero not found for operation $(name(op)).") end -vecbasefunc(f) = - Expr(:(.), Expr(:(.), :LoopVectorization, QuoteNode(:VectorizationBase)), QuoteNode(f)) +vecbasefunc(f) = Expr( + :(.), + Expr(:(.), :LoopVectorization, QuoteNode(:VectorizationBase)), + QuoteNode(f) +) function lower_constant!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs) @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, u₂max, suffix = ua mvar, opu₁, opu₂ = @@ -141,14 +155,19 @@ function lower_constant!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs) :call, vecbasefunc(:addscalar), Expr(:call, lv(:vzero), VECTORWIDTHSYMBOL, ELTYPESYMBOL), - constsym, + constsym ) elseif instrclass == MULTIPLICATIVE_IN_REDUCTIONS Expr( :call, vecbasefunc(:mulscalar), - Expr(:call, lv(:vbroadcast), VECTORWIDTHSYMBOL, Expr(:call, :one, ELTYPESYMBOL)), - constsym, + Expr( + :call, + lv(:vbroadcast), + VECTORWIDTHSYMBOL, + Expr(:call, :one, ELTYPESYMBOL) + ), + constsym ) elseif instrclass == MAX Expr( @@ -158,9 +177,9 @@ function lower_constant!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs) :call, lv(:vbroadcast), VECTORWIDTHSYMBOL, - Expr(:call, :typemin, ELTYPESYMBOL), + Expr(:call, :typemin, ELTYPESYMBOL) ), - constsym, + constsym ) elseif instrclass == MIN Expr( @@ -170,13 +189,13 @@ function lower_constant!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs) :call, lv(:vbroadcast), VECTORWIDTHSYMBOL, - Expr(:call, :typemax, ELTYPESYMBOL), + Expr(:call, :typemax, ELTYPESYMBOL) ), - constsym, + constsym ) else throw( - "Reductions of type $(reduction_zero(instrclass)) not yet supported; please file an issue as a reminder to take care of this.", + "Reductions of type $(reduction_zero(instrclass)) not yet supported; please file an issue as a reminder to take care of this." ) end else @@ -219,7 +238,8 @@ function lower_constant!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs) end isconstantop(op::Operation) = - (instruction(op) == LOOPCONSTANT) || (isconstant(op) && length(loopdependencies(op)) == 0) + (instruction(op) == LOOPCONSTANT) || + (isconstant(op) && length(loopdependencies(op)) == 0) function isinitializedconst(op::Operation) if isconstant(op) return true @@ -281,7 +301,11 @@ function lower_licm_constants!(ls::LoopSet) end end for (id, floatval) ∈ ls.preamble_symfloat - setop!(ls, ops[id], Expr(:call, lv(:sizeequivalentfloat), ELTYPESYMBOL, floatval)) + setop!( + ls, + ops[id], + Expr(:call, lv(:sizeequivalentfloat), ELTYPESYMBOL, floatval) + ) end for (id, typ) ∈ ls.preamble_zeros instruction(ops[id]) === LOOPCONSTANT || continue diff --git a/src/codegen/lower_load.jl b/src/codegen/lower_load.jl index 7795db04f..d9af6a55c 100644 --- a/src/codegen/lower_load.jl +++ b/src/codegen/lower_load.jl @@ -35,7 +35,13 @@ function prefetchisagoodidea(ls::LoopSet, op::Operation, td::UnrollArgs) for opp ∈ operations(ls) if iscompute(opp) && (innermostloopsym ∈ loopdependencies(opp)) && - load_constrained(opp, u₁loopsym, u₂loopsym, innermostloopsym, true) + load_constrained( + opp, + u₁loopsym, + u₂loopsym, + innermostloopsym, + true + ) return 0 end end @@ -51,7 +57,7 @@ function add_prefetches!( ls::LoopSet, op::Operation, td::UnrollArgs, - prefetchind::Int, + prefetchind::Int ) # TODO: maybe prefetch for non-x86_64? ((Sys.ARCH === :x86_64) || (Sys.ARCH === :i686)) || return nothing @@ -62,7 +68,8 @@ function add_prefetches!( innermostloopsym = first(names(ls)) us = ls.unrollspecification prefetch_distance = - u₁loopsym === innermostloopsym ? us.u₁ : (u₂loopsym === innermostloopsym ? us.u₂ : 1) + u₁loopsym === innermostloopsym ? us.u₁ : + (u₂loopsym === innermostloopsym ? us.u₂ : 1) # prefetch_distance = u₁loopsym === innermostloopsym ? u₁ : ( u₂loopsym === innermostloopsym ? u₂max : 1 ) prefetch_multiplier = 5 prefetch_distance *= prefetch_multiplier @@ -81,7 +88,7 @@ function add_prefetches!( false, 0, ls, - false, + false ) offsets[prefetchind] = inner_offset ptr = vptr(op) @@ -89,14 +96,19 @@ function add_prefetches!( if !isknown(prefetchloop_step) for i ∈ eachindex(gespinds.args) if i == prefetchind - gespinds.args[i] = - mulexpr(getsym(prefetchloop_step), (gespinds.args[i])::Union{Symbol,Expr}) + gespinds.args[i] = mulexpr( + getsym(prefetchloop_step), + (gespinds.args[i])::Union{Symbol,Expr} + ) end # gespinds.args[i] = Expr(:call, lv(:data), gespinds.args[i]) end end ip = GlobalRef(VectorizationBase, :increment_ptr) - push!(q.args, Expr(:(=), gptr, Expr(:call, ip, ptr, vptr_offset(ptr), gespinds))) + push!( + q.args, + Expr(:(=), gptr, Expr(:call, ip, ptr, vptr_offset(ptr), gespinds)) + ) inds = Expr(:tuple) indices = getindicesonly(op) @@ -126,7 +138,10 @@ function add_prefetches!( else inds.args[i] = staticexpr(u) end - push!(q.args, Expr(:call, prefetch0, Expr(:call, ip, ptr, gptr, copy(inds)))) + push!( + q.args, + Expr(:call, prefetch0, Expr(:call, ip, ptr, gptr, copy(inds))) + ) end nothing end @@ -137,8 +152,8 @@ function pushbroadcast!(q::Expr, mvar::Symbol) Expr( :(=), broadcastedname(mvar), - Expr(:call, lv(:vbroadcast), VECTORWIDTHSYMBOL, mvar), - ), + Expr(:call, lv(:vbroadcast), VECTORWIDTHSYMBOL, mvar) + ) ) end @@ -168,7 +183,7 @@ function lower_load_no_optranslation!( op::Operation, td::UnrollArgs, mask::Bool, - inds_calc_by_ptr_offset::Vector{Bool}, + inds_calc_by_ptr_offset::Vector{Bool} ) @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, suffix = td # @assert isvectorized(op) @@ -187,9 +202,11 @@ function lower_load_no_optranslation!( t = Expr(:tuple) sptrsym = sptr!(q, op) for u ∈ 1:u₁ - inds = mem_offset_u(op, td, inds_calc_by_ptr_offset, true, u - 1, ls, false) + inds = + mem_offset_u(op, td, inds_calc_by_ptr_offset, true, u - 1, ls, false) loadexpr = Expr(:call, lv(:_vload), sptrsym, inds) - domask = mask && (isvectorized(op) & ((u == u₁) | (vloopsym !== u₁loopsym))) + domask = + mask && (isvectorized(op) & ((u == u₁) | (vloopsym !== u₁loopsym))) add_memory_mask!(loadexpr, op, td, domask, ls, u) push!(loadexpr.args, falseexpr, rs) push!(t.args, loadexpr) @@ -230,7 +247,7 @@ function lower_load_for_optranslation!( ls::LoopSet, td::UnrollArgs, mask::Bool, - translationind::Int, + translationind::Int ) @unpack u₁loop, u₂loop, vloop, u₁, u₂max, suffix = td # @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, u₂max, suffix = td @@ -245,7 +262,14 @@ function lower_load_for_optranslation!( step₂ = gethint(step(u₂loop)) # abs of steps are equal equal_steps = (step₁ == step₂) ⊻ (posindicator ≠ 0x03) - _td = UnrollArgs(u₁loop, u₂loop, vloop, u₁, u₂max, Core.ifelse(equal_steps, 0, u₂max - 1)) + _td = UnrollArgs( + u₁loop, + u₂loop, + vloop, + u₁, + u₂max, + Core.ifelse(equal_steps, 0, u₂max - 1) + ) gespinds = mem_offset(op, _td, inds_by_ptroff, false, ls, false) ptr = vptr(op) gptr = Symbol(ptr, "##GESPED##") @@ -254,13 +278,16 @@ function lower_load_for_optranslation!( gespinds.args[i] = Expr( :call, lv(Core.ifelse(equal_steps, :firstunroll, :lastunroll)), - gespinds.args[i], + gespinds.args[i] ) end end ip = GlobalRef(VectorizationBase, :increment_ptr) vpo = vptr_offset(gptr) - push!(q.args, Expr(:(=), vpo, Expr(:call, ip, ptr, vptr_offset(ptr), gespinds))) + push!( + q.args, + Expr(:(=), vpo, Expr(:call, ip, ptr, vptr_offset(ptr), gespinds)) + ) push!(q.args, Expr(:(=), gptr, ptr))#Expr(:call, GlobalRef(VectorizationBase, :reconstruct_ptr), fill!(inds_by_ptroff, true) @unpack ref, loopedindex = mref @@ -268,9 +295,11 @@ function lower_load_for_optranslation!( # old_translation_index = indices[translationind] # indices[translationind] = u₁loop.itersymbol # getindicesonly returns a view of `getindices` - dummyref = ArrayReference(ref.array, indices, zero(getoffsets(ref)), getstrides(ref)) + dummyref = + ArrayReference(ref.array, indices, zero(getoffsets(ref)), getstrides(ref)) # loopedindex[translationind] = true - dummymref = ArrayReferenceMeta(dummyref, fill!(similar(loopedindex), true), gptr) + dummymref = + ArrayReferenceMeta(dummyref, fill!(similar(loopedindex), true), gptr) indonly = getindicesonly(dummyref) for i ∈ eachindex(indonly) if i == translationind @@ -299,7 +328,10 @@ function lower_load_for_optranslation!( push!(q.args, :($variable_name_data = getfield($variable_name_u, 1))) if shouldbroadcast broadcasted_data = broadcastedname(variable_name_data) - push!(q.args, :($broadcasted_data = getfield($(broadcastedname(variable_name_u)), 1))) + push!( + q.args, + :($broadcasted_data = getfield($(broadcastedname(variable_name_u)), 1)) + ) end gf = GlobalRef(Core, :getfield) for u₂ ∈ 0:u₂max-1 @@ -323,7 +355,11 @@ function lower_load_for_optranslation!( if shouldbroadcast push!( q.args, - Expr(:(=), broadcastedname(variable_name_u₂), Expr(:call, lv(:VecUnroll), tb)), + Expr( + :(=), + broadcastedname(variable_name_u₂), + Expr(:call, lv(:VecUnroll), tb) + ) ) end end @@ -332,11 +368,18 @@ end # TODO: this code should be rewritten to be more "orthogonal", so that we're just combining separate pieces. # Using sentinel values (eg, T = -1 for non tiling) in part to avoid recompilation. -function lower_load!(q::Expr, op::Operation, ls::LoopSet, td::UnrollArgs, mask::Bool) +function lower_load!( + q::Expr, + op::Operation, + ls::LoopSet, + td::UnrollArgs, + mask::Bool +) @unpack u₁, u₂max, u₁loopsym, u₂loopsym, vloopsym, suffix = td if (suffix != -1) && ls.loadelimination if (u₁ > 1) & (u₂max > 1) - istr, ispl = isoptranslation(ls, op, UnrollSymbols(u₁loopsym, u₂loopsym, vloopsym)) + istr, ispl = + isoptranslation(ls, op, UnrollSymbols(u₁loopsym, u₂loopsym, vloopsym)) if istr ≠ 0x00 return lower_load_for_optranslation!(q, op, ispl, ls, td, mask, istr) end @@ -347,7 +390,10 @@ function lower_load!(q::Expr, op::Operation, ls::LoopSet, td::UnrollArgs, mask:: varnew = variable_name(op, suffix) varold = variable_name(operations(ls)[id], suffix + mno) u = isu₁unrolled(op) ? u₁ : 1 - push!(q.args, Expr(:(=), Symbol(varnew, '_', u), Symbol(varold, '_', u))) + push!( + q.args, + Expr(:(=), Symbol(varnew, '_', u), Symbol(varold, '_', u)) + ) return end end @@ -360,13 +406,24 @@ function _lower_load!( op::Operation, td::UnrollArgs, mask::Bool, - inds_calc_by_ptr_offset::Vector{Bool} = indices_calculated_by_pointer_offsets(ls, op.ref), + inds_calc_by_ptr_offset::Vector{Bool} = indices_calculated_by_pointer_offsets( + ls, + op.ref + ) ) if rejectinterleave(op) - return lower_load_no_optranslation!(q, ls, op, td, mask, inds_calc_by_ptr_offset) + return lower_load_no_optranslation!( + q, + ls, + op, + td, + mask, + inds_calc_by_ptr_offset + ) else omop = offsetloadcollection(ls) - @unpack opids, opidcollectionmap, batchedcollections, batchedcollectionmap = omop + @unpack opids, opidcollectionmap, batchedcollections, batchedcollectionmap = + omop batchid, opind = batchedcollectionmap[identifier(op)] for (bid, oid) ∈ batchedcollectionmap # this relies on `for op ∈ ops` in codegen/operation_evaluation_order.jl if bid == batchid @@ -381,7 +438,7 @@ function _lower_load!( idsformap, td, mask, - inds_calc_by_ptr_offset, + inds_calc_by_ptr_offset ) end return nothing @@ -403,7 +460,12 @@ function rejectcurly(ls::LoopSet, op::Operation, td::UnrollArgs) @unpack u₁loopsym, vloopsym = td rejectcurly(ls, op, u₁loopsym, vloopsym) end -function rejectcurly(ls::LoopSet, op::Operation, u₁loopsym::Symbol, vloopsym::Symbol) +function rejectcurly( + ls::LoopSet, + op::Operation, + u₁loopsym::Symbol, + vloopsym::Symbol +) indices = getindicesonly(op) li = op.ref.loopedindex AV = AU = false @@ -427,7 +489,8 @@ function rejectcurly(ls::LoopSet, op::Operation, u₁loopsym::Symbol, vloopsym:: end if instruction(opp).instr === :(+) || instruction(opp).instr === :add_fast isadd = true - elseif instruction(opp).instr === :(-) || instruction(opp).instr === :sub_fast + elseif instruction(opp).instr === :(-) || + instruction(opp).instr === :sub_fast isadd = false else return true @@ -445,8 +508,8 @@ function rejectcurly(ls::LoopSet, op::Operation, u₁loopsym::Symbol, vloopsym:: isloopvalue(opp2) || return true end end - if (u₁loopsym === CONSTANTZEROINDEX) ? (CONSTANTZEROINDEX ∈ loopdependencies(opp)) : - (isu₁unrolled(opp)) + if (u₁loopsym === CONSTANTZEROINDEX) ? + (CONSTANTZEROINDEX ∈ loopdependencies(opp)) : (isu₁unrolled(opp)) AU && return true AU = true end @@ -458,7 +521,13 @@ function rejectinterleave( ls::LoopSet, op::Operation, vloop::Loop, - idsformap::SubArray{Tuple{Int,Int},1,Vector{Tuple{Int,Int}},Tuple{UnitRange{Int}},true}, + idsformap::SubArray{ + Tuple{Int,Int}, + 1, + Vector{Tuple{Int,Int}}, + Tuple{UnitRange{Int}}, + true + } ) strd = step(vloop) isknown(strd) || return true @@ -497,10 +566,16 @@ function lower_load_collection!( q::Expr, ls::LoopSet, opidmap::Vector{Int}, - idsformap::SubArray{Tuple{Int,Int},1,Vector{Tuple{Int,Int}},Tuple{UnitRange{Int}},true}, + idsformap::SubArray{ + Tuple{Int,Int}, + 1, + Vector{Tuple{Int,Int}}, + Tuple{UnitRange{Int}}, + true + }, ua::UnrollArgs, mask::Bool, - inds_calc_by_ptr_offset::Vector{Bool}, + inds_calc_by_ptr_offset::Vector{Bool} ) @unpack u₁, u₁loop, u₁loopsym, u₂loopsym, vloopsym, vloop, suffix = ua @@ -523,7 +598,7 @@ function lower_load_collection!( MaybeKnown(1024), MaybeKnown(1), Symbol(""), - Symbol(""), + Symbol("") ) unrollcurl₂ = unrolled_curly(op, nouter, offset_dummy_loop, vloop, mask, 1) # interleave always 1 here inds = mem_offset_u(op, ua, inds_calc_by_ptr_offset, false, 0, ls, false) @@ -532,13 +607,15 @@ function lower_load_collection!( rs = staticexpr(reg_size(ls)) opu₁, opu₂ = isunrolled_sym(op, u₁loopsym, u₂loopsym, vloopsym, ls) manualunrollu₁ = if opu₁ && u₁ > 1 # both unrolled - if isknown(step(u₁loop)) && sum(Base.Fix2(===, u₁loopsym), getindicesonly(op)) == 1 + if isknown(step(u₁loop)) && + sum(Base.Fix2(===, u₁loopsym), getindicesonly(op)) == 1 # if first(opindices) === u₁loopsym#vloopsym # interleaveval = -nouter # else interleaveval = 0 # end - unrollcurl₁ = unrolled_curly(op, u₁, ua.u₁loop, vloop, mask, interleaveval) + unrollcurl₁ = + unrolled_curly(op, u₁, ua.u₁loop, vloop, mask, interleaveval) inds = Expr(:call, unrollcurl₁, inds) false else @@ -567,7 +644,7 @@ function lower_load_collection!( "##size##", nouter, "##u₁##", - u₁, + u₁ ) gf = GlobalRef(Core, :getfield) if manualunrollu₁ @@ -579,7 +656,8 @@ function lower_load_collection!( for u ∈ 0:u₁-1 collectionname_u = Symbol(collectionname, :_, u) if u ≠ 0 - inds = mem_offset_u(op, ua, inds_calc_by_ptr_offset, false, u, ls, false) + inds = + mem_offset_u(op, ua, inds_calc_by_ptr_offset, false, u, ls, false) uinds = Expr(:call, unrollcurl₂, inds) loadexpr = copy(loadexpr) loadexpr.args[3] = Expr(:call, unrollcurl₂, inds) @@ -593,7 +671,8 @@ function lower_load_collection!( ext = extractedvs[i] if (u + 1) == u₁ _op = ops[opidmap[opid]] - mvar = Symbol(variable_name(_op, Core.ifelse(opu₂, suffix, -1)), '_', u₁) + mvar = + Symbol(variable_name(_op, Core.ifelse(opu₂, suffix, -1)), '_', u₁) push!(q.args, Expr(:(=), mvar, Expr(:call, lv(:VecUnroll), ext))) end push!(ext.args, Expr(:call, gf, collectionname_u, i, false)) diff --git a/src/codegen/lower_memory_common.jl b/src/codegen/lower_memory_common.jl index e41084f18..4741ba348 100644 --- a/src/codegen/lower_memory_common.jl +++ b/src/codegen/lower_memory_common.jl @@ -9,8 +9,14 @@ function symbolind(ind::Symbol, op::Operation, td::UnrollArgs, ls::LoopSet) id == -1 && return ind, op @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, u₂max, suffix = td parent = parents(op)[id] - pvar, u₁op, u₂op = - variable_name_and_unrolled(parent, u₁loopsym, u₂loopsym, vloopsym, suffix, ls) + pvar, u₁op, u₂op = variable_name_and_unrolled( + parent, + u₁loopsym, + u₂loopsym, + vloopsym, + suffix, + ls + ) Symbol(pvar, '_', Core.ifelse(u₁op, u₁, 1)), parent end @@ -60,12 +66,19 @@ function _addoffset!( indexstride::Union{Int,MaybeKnown}, index, offset, - calcbypointeroffset::Bool, + calcbypointeroffset::Bool ) # 6 -> 5 args if _isone(indexstride) addoffset!(ret, vloopstride, index, offset, calcbypointeroffset) else - __addoffset!(ret, vloopstride, indexstride, index, offset, calcbypointeroffset) + __addoffset!( + ret, + vloopstride, + indexstride, + index, + offset, + calcbypointeroffset + ) end end function _addoffset!( @@ -74,9 +87,16 @@ function _addoffset!( indexstride, index, offset, - calcbypointeroffset::Bool, + calcbypointeroffset::Bool ) # 6 -> 5 args - ___addoffset!(ret, vloopstride, indexstride, index, offset, calcbypointeroffset) + ___addoffset!( + ret, + vloopstride, + indexstride, + index, + offset, + calcbypointeroffset + ) end function __addoffset!( ret::Expr, @@ -84,9 +104,16 @@ function __addoffset!( indexstride, index, offset, - calcbypointeroffset::Bool, + calcbypointeroffset::Bool ) # 6 -> 5 args - ___addoffset!(ret, vloopstride, indexstride, index, offset, calcbypointeroffset) + ___addoffset!( + ret, + vloopstride, + indexstride, + index, + offset, + calcbypointeroffset + ) end function __addoffset!( ret::Expr, @@ -94,7 +121,7 @@ function __addoffset!( indexstride::Union{Int,MaybeKnown}, index, offset, - calcbypointeroffset::Bool, + calcbypointeroffset::Bool ) # 6 -> 5 args if isknown(vloopstride) & isknown(indexstride) addoffset!( @@ -102,10 +129,17 @@ function __addoffset!( gethint(vloopstride) * gethint(indexstride), index, offset, - calcbypointeroffset, + calcbypointeroffset ) else - ___addoffset!(ret, vloopstride, indexstride, index, offset, calcbypointeroffset) + ___addoffset!( + ret, + vloopstride, + indexstride, + index, + offset, + calcbypointeroffset + ) end end function ___addoffset!( @@ -114,9 +148,15 @@ function ___addoffset!( indexstride, index, offset, - calcbypointeroffset::Bool, + calcbypointeroffset::Bool ) # 6 -> 5 args - addoffset!(ret, mulexpr(vloopstride, indexstride), index, offset, calcbypointeroffset) + addoffset!( + ret, + mulexpr(vloopstride, indexstride), + index, + offset, + calcbypointeroffset + ) end # multiply `index` by `indexstride` function addoffset!( @@ -125,12 +165,19 @@ function addoffset!( indexstride, index, offset, - calcbypointeroffset::Bool, + calcbypointeroffset::Bool ) # 6 -> (5 or 6) args if _isone(indexstride) addoffset!(ret, vloopstride, index, offset, calcbypointeroffset) # 5 elseif calcbypointeroffset # `ind` is getting dropped, no need to allocate via `mulexpr` - _addoffset!(ret, vloopstride, indexstride, index, offset, calcbypointeroffset) # 6 + _addoffset!( + ret, + vloopstride, + indexstride, + index, + offset, + calcbypointeroffset + ) # 6 else # multiply index by stride _addoffset!( ret, @@ -138,12 +185,11 @@ function addoffset!( indexstride, mulexpr(index, indexstride), offset, - calcbypointeroffset, + calcbypointeroffset ) # 6 end end - function addoffset!( ret::Expr, indvectorized::Bool, @@ -151,14 +197,27 @@ function addoffset!( indexstride, index, offset, - calcbypointeroffset::Bool, + calcbypointeroffset::Bool ) # 7 -> (5 or 6) args if indvectorized - addoffset!(ret, vloopstride, indexstride, index, offset, calcbypointeroffset) + addoffset!( + ret, + vloopstride, + indexstride, + index, + offset, + calcbypointeroffset + ) elseif _isone(indexstride) addoffset!(ret, 0, index, offset, calcbypointeroffset) else - addoffset!(ret, 0, lazymulexpr(index, indexstride), offset, calcbypointeroffset) + addoffset!( + ret, + 0, + lazymulexpr(index, indexstride), + offset, + calcbypointeroffset + ) end end @@ -170,7 +229,7 @@ function addvectoroffset!( indexstride, index, offset, - calcbypointeroffset::Bool, + calcbypointeroffset::Bool ) # 8 -> 7 args # if _iszero(unrolledsteps) # if no steps, pass through; should be unreachable # addoffset!(ret, indvectorized, vloopstride, indexstride, index, offset, calcbypointeroffset) @@ -184,7 +243,7 @@ function addvectoroffset!( indexstride, VECTORWIDTHSYMBOL, offset, - false, + false ) else addoffset!( @@ -194,7 +253,7 @@ function addvectoroffset!( indexstride, mulexpr(VECTORWIDTHSYMBOL, unrolledsteps), offset, - false, + false ) end elseif _isone(unrolledsteps) # add the step to the index @@ -205,7 +264,7 @@ function addvectoroffset!( indexstride, addexpr(VECTORWIDTHSYMBOL, index), offset, - false, + false ) else addoffset!( @@ -215,7 +274,7 @@ function addvectoroffset!( indexstride, addexpr(mulexpr(VECTORWIDTHSYMBOL, unrolledsteps), index), offset, - false, + false ) end end @@ -230,10 +289,18 @@ function addvectoroffset!( index, offset::Integer, calcbypointeroffset::Bool, - indvectorized::Bool, + indvectorized::Bool ) # 10 -> (7 or 8) args if unrolledsteps == 0 # neither unrolledloopstride or indexstride can be 0 - addoffset!(ret, mm, vloopstride, indexstride, index, offset, calcbypointeroffset) # 7 arg + addoffset!( + ret, + mm, + vloopstride, + indexstride, + index, + offset, + calcbypointeroffset + ) # 7 arg elseif indvectorized unrolledsteps *= indexstride if isknown(unrolledloopstride) @@ -245,7 +312,7 @@ function addvectoroffset!( indexstride, index, offset, - calcbypointeroffset, + calcbypointeroffset ) # 8 arg elseif unrolledsteps == 1 addvectoroffset!( @@ -256,7 +323,7 @@ function addvectoroffset!( indexstride, index, offset, - calcbypointeroffset, + calcbypointeroffset ) # 8 arg else addvectoroffset!( @@ -267,7 +334,7 @@ function addvectoroffset!( indexstride, index, offset, - calcbypointeroffset, + calcbypointeroffset ) # 8 arg end elseif _isone(unrolledloopstride) @@ -278,7 +345,7 @@ function addvectoroffset!( indexstride, index, offset + unrolledsteps, - calcbypointeroffset, + calcbypointeroffset ) # 7 arg else addoffset!( @@ -288,7 +355,7 @@ function addvectoroffset!( mulexpr(unrolledloopstride, indexstride), index, addexpr(offset, lazymulexpr(unrolledloopstride, unrolledsteps)), - calcbypointeroffset, + calcbypointeroffset ) # 7 arg end end @@ -304,7 +371,7 @@ function mem_offset( inds_calc_by_ptr_offset::Vector{Bool}, _mm::Bool, ls::LoopSet, - preserve_vecunroll::Bool, + preserve_vecunroll::Bool ) # @assert accesses_memory(op) "Computing memory offset only makes sense for operations that access memory." ret = Expr(:tuple) @@ -347,7 +414,12 @@ function mem_offset( end function sptr(op::Operation) vp = vptr(op) - Expr(:call, GlobalRef(VectorizationBase, :reconstruct_ptr), vp, vptr_offset(vp)) + Expr( + :call, + GlobalRef(VectorizationBase, :reconstruct_ptr), + vp, + vptr_offset(vp) + ) end function sptr!(q::Expr, op::Operation) vp = vptr(op) @@ -365,7 +437,7 @@ function unrolled_curly( u₁loop::Loop, vloop::Loop, mask::Bool, - interleave::Int = 0, + interleave::Int = 0 ) u₁loopsym = u₁loop.itersymbol vloopsym = vloop.itersymbol @@ -396,14 +468,16 @@ function unrolled_curly( end # if (u₁loopsym === CONSTANTZEROINDEX) ? (CONSTANTZEROINDEX ∈ loopdependencies(opp)) : (isu₁unrolled(opp) || (ind === u₁loopsym)) # can't check isu₁unrolled(opp) because we may be lying. - if (u₁loopsym === CONSTANTZEROINDEX) ? (CONSTANTZEROINDEX ∈ loopdependencies(opp)) : + if (u₁loopsym === CONSTANTZEROINDEX) ? + (CONSTANTZEROINDEX ∈ loopdependencies(opp)) : (u₁loopsym ∈ loopdependencies(opp) || (ind === u₁loopsym)) @assert AU == -1 AU = n end end end - AU == -1 && throw(LoopError("Failed to find $(u₁loopsym) in args of $(repr(op)).")) + AU == -1 && + throw(LoopError("Failed to find $(u₁loopsym) in args of $(repr(op)).")) vecnotunrolled = AU != AV conditional_memory_op = isconditionalmemop(op) if mask || conditional_memory_op @@ -458,7 +532,7 @@ function unrolled_curly( 0, 1, M, - 1, + 1 ) end end @@ -467,10 +541,11 @@ function unrolledindex( td::UnrollArgs, mask::Bool, inds_calc_by_ptr_offset::Vector{Bool}, - ls::LoopSet, + ls::LoopSet ) @unpack u₁, u₁loopsym, u₁loop, vloop = td - isone(u₁) && return mem_offset_u(op, td, inds_calc_by_ptr_offset, true, 0, ls, false) + isone(u₁) && + return mem_offset_u(op, td, inds_calc_by_ptr_offset, true, 0, ls, false) any(==(u₁loopsym), getindicesonly(op)) || return mem_offset_u(op, td, inds_calc_by_ptr_offset, true, 0, ls, true) @@ -486,7 +561,7 @@ function mem_offset_u( _mm::Bool, incr₁::Int, ls::LoopSet, - preserve_vecunroll::Bool, + preserve_vecunroll::Bool ) @assert accesses_memory(op) "Computing memory offset only makes sense for operations that access memory." @unpack u₁loopsym, u₂loopsym, vloopsym, u₁step, u₂step, vstep, suffix = td @@ -500,7 +575,14 @@ function mem_offset_u( # allbasezero = all(inds_calc_by_ptr_offset) && all(iszero, offsets) loopedindex = op.ref.loopedindex if iszero(incr₁) & iszero(incr₂) - return mem_offset(op, td, inds_calc_by_ptr_offset, _mm, ls, preserve_vecunroll) + return mem_offset( + op, + td, + inds_calc_by_ptr_offset, + _mm, + ls, + preserve_vecunroll + ) # append_inds!(ret, indices, loopedindex) else for (n, ind) ∈ enumerate(indices) @@ -522,7 +604,7 @@ function mem_offset_u( ind, offset, ind_by_offset, - indvectorized, + indvectorized ) # 9 arg elseif ind === u₂loopsym addvectoroffset!( @@ -535,10 +617,18 @@ function mem_offset_u( ind, offset, ind_by_offset, - indvectorized, + indvectorized ) # 9 arg elseif loopedindex[n] - addoffset!(ret, indvectorizedmm, vstep, stride, ind, offset, ind_by_offset) # 7 arg + addoffset!( + ret, + indvectorizedmm, + vstep, + stride, + ind, + offset, + ind_by_offset + ) # 7 arg else offset += (stride - 1) # 1 -> 0-based indexing newname, parent = symbolind(ind, op, td, ls) @@ -551,7 +641,7 @@ function mem_offset_u( newname_unmm = Expr( :call, lv(:unmm), - Expr(:call, gf, Expr(:call, gf, newname, 1), incr₁ + 1, false), + Expr(:call, gf, Expr(:call, gf, newname, 1), incr₁ + 1, false) ) else newname_unmm = Expr(:call, lv(:unmm), newname) @@ -562,7 +652,8 @@ function mem_offset_u( addoffset!(ret, 0, newname_unmm, offset, false) elseif (isu₁unrolled(parent) & (td.u₁ > 1)) && !preserve_vecunroll gf = GlobalRef(Core, :getfield) - firstnew = Expr(:call, gf, Expr(:call, gf, newname, 1), incr₁ + 1, false) + firstnew = + Expr(:call, gf, Expr(:call, gf, newname, 1), incr₁ + 1, false) if stride ≠ 1 firstnew = mulexpr(firstnew, stride) end @@ -591,7 +682,6 @@ end q end - isconditionalmemop(op::Operation) = (instruction(op).instr === :conditionalload) || (instruction(op).instr === :conditionalstore!) @@ -601,14 +691,21 @@ function add_memory_mask!( td::UnrollArgs, mask::Bool, ls::LoopSet, - u₁ᵢ::Int, + u₁ᵢ::Int ) @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, u₂max, suffix = td if isconditionalmemop(op) condop = last(parents(op)) opu₂ = (suffix ≠ -1) && isu₂unrolled(op) - condvar, condu₁unrolled = - condvarname_and_unroll(condop, u₁loopsym, u₂loopsym, vloopsym, suffix, opu₂, ls) + condvar, condu₁unrolled = condvarname_and_unroll( + condop, + u₁loopsym, + u₂loopsym, + vloopsym, + suffix, + opu₂, + ls + ) # if it isn't unrolled, then `m` u = condu₁unrolled ? u₁ : 1 # u = isu₁unrolled(condop) ? u₁ : 1 @@ -626,7 +723,10 @@ function add_memory_mask!( if (u₁ᵢ == 0) | (u == 1) push!(memopexpr.args, condvar) else - push!(memopexpr.args, :($getfield($getfield($condvar, 1), $(u₁ᵢ), false))) + push!( + memopexpr.args, + :($getfield($getfield($condvar, 1), $(u₁ᵢ), false)) + ) end elseif (u₁loopsym ≢ vloopsym) | (u₁ == 1) # mask all equivalenetly push!(memopexpr.args, Expr(:call, lv(:&), condvar, MASKSYMBOL)) @@ -648,12 +748,17 @@ function add_memory_mask!( elseif u₁ᵢ == u₁ # mask push!( memopexpr.args, - Expr(:call, lv(:&), :($getfield($getfield(condvar, 1), $u₁ᵢ, false)), MASKSYMBOL), + Expr( + :call, + lv(:&), + :($getfield($getfield(condvar, 1), $u₁ᵢ, false)), + MASKSYMBOL + ) ) else push!( memopexpr.args, - Expr(:call, lv(:&), :($getfield($getfield(condvar, 1), $u₁ᵢ, false))), + Expr(:call, lv(:&), :($getfield($getfield(condvar, 1), $u₁ᵢ, false))) ) end elseif mask && isvectorized(op) @@ -669,7 +774,7 @@ function condvarname_and_unroll( vloop::Symbol, suffix::Int, opu₂::Bool, - ls::LoopSet, + ls::LoopSet ) condvar, condu₁, condu₂ = variable_name_and_unrolled( cond, @@ -677,7 +782,7 @@ function condvarname_and_unroll( u₂loop, vloop, Core.ifelse(opu₂, suffix, -1), - ls, + ls ) condvar, condu₁ end diff --git a/src/codegen/lower_store.jl b/src/codegen/lower_store.jl index fb6862448..58fad9fef 100644 --- a/src/codegen/lower_store.jl +++ b/src/codegen/lower_store.jl @@ -34,7 +34,12 @@ function storeinstr_preprend(op::Operation, vloopsym::Symbol) # end end -function reduce_expr_u₂(toreduct::Symbol, op::Operation, u₂::Int, suffix::Symbol) +function reduce_expr_u₂( + toreduct::Symbol, + op::Operation, + u₂::Int, + suffix::Symbol +) t = Expr(:tuple) for u ∈ 0:u₂-1 push!(t.args, Symbol(toreduct, u, suffix)) @@ -48,11 +53,14 @@ function reduce_expr!( u₁::Int, u₂::Int, isu₁unrolled::Bool, - isu₂unrolled::Bool, + isu₂unrolled::Bool ) if isu₂unrolled# u₂ != -1 _toreduct = Symbol(toreduct, 0) - push!(q.args, Expr(:(=), _toreduct, reduce_expr_u₂(toreduct, op, u₂, Symbol("")))) + push!( + q.args, + Expr(:(=), _toreduct, reduce_expr_u₂(toreduct, op, u₂, Symbol(""))) + ) else#if u₂ == -1 _toreduct = Symbol(toreduct, '_', u₁) end @@ -64,8 +72,8 @@ function reduce_expr!( Expr( :(=), Symbol(toreduct, "##onevec##"), - Expr(:call, reduction_to_single_vector(op), _toreduct), - ), + Expr(:call, reduction_to_single_vector(op), _toreduct) + ) ) else fifelse = let u₁ = u₁ @@ -78,8 +86,8 @@ function reduce_expr!( Expr( :(=), Symbol(toreduct, "##onevec##"), - Expr(:call, fifelse, _toreduct, staticexpr(1)), - ), + Expr(:call, fifelse, _toreduct, staticexpr(1)) + ) ) end nothing @@ -91,7 +99,7 @@ function lower_store_collection!( op::Operation, ua::UnrollArgs, mask::Bool, - inds_calc_by_ptr_offset::Vector{Bool}, + inds_calc_by_ptr_offset::Vector{Bool} ) omop = offsetloadcollection(ls) batchid, _ = omop.batchedcollectionmap[identifier(op)] @@ -124,7 +132,7 @@ function lower_store_collection!( MaybeKnown(1024), MaybeKnown(1), Symbol(""), - Symbol(""), + Symbol("") ) unrollcurl₂ = unrolled_curly(op, nouter, offset_dummy_loop, vloop, mask, 1) inds = mem_offset_u(op, ua, inds_calc_by_ptr_offset, false, 0, ls, false) @@ -136,7 +144,10 @@ function lower_store_collection!( # unrollcurl₂ is unrolled along `first(getindices(op))` by factor of `nouter` # # if isknown(step(u₁loop)) && sum(Base.Fix2(===,u₁loopsym), getindicesonly(op)) == 1 - if (isknown(step(u₁loop)) && sum(Base.Fix2(===, u₁loopsym), getindicesonly(op)) == 1)# && (isone(step(u₁loop)) | (first(getindices(op)) ≢ u₁loopsym)) + if ( + isknown(step(u₁loop)) && + sum(Base.Fix2(===, u₁loopsym), getindicesonly(op)) == 1 + )# && (isone(step(u₁loop)) | (first(getindices(op)) ≢ u₁loopsym)) # if first(getindices(op)) === u₁loopsym#vloopsym # interleaveval = -nouter # else @@ -153,7 +164,8 @@ function lower_store_collection!( end uinds = Expr(:call, unrollcurl₂, inds) sptrsym = sptr!(q, op) - storeexpr = Expr(:call, lv(:_vstore!), sptrsym, Expr(:call, lv(:VecUnroll), t), uinds) + storeexpr = + Expr(:call, lv(:_vstore!), sptrsym, Expr(:call, lv(:VecUnroll), t), uinds) # not using `add_memory_mask!(storeexpr, op, ua, mask, ls)` because we checked `isconditionalmemop` earlier in `lower_load_collection!` u₁vectorized = u₁loopsym === vloopsym if mask# && isvectorized(op)) @@ -190,7 +202,7 @@ function lower_store_collection!( storeexpr_tmp.args[4] = Expr( :call, unrollcurl₂, - mem_offset_u(op, ua, inds_calc_by_ptr_offset, false, u, ls, false), + mem_offset_u(op, ua, inds_calc_by_ptr_offset, false, u, ls, false) ) end push!(q.args, storeexpr_tmp) @@ -208,13 +220,14 @@ function lower_store!( ua::UnrollArgs, mask::Bool, reductfunc::Symbol = storeinstr_preprend(op, ua.vloop.itersymbol), - inds_calc_by_ptr_offset = indices_calculated_by_pointer_offsets(ls, op.ref), + inds_calc_by_ptr_offset = indices_calculated_by_pointer_offsets(ls, op.ref) ) @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, vloop, u₂max, suffix = ua omop = offsetloadcollection(ls) batchid, opind = omop.batchedcollectionmap[identifier(op)] if ((batchid ≠ 0) && isvectorized(op)) && (!rejectinterleave(op)) - (opind == 1) && lower_store_collection!(q, ls, op, ua, mask, inds_calc_by_ptr_offset) + (opind == 1) && + lower_store_collection!(q, ls, op, ua, mask, inds_calc_by_ptr_offset) return end falseexpr = Expr(:call, lv(:False)) @@ -222,7 +235,10 @@ function lower_store!( # trueexpr = Expr(:call, lv(:True)); rs = staticexpr(reg_size(ls)) opp = first(parents(op)) - if ((opp.instruction.instr === reductfunc) || (opp.instruction.instr === :identity)) + if ( + (opp.instruction.instr === reductfunc) || + (opp.instruction.instr === :identity) + ) parents_opp = parents(opp) opppstate = Base.iterate(parents_opp) if opppstate ≢ nothing @@ -267,19 +283,28 @@ function lower_store!( data_u₁ && push!(q.args, Expr(:(=), mvard, Expr(:call, lv(:data), mvar))) sptrsym = sptr!(q, op) for u ∈ 1:u₁ - inds = mem_offset_u(op, ua, inds_calc_by_ptr_offset, true, u - 1, ls, false) + inds = + mem_offset_u(op, ua, inds_calc_by_ptr_offset, true, u - 1, ls, false) storeexpr = if data_u₁ if reductfunc === Symbol("") Expr(:call, lv(:_vstore!), sptrsym, gf(mvard, u), inds) else - Expr(:call, lv(:_vstore!), lv(reductfunc), sptrsym, gf(mvard, u), inds) + Expr( + :call, + lv(:_vstore!), + lv(reductfunc), + sptrsym, + gf(mvard, u), + inds + ) end elseif reductfunc === Symbol("") Expr(:call, lv(:_vstore!), sptrsym, mvar, inds) else Expr(:call, lv(:_vstore!), lv(reductfunc), sptrsym, mvar, inds) end - domask = mask && (isvectorized(op) & ((u == u₁) | (vloopsym !== u₁loopsym))) + domask = + mask && (isvectorized(op) & ((u == u₁) | (vloopsym !== u₁loopsym))) add_memory_mask!(storeexpr, op, ua, domask, ls, u)# & ((u == u₁) | isvectorized(op))) push!(storeexpr.args, falseexpr, aliasexpr, falseexpr, rs) push!(q.args, storeexpr) @@ -307,7 +332,7 @@ function lower_tiled_store!( unrollsyms::UnrollSymbols, u₁::Int, u₂::Int, - mask::Bool, + mask::Bool ) ua = UnrollArgs(ls, u₁, unrollsyms, u₂, 0) for opsv ∈ (opsv1, opsv2) @@ -317,7 +342,12 @@ function lower_tiled_store!( end end -function donot_tile_store(ls::LoopSet, op::Operation, reductfunc::Symbol, u₂::Int) +function donot_tile_store( + ls::LoopSet, + op::Operation, + reductfunc::Symbol, + u₂::Int +) ( (!((reductfunc === Symbol("")) && all(op.ref.loopedindex))) || (u₂ ≤ 1) || @@ -340,7 +370,7 @@ function lower_tiled_store!( ua::UnrollArgs, u₁::Int, u₂::Int, - mask::Bool, + mask::Bool ) @unpack u₁loopsym, u₂loopsym, vloopsym, u₁loop, u₂loop, vloop = ua reductfunc = storeinstr_preprend(op, vloopsym) @@ -352,7 +382,15 @@ function lower_tiled_store!( @unpack u₁, u₂max = ua for t ∈ 0:u₂-1 unrollargs = UnrollArgs(u₁loop, u₂loop, vloop, u₁, u₂max, t) - lower_store!(blockq, ls, op, unrollargs, mask, reductfunc, inds_calc_by_ptr_offset) + lower_store!( + blockq, + ls, + op, + unrollargs, + mask, + reductfunc, + inds_calc_by_ptr_offset + ) end return end @@ -360,8 +398,8 @@ function lower_tiled_store!( if (opp.instruction.instr === reductfunc) && isone(length(parents(opp))) throw( LoopError( - "Operation $opp's instruction is $reductfunc, shouldn't be able to reach here.", - ), + "Operation $opp's instruction is $reductfunc, shouldn't be able to reach here." + ) ) # opp = only(parents(opp)) end diff --git a/src/codegen/lower_threads.jl b/src/codegen/lower_threads.jl index 3f0a86d34..e1c0d770e 100644 --- a/src/codegen/lower_threads.jl +++ b/src/codegen/lower_threads.jl @@ -4,7 +4,7 @@ struct TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV} <: Function end # hopefully shouldn't add much to compile time. function (::TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV})( - p::Ptr{UInt}, + p::Ptr{UInt} ) where {UNROLL,OPS,ARF,AM,LPSYM,K,LBV,FLBV<:Tuple{Vararg{Any,K}}} (_, _vargs) = ThreadingUtilities.load(p, FLBV, 2 * sizeof(UInt)) # Main.VARGS[Threads.threadid()] = first(_vargs) @@ -16,14 +16,14 @@ function (::TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV})( Val{AM}(), Val{LPSYM}(), Val{LBV}(), - _vargs..., + _vargs... ) ThreadingUtilities.store!(p, ret, Int(register_size())) ThreadingUtilities._atomic_store!(p, ThreadingUtilities.SPIN) nothing end @generated function Base.pointer( - ::TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV}, + ::TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV} ) where {UNROLL,OPS,ARF,AM,LPSYM,K,LBV,FLBV<:Tuple{Vararg{Any,K}}} f = TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV}() precompile(f, (Ptr{UInt},)) @@ -36,7 +36,7 @@ end @inline function setup_turbo_threads!( p::Ptr{UInt}, fptr::Ptr{Cvoid}, - args::LBV, + args::LBV ) where {K,LBV<:Tuple{Vararg{Any,K}}} offset = ThreadingUtilities.store!(p, fptr, sizeof(UInt)) offset = ThreadingUtilities.store!(p, args, offset) @@ -54,13 +54,13 @@ struct StaticType{T} end ::Val{LPSYM}, ::StaticType{LBV}, fargs::FARGS, - tid, + tid ) where {UNROLL,OPS,ARF,AM,LPSYM,K,LBV<:Tuple{Vararg{Any,K}},FARGS} ThreadingUtilities.launch( setup_turbo_threads!, tid, pointer(TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FARGS}()), - fargs, + fargs ) end @@ -73,7 +73,11 @@ end t end @inline cld_fast(x, y) = Base.udiv_int(vsub_nw(vadd_nw(x, y), one(y)), y) -@inline function choose_num_blocks(MoW::UInt, ::StaticInt{U}, ::StaticInt{T}) where {U,T} +@inline function choose_num_blocks( + MoW::UInt, + ::StaticInt{U}, + ::StaticInt{T} +) where {U,T} factors = calc_factors(StaticInt{T}()) for i ∈ 1:length(factors)-1 # miter decreases in each iteration of factors @@ -107,7 +111,7 @@ end M::UInt, ::StaticInt{U}, nt, - ::StaticInt{NTMAX}, + ::StaticInt{NTMAX} ) where {U,NTMAX} if NTMAX == 2 # `nt` must be `2` return quote @@ -119,7 +123,7 @@ end ifq = Expr( :if, :(nt == $NTMAX), - :(choose_num_blocks(M, StaticInt{$U}(), StaticInt{$NTMAX}())), + :(choose_num_blocks(M, StaticInt{$U}(), StaticInt{$NTMAX}())) ) add_bisecting_if_branches!(ifq, 2, NTMAX - 1, U, false) push!(q.args, ifq) @@ -151,24 +155,26 @@ scale_cost(c, looplen) = scale_cost(@fastmath c / looplen) @inline function choose_num_threads( C::T, NT::UInt, - x::Base.BitInteger, + x::Base.BitInteger ) where {T<:Union{Float32,Float64}} _choose_num_threads(scale_cost(T(C)), NT, x) end @inline function _choose_num_threads( C::T, NT::UInt, - x::Base.BitInteger, + x::Base.BitInteger ) where {T<:Union{Float32,Float64}} max( min( Base.fptoui( UInt, - Base.ceil_llvm(Base.mul_float_fast(C, Base.sqrt_llvm_fast(Base.uitofp(T, x)))), + Base.ceil_llvm( + Base.mul_float_fast(C, Base.sqrt_llvm_fast(Base.uitofp(T, x))) + ) ), - NT, + NT ), - one(UInt), + one(UInt) ) end function push_loop_length_expr!(q::Expr, ls::LoopSet) @@ -213,11 +219,15 @@ function outer_reduct_combine_expressions(ls::LoopSet, retv) :block, :( var"#load#thread#ret#" = $gf( - ThreadingUtilities.load(var"#thread#ptr#", typeof($retv), $(reg_size(ls))), + ThreadingUtilities.load( + var"#thread#ptr#", + typeof($retv), + $(reg_size(ls)) + ), 2, - false, + false ) - ), + ) ) # push!(q.args, :(@show var"#load#thread#ret#")) for (i, or) ∈ enumerate(ls.outer_reductions) @@ -235,9 +245,16 @@ function outer_reduct_combine_expressions(ls::LoopSet, retv) otherarg = Expr( :call, lv(:vecmemaybe), - Expr(:call, GlobalRef(Core, :getfield), Symbol("#load#thread#ret#"), j, false), + Expr( + :call, + GlobalRef(Core, :getfield), + Symbol("#load#thread#ret#"), + j, + false + ) ) - Expr(:call, lv(:vecmemaybe), Symbol(mangledvar(opv), "##onevec##")), (otherarg,) + Expr(:call, lv(:vecmemaybe), Symbol(mangledvar(opv), "##onevec##")), + (otherarg,) end end Expr(:call, reductexpr) @@ -249,11 +266,14 @@ function outer_reduct_combine_expressions(ls::LoopSet, retv) Expr( :call, lv(:vecmemaybe), - Expr(:call, gf, Symbol("#load#thread#ret#"), i, false), - ), + Expr(:call, gf, Symbol("#load#thread#ret#"), i, false) + ) ) else - push!(instrcall.args, Expr(:call, lv(:vecmemaybe), Symbol("#load#thread#ret#"))) + push!( + instrcall.args, + Expr(:call, lv(:vecmemaybe), Symbol("#load#thread#ret#")) + ) end push!(q.args, Expr(:(=), out, Expr(:call, :data, instrcall))) # push!(q.args, Expr(:(=), out, :(@show $data($instrcall)))) @@ -265,7 +285,7 @@ function thread_loop_summary!( ls::LoopSet, ua::UnrollArgs, threadedloop::Loop, - issecondthreadloop::Bool, + issecondthreadloop::Bool ) W = ls.vector_width @unpack u₁loop, u₂loop, vloop, u₁, u₂max = ua @@ -285,7 +305,7 @@ function thread_loop_summary!( :( $num_unroll_sym = Base.udiv_int( vadd_nw($lensym, $(UInt(unroll_factor - 1))), - $(UInt(unroll_factor)), + $(UInt(unroll_factor)) ) ) end @@ -309,10 +329,13 @@ function thread_loop_summary!( unroll_factor, threadedloop, iterstop_sym, - true, + true ) else - iterstop = :($iterstop_sym::Int = vadd_nsw($iterstart_sym, vmul_nw($blksz_sym, $mf))) + iterstop = :( + $iterstop_sym::Int = + vadd_nsw($iterstart_sym, vmul_nw($blksz_sym, $mf)) + ) looprange = :($iterstart_sym:StaticInt{$mf}()) lastrange = :($iterstart_sym:StaticInt{$mf}()) push_loopbound_ends!( @@ -321,14 +344,16 @@ function thread_loop_summary!( unroll_factor, threadedloop, :(vsub_nsw($iterstop_sym, one($iterstop_sym))), - false, + false ) end else stepthread_sym = Symbol("#step#thread#$threadloopnumtag#") pushpreamble!(ls, :($stepthread_sym = $(getsym(step(threadedloop))))) - iterstop = - :($iterstop_sym = vadd_nsw($iterstart_sym, vmul_nw($blksz_sym, $stepthread_sym))) + iterstop = :( + $iterstop_sym = + vadd_nsw($iterstart_sym, vmul_nw($blksz_sym, $stepthread_sym)) + ) looprange = :($iterstart_sym:$stepthread_sym) lastrange = :($iterstart_sym:$stepthread_sym) push_loopbound_ends!( @@ -337,7 +362,7 @@ function thread_loop_summary!( unroll_factor, threadedloop, :(vsub_nsw($iterstop_sym, one($iterstop_sym))), - false, + false ) end define_len, define_num_unrolls, loopstart, iterstop, looprange, lastrange @@ -347,7 +372,7 @@ function push_last_bound!( lastrange::Expr, lastexpr, iterstop, - unroll_factor::Int, + unroll_factor::Int ) push!(lastrange.args, lastexpr) unroll_factor ≠ 1 && push!(looprange.args, :(min($lastexpr, $iterstop))) @@ -359,7 +384,7 @@ function push_loopbound_ends!( unroll_factor::Int, threadedloop::Loop, iterstop, - offsetlast::Bool, + offsetlast::Bool ) if unroll_factor == 1 push!(looprange.args, iterstop) @@ -370,7 +395,7 @@ function push_loopbound_ends!( lastrange, gethint(last(threadedloop)) + offsetlast, iterstop, - unroll_factor, + unroll_factor ) else lastsym = getsym(last(threadedloop)) @@ -380,7 +405,7 @@ function push_loopbound_ends!( lastrange, :(vadd_nsw($lastsym, one($lastsym))), iterstop, - unroll_factor, + unroll_factor ) else push_last_bound!(looprange, lastrange, lastsym, iterstop, unroll_factor) @@ -398,7 +423,8 @@ function define_block_size(threadedloop, vloop, tn, W) if threadedloop === vloop quote $baseblocksizeuint, $nrem = divrem_fast($num_unroll, $thread_factor) - $baseblocksizeint = ($baseblocksizeuint << $(VectorizationBase.intlog2(W))) % Int + $baseblocksizeint = + ($baseblocksizeuint << $(VectorizationBase.intlog2(W))) % Int $remstep = $(Int(W)) end else @@ -419,7 +445,7 @@ function thread_one_loops_expr( OPS::Expr, ARF::Expr, AM::Expr, - LPSYM::Expr, + LPSYM::Expr ) looplen = looplengthprod(ls) c = scale_cost(c, looplen) @@ -427,11 +453,16 @@ function thread_one_loops_expr( _num_threads = _choose_num_threads(c, ntmax, Int64(looplen))::UInt _num_threads > 1 || return avx_body(ls, UNROLL) ntcallexpr = Expr(:call, %, Expr(:call, Threads.nthreads), UInt) - choose_nthread = - Expr(:(=), Symbol("#nthreads#"), Expr(:call, min, ntcallexpr, _num_threads)) + choose_nthread = Expr( + :(=), + Symbol("#nthreads#"), + Expr(:call, min, ntcallexpr, _num_threads) + ) else - choose_nthread = - :(_choose_num_threads($(Float32(c)), min(Threads.nthreads() % UInt, $ntmax))) + choose_nthread = :(_choose_num_threads( + $(Float32(c)), + min(Threads.nthreads() % UInt, $ntmax) + )) push_loop_length_expr!(choose_nthread, ls) choose_nthread = Expr(:(=), Symbol("#nthreads#"), choose_nthread) end @@ -458,7 +489,7 @@ function thread_one_loops_expr( $AM, $LPSYM, Val(typeof(var"#avx#call#args#")), - flatten_to_tuple(var"#avx#call#args#")..., + flatten_to_tuple(var"#avx#call#args#")... )) update_return_values = if length(ls.outer_reductions) > 0 retv = loopset_return_value(ls, Val(false)) @@ -467,7 +498,8 @@ function thread_one_loops_expr( else nothing end - retexpr = length(ls.outer_reductions) > 0 ? :(return $retv) : :(return nothing) + retexpr = + length(ls.outer_reductions) > 0 ? :(return $retv) : :(return nothing) iterdef = define_block_size(threadedloop, ua.vloop, 0, ls.vector_width) q = quote $choose_nthread # UInt @@ -487,23 +519,27 @@ function thread_one_loops_expr( var"#thread#id#" = 0x00000000 var"##do#thread##" = false for var"#threads#" in var"#threads#tuple#" - var"#thread#launch#count#" = 0x00000000 var"#thread#mask#" = PolyesterWeave.mask(var"#threads#") var"#nrequest#" = length(var"#threads#") var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#" var"##do#thread##" |= var"#threads#remain#" while var"#threads#remain#" - VectorizationBase.assume(var"#thread#mask#" ≠ zero(var"#thread#mask#")) - var"#trailzing#zeros#" = Base.trailing_zeros(var"#thread#mask#") % UInt32 + VectorizationBase.assume( + var"#thread#mask#" ≠ zero(var"#thread#mask#") + ) + var"#trailzing#zeros#" = + Base.trailing_zeros(var"#thread#mask#") % UInt32 var"#nblock#size#thread#0#" = Core.ifelse( var"#thread#launch#count#" < (var"#nrem#thread#0#" % UInt32), vadd_nw(var"#base#block#size#thread#0#", var"#block#rem#step#0#"), - var"#base#block#size#thread#0#", + var"#base#block#size#thread#0#" ) - var"#trailzing#zeros#" = vadd_nw(var"#trailzing#zeros#", 0x00000001) + var"#trailzing#zeros#" = + vadd_nw(var"#trailzing#zeros#", 0x00000001) $iterstop - var"#thread#id#" = vadd_nw(var"#thread#id#", var"#trailzing#zeros#") + var"#thread#id#" = + vadd_nw(var"#thread#id#", var"#trailzing#zeros#") var"##lbvargs#to_launch##" = ($loopboundexpr, var"#vargs#") avx_launch( @@ -514,14 +550,16 @@ function thread_one_loops_expr( $LPSYM, StaticType{typeof(var"##lbvargs#to_launch##")}(), flatten_to_tuple(var"##lbvargs#to_launch##"), - var"#thread#id#", + var"#thread#id#" ) var"#thread#mask#" >>>= var"#trailzing#zeros#" var"#iter#start#0#" = var"#iter#stop#0#" - var"#thread#launch#count#" = vadd_nw(var"#thread#launch#count#", 0x00000001) - var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#" + var"#thread#launch#count#" = + vadd_nw(var"#thread#launch#count#", 0x00000001) + var"#threads#remain#" = + var"#thread#launch#count#" ≠ var"#nrequest#" end var"#nrem#thread#0#" -= var"#nrequest#" end @@ -544,8 +582,10 @@ function thread_one_loops_expr( (var"#thread#mask#" ≠ zero(var"#thread#mask#")) while var"#threads#remain#" VectorizationBase.assume(var"#thread#mask#" ≠ zero(var"#thread#mask#")) - var"#trailzing#zeros#" = - vadd_nw(Base.trailing_zeros(var"#thread#mask#") % UInt32, 0x00000001) + var"#trailzing#zeros#" = vadd_nw( + Base.trailing_zeros(var"#thread#mask#") % UInt32, + 0x00000001 + ) var"#thread#mask#" >>>= var"#trailzing#zeros#" var"#thread#id#" = vadd_nw(var"#thread#id#", var"#trailzing#zeros#") var"#thread#ptr#" = ThreadingUtilities.taskpointer(var"#thread#id#") @@ -570,16 +610,31 @@ function define_vthread_blocks(vloop, u₁loop, u₂loop, u₁, u₂, ntmax, tn) sntmax = staticexpr(ntmax % Int) if vloop === u₁loop :( - $lhs = - _choose_num_blocks($loopunrollname, StaticInt{$u₁}(), var"#nthreads#", $sntmax) + $lhs = _choose_num_blocks( + $loopunrollname, + StaticInt{$u₁}(), + var"#nthreads#", + $sntmax + ) ) elseif vloop === u₂loop :( - $lhs = - _choose_num_blocks($loopunrollname, StaticInt{$u₂}(), var"#nthreads#", $sntmax) + $lhs = _choose_num_blocks( + $loopunrollname, + StaticInt{$u₂}(), + var"#nthreads#", + $sntmax + ) ) else - :($lhs = _choose_num_blocks($loopunrollname, StaticInt{1}(), var"#nthreads#", $sntmax)) + :( + $lhs = _choose_num_blocks( + $loopunrollname, + StaticInt{1}(), + var"#nthreads#", + $sntmax + ) + ) end end function define_thread_blocks( @@ -590,7 +645,7 @@ function define_thread_blocks( u₂loop, u₁, u₂, - ntmax, + ntmax ) if vloop === threadedloop1 define_vthread_blocks(threadedloop1, u₁loop, u₂loop, u₁, u₂, ntmax, 0) @@ -613,7 +668,7 @@ function thread_two_loops_expr( OPS::Expr, ARF::Expr, AM::Expr, - LPSYM::Expr, + LPSYM::Expr ) looplen = looplengthprod(ls) # c = 0.0225 * c / looplen @@ -622,11 +677,16 @@ function thread_two_loops_expr( _num_threads = _choose_num_threads(c, ntmax, Int64(looplen))::UInt _num_threads > 1 || return avx_body(ls, UNROLL) ntcallexpr = Expr(:call, %, Expr(:call, Threads.nthreads), UInt) - choose_nthread = - Expr(:(=), Symbol("#nthreads#"), Expr(:call, min, ntcallexpr, _num_threads)) + choose_nthread = Expr( + :(=), + Symbol("#nthreads#"), + Expr(:call, min, ntcallexpr, _num_threads) + ) else - choose_nthread = - :(_choose_num_threads($(Float32(c)), min(Threads.nthreads() % UInt, $ntmax))) + choose_nthread = :(_choose_num_threads( + $(Float32(c)), + min(Threads.nthreads() % UInt, $ntmax) + )) push_loop_length_expr!(choose_nthread, ls) choose_nthread = Expr(:(=), Symbol("#nthreads#"), choose_nthread) end @@ -644,10 +704,18 @@ function thread_two_loops_expr( u₂ = u₂max threadedloop1 = getloop(ls, threadedid1) threadedloop2 = getloop(ls, threadedid2) - define_len1, define_num_unrolls1, loopstart1, iterstop1, looprange1, lastrange1 = - thread_loop_summary!(ls, ua, threadedloop1, false) - define_len2, define_num_unrolls2, loopstart2, iterstop2, looprange2, lastrange2 = - thread_loop_summary!(ls, ua, threadedloop2, true) + define_len1, + define_num_unrolls1, + loopstart1, + iterstop1, + looprange1, + lastrange1 = thread_loop_summary!(ls, ua, threadedloop1, false) + define_len2, + define_num_unrolls2, + loopstart2, + iterstop2, + looprange2, + lastrange2 = thread_loop_summary!(ls, ua, threadedloop2, true) loopboundexpr = Expr(:tuple) lastboundexpr = Expr(:tuple) for loop ∈ ls.loops @@ -670,7 +738,7 @@ function thread_two_loops_expr( $AM, $LPSYM, Val(typeof(var"#avx#call#args#")), - flatten_to_tuple(var"#avx#call#args#")..., + flatten_to_tuple(var"#avx#call#args#")... )) update_return_values = if length(ls.outer_reductions) > 0 retv = loopset_return_value(ls, Val(false)) @@ -679,11 +747,20 @@ function thread_two_loops_expr( else nothing end - blockdef = - define_thread_blocks(threadedloop1, threadedloop2, vloop, u₁loop, u₂loop, u₁, u₂, ntmax) + blockdef = define_thread_blocks( + threadedloop1, + threadedloop2, + vloop, + u₁loop, + u₂loop, + u₁, + u₂, + ntmax + ) iterdef1 = define_block_size(threadedloop1, vloop, 0, ls.vector_width) iterdef2 = define_block_size(threadedloop2, vloop, 1, ls.vector_width) - retexpr = length(ls.outer_reductions) > 0 ? :(return $retv) : :(return nothing) + retexpr = + length(ls.outer_reductions) > 0 ? :(return $retv) : :(return nothing) q = quote $choose_nthread # UInt $loopstart1 @@ -701,32 +778,38 @@ function thread_two_loops_expr( var"#thread#factor#0#" = var"#num#unrolls#thread#0#" var"#thread#factor#1#" = var"#num#unrolls#thread#1#" else - var"##thread#0##excess##" = var"#num#unrolls#thread#0#" ≥ var"#nthreads#" - var"##thread#1##excess##" = var"#num#unrolls#thread#1#" ≥ var"#nthreads#" + var"##thread#0##excess##" = + var"#num#unrolls#thread#0#" ≥ var"#nthreads#" + var"##thread#1##excess##" = + var"#num#unrolls#thread#1#" ≥ var"#nthreads#" if var"##thread#0##excess##" & var"##thread#1##excess##" $blockdef elseif var"##thread#0##excess##" # var"#num#unrolls#thread#1#" is small but var"#num#unrolls#thread#0#" is not; we want to place a small one in front - (var"#thread#factor#1#", var"#thread#factor#0#") = _choose_num_blocks( - var"#num#unrolls#thread#1#", - StaticInt{1}(), - var"#nthreads#", - $(staticexpr(ntmax % Int)), - ) + (var"#thread#factor#1#", var"#thread#factor#0#") = + _choose_num_blocks( + var"#num#unrolls#thread#1#", + StaticInt{1}(), + var"#nthreads#", + $(staticexpr(ntmax % Int)) + ) else # var"#num#unrolls#thread#0#" is small, and var"#num#unrolls#thread#1#" may or may not be - (var"#thread#factor#0#", var"#thread#factor#1#") = _choose_num_blocks( - var"#num#unrolls#thread#0#", - StaticInt{1}(), - var"#nthreads#", - $(staticexpr(ntmax % Int)), - ) + (var"#thread#factor#0#", var"#thread#factor#1#") = + _choose_num_blocks( + var"#num#unrolls#thread#0#", + StaticInt{1}(), + var"#nthreads#", + $(staticexpr(ntmax % Int)) + ) end - var"#thread#factor#0#" = min(var"#thread#factor#0#", var"#num#unrolls#thread#0#") - var"#thread#factor#1#" = min(var"#thread#factor#1#", var"#num#unrolls#thread#1#") + var"#thread#factor#0#" = + min(var"#thread#factor#0#", var"#num#unrolls#thread#0#") + var"#thread#factor#1#" = + min(var"#thread#factor#1#", var"#num#unrolls#thread#1#") end # @show (var"#thread#factor#0#", var"#thread#factor#1#") var"#nrequest#" = vsub_nsw( vmul_nsw(var"#thread#factor#0#", var"#thread#factor#1#" % UInt32), - 0x00000001, + 0x00000001 ) var"#loop#1#start#init#" = var"#iter#start#0#" var"##do#thread##" = var"#nrequest#" ≠ 0x00000000 @@ -748,22 +831,27 @@ function thread_two_loops_expr( var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#" var"##do#thread##" |= var"#threads#remain#" while var"#threads#remain#" - VectorizationBase.assume(var"#thread#mask#" ≠ zero(var"#thread#mask#")) - var"#trailzing#zeros#" = Base.trailing_zeros(var"#thread#mask#") % UInt32 + VectorizationBase.assume( + var"#thread#mask#" ≠ zero(var"#thread#mask#") + ) + var"#trailzing#zeros#" = + Base.trailing_zeros(var"#thread#mask#") % UInt32 var"#nblock#size#thread#0#" = Core.ifelse( var"#thread#launch#count#0#" < (var"#nrem#thread#0#" % UInt32), vadd_nw(var"#base#block#size#thread#0#", var"#block#rem#step#0#"), - var"#base#block#size#thread#0#", + var"#base#block#size#thread#0#" ) var"#nblock#size#thread#1#" = Core.ifelse( var"#thread#launch#count#1#" < (var"#nrem#thread#1#" % UInt32), vadd_nw(var"#base#block#size#thread#1#", var"#block#rem#step#1#"), - var"#base#block#size#thread#1#", + var"#base#block#size#thread#1#" ) - var"#trailzing#zeros#" = vadd_nw(var"#trailzing#zeros#", 0x00000001) + var"#trailzing#zeros#" = + vadd_nw(var"#trailzing#zeros#", 0x00000001) $iterstop1 $iterstop2 - var"#thread#id#" = vadd_nw(var"#thread#id#", var"#trailzing#zeros#") + var"#thread#id#" = + vadd_nw(var"#thread#id#", var"#trailzing#zeros#") # @show var"#thread#id#" $loopboundexpr var"##lbvargs#to_launch##" = ($loopboundexpr, var"#vargs#") avx_launch( @@ -774,33 +862,39 @@ function thread_two_loops_expr( $LPSYM, StaticType{typeof(var"##lbvargs#to_launch##")}(), flatten_to_tuple(var"##lbvargs#to_launch##"), - var"#thread#id#", + var"#thread#id#" ) var"#thread#mask#" >>>= var"#trailzing#zeros#" var"##end#inner##" = - var"#thread#launch#count#0#" == vsub_nw(var"#thread#factor#0#", 0x00000001) + var"#thread#launch#count#0#" == + vsub_nw(var"#thread#factor#0#", 0x00000001) var"#thread#launch#count#0#" = Core.ifelse( var"##end#inner##", 0x00000000, - vadd_nw(var"#thread#launch#count#0#", 0x00000001), + vadd_nw(var"#thread#launch#count#0#", 0x00000001) ) var"#thread#launch#count#1#" = Core.ifelse( var"##end#inner##", var"#thread#launch#count#1#" + 0x00000001, - var"#thread#launch#count#1#", + var"#thread#launch#count#1#" ) var"#iter#start#0#" = Core.ifelse( var"##end#inner##", var"#loop#1#start#init#", - var"#iter#stop#0#", + var"#iter#stop#0#" + ) + var"#iter#start#1#" = Core.ifelse( + var"##end#inner##", + var"#iter#stop#1#", + var"#iter#start#1#" ) - var"#iter#start#1#" = - Core.ifelse(var"##end#inner##", var"#iter#stop#1#", var"#iter#start#1#") - var"#thread#launch#count#" = vadd_nw(var"#thread#launch#count#", 0x00000001) - var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#" + var"#thread#launch#count#" = + vadd_nw(var"#thread#launch#count#", 0x00000001) + var"#threads#remain#" = + var"#thread#launch#count#" ≠ var"#nrequest#" end end else# eliminate undef var errors that the compiler should be able to figure out are unreachable, but doesn't @@ -822,8 +916,10 @@ function thread_two_loops_expr( (var"#thread#mask#" ≠ zero(var"#thread#mask#")) while var"#threads#remain#" VectorizationBase.assume(var"#thread#mask#" ≠ zero(var"#thread#mask#")) - var"#trailzing#zeros#" = - vadd_nw(Base.trailing_zeros(var"#thread#mask#") % UInt32, 0x00000001) + var"#trailzing#zeros#" = vadd_nw( + Base.trailing_zeros(var"#thread#mask#") % UInt32, + 0x00000001 + ) var"#thread#mask#" >>>= var"#trailzing#zeros#" var"#thread#id#" = vadd_nw(var"#thread#id#", var"#trailzing#zeros#") var"#thread#ptr#" = ThreadingUtilities.taskpointer(var"#thread#id#") @@ -839,7 +935,8 @@ function thread_two_loops_expr( end function valid_thread_loops(ls::LoopSet) - order, u₁loop, u₂loop, vectorized, u₁, u₂, c, shouldinline = choose_order_cost(ls) + order, u₁loop, u₂loop, vectorized, u₁, u₂, c, shouldinline = + choose_order_cost(ls) # NOTE: `names` are being placed in the opposite order here versus normal lowering! copyto!(names(ls), order) init_loop_map!(ls) @@ -877,15 +974,37 @@ function avx_threads_expr( OPS::Expr, ARF::Expr, AM::Expr, - LPSYM::Expr, + LPSYM::Expr ) valid_thread_loop, ua, c = valid_thread_loops(ls) num_candiates = sum(valid_thread_loop) if (num_candiates == 0) || (nt ≤ 1) # it was called from `avx_body` but now `nt` was set to `1` avx_body(ls, UNROLL) elseif (num_candiates == 1) || (nt ≤ 3) - thread_one_loops_expr(ls, ua, valid_thread_loop, nt, c, UNROLL, OPS, ARF, AM, LPSYM) + thread_one_loops_expr( + ls, + ua, + valid_thread_loop, + nt, + c, + UNROLL, + OPS, + ARF, + AM, + LPSYM + ) else # requires at least 4 threads - thread_two_loops_expr(ls, ua, valid_thread_loop, nt, c, UNROLL, OPS, ARF, AM, LPSYM) + thread_two_loops_expr( + ls, + ua, + valid_thread_loop, + nt, + c, + UNROLL, + OPS, + ARF, + AM, + LPSYM + ) end end diff --git a/src/codegen/lowering.jl b/src/codegen/lowering.jl index c857d51cf..a058bc0b3 100644 --- a/src/codegen/lowering.jl +++ b/src/codegen/lowering.jl @@ -1,5 +1,4 @@ - # the `lowernonstore` and `lowerstore` options are there as a means of lowering all non-store operations before lowering the stores. function lower!( q::Expr, @@ -11,7 +10,7 @@ function lower!( suffix::Int, mask::Bool, lowernonstore::Bool, - lowerstore::Bool, + lowerstore::Bool ) ua = UnrollArgs(ls, u₁, unrollsyms, u₂, suffix) for op ∈ ops @@ -37,7 +36,13 @@ end function isu₂invalidstorereorder(ls::LoopSet, us::UnrollSpecification) us.u₂ == -1 ? false : ls.validreorder[ls.loopordermap[us.u₂loopnum]] ≠ 0x03 end -function lower_block(ls::LoopSet, us::UnrollSpecification, n::Int, mask::Bool, UF::Int) +function lower_block( + ls::LoopSet, + us::UnrollSpecification, + n::Int, + mask::Bool, + UF::Int +) @unpack u₁loopnum, u₂loopnum, vloopnum, u₁, u₂ = us ops = oporder(ls) order = names(ls) @@ -51,7 +56,18 @@ function lower_block(ls::LoopSet, us::UnrollSpecification, n::Int, mask::Bool, U cannot_reorder_u₂ = isu₂invalidstorereorder(ls, us) for prepost ∈ 1:2 # !u₁ && !u₂ - lower!(blockq, ops[1, 1, prepost, n], ls, unrollsyms, u₁, u₂, -1, mask, true, true) + lower!( + blockq, + ops[1, 1, prepost, n], + ls, + unrollsyms, + u₁, + u₂, + -1, + mask, + true, + true + ) # isu₁unrolled, isu₂unrolled, after_loop, n opsv1 = ops[1, 2, prepost, n] opsv2 = ops[2, 2, prepost, n] @@ -61,7 +77,18 @@ function lower_block(ls::LoopSet, us::UnrollSpecification, n::Int, mask::Bool, U iszero(length(opsv2)) || (nstores += sum(isstore, opsv2)) # if nstores if (length(opsv1) + length(opsv2) == nstores) && u₂ > 1 # all_u₂_ops_store - lower!(blockq, ops[2, 1, prepost, n], ls, unrollsyms, u₁, u₂, -1, mask, true, true) # for u ∈ 0:u₁-1 + lower!( + blockq, + ops[2, 1, prepost, n], + ls, + unrollsyms, + u₁, + u₂, + -1, + mask, + true, + true + ) # for u ∈ 0:u₁-1 lower_tiled_store!(blockq, opsv1, opsv2, ls, unrollsyms, u₁, u₂, mask) else for store ∈ (false, true) @@ -84,7 +111,7 @@ function lower_block(ls::LoopSet, us::UnrollSpecification, n::Int, mask::Bool, U t, mask & !(dontmaskfirsttiles & (t < u₂ - 1)), lowernonstore, - lowerstore, + lowerstore ) if iszero(t) && !store # u₁ && !u₂ # for u ∈ 0:u₁-1 @@ -98,7 +125,7 @@ function lower_block(ls::LoopSet, us::UnrollSpecification, n::Int, mask::Bool, U -1, mask, true, - true, + true ) # end end @@ -114,7 +141,7 @@ function lower_block(ls::LoopSet, us::UnrollSpecification, n::Int, mask::Bool, U t, mask & !(dontmaskfirsttiles & (t < u₂ - 1)), lowernonstore, - lowerstore, + lowerstore ) # end end @@ -122,11 +149,44 @@ function lower_block(ls::LoopSet, us::UnrollSpecification, n::Int, mask::Bool, U end end elseif cannot_reorder_u₂ - lower!(blockq, ops[2, 1, prepost, n], ls, unrollsyms, u₁, u₂, -1, mask, true, true) + lower!( + blockq, + ops[2, 1, prepost, n], + ls, + unrollsyms, + u₁, + u₂, + -1, + mask, + true, + true + ) else # for u ∈ 0:u₁-1 # u₁ && !u₂ - lower!(blockq, ops[2, 1, prepost, n], ls, unrollsyms, u₁, u₂, -1, mask, true, false) - lower!(blockq, ops[2, 1, prepost, n], ls, unrollsyms, u₁, u₂, -1, mask, false, true) + lower!( + blockq, + ops[2, 1, prepost, n], + ls, + unrollsyms, + u₁, + u₂, + -1, + mask, + true, + false + ) + lower!( + blockq, + ops[2, 1, prepost, n], + ls, + unrollsyms, + u₁, + u₂, + -1, + mask, + false, + true + ) # end end if n > 1 && prepost == 1 @@ -177,7 +237,12 @@ function allinteriorunrolled(ls::LoopSet, us::UnrollSpecification, N) unroll_total ≤ 16 end -function lower_no_unroll(ls::LoopSet, us::UnrollSpecification, n::Int, inclmask::Bool) +function lower_no_unroll( + ls::LoopSet, + us::UnrollSpecification, + n::Int, + inclmask::Bool +) nisvectorized = isvectorized(us, n) loop = getloop(ls, n) tc = terminatecondition(ls, us, n, inclmask, 1) @@ -224,7 +289,7 @@ function lower_unrolled_dynamic( ls::LoopSet, us::UnrollSpecification, n::Int, - inclmask::Bool, + inclmask::Bool ) UF = unrollfactor(us, n) isone(UF) && return lower_no_unroll(ls, us, n, inclmask) @@ -240,7 +305,8 @@ function lower_unrolled_dynamic( if W ≠ 0 & isknown(first(loop)) & isknown(step(loop)) loopisstatic = isknown(last(loop)) # something other than the default hint currently means an UpperBoundedInteger was passed as an argument - loopisbounded = (looplength < UFW) & (loopisstatic | (gethint(last(loop)) ≠ 1024)) + loopisbounded = + (looplength < UFW) & (loopisstatic | (gethint(last(loop)) ≠ 1024)) else loopisstatic = false loopisbounded = false @@ -251,14 +317,19 @@ function lower_unrolled_dynamic( UFWnew = cld(looplength, cld(looplength, UFW)) UF = cld(UFWnew, W) UFW = UF * W - us = nisunrolled ? UnrollSpecification(us, UF, u₂) : UnrollSpecification(us, u₁, UF) + us = + nisunrolled ? UnrollSpecification(us, UF, u₂) : + UnrollSpecification(us, u₁, UF) end remmask = inclmask | nisvectorized sl = startloop(ls, us, n, false) UFt = loopisstatic ? cld(looplength % UFW, W) : 1 # Don't place remainder first if we're going to have to mask this loop (i.e., if this loop is vectorized) remfirst = - loopisstatic & (!nisvectorized) & (UFt > 0) & !(unsigned(Ureduct) < unsigned(UF)) + loopisstatic & + (!nisvectorized) & + (UFt > 0) & + !(unsigned(Ureduct) < unsigned(UF)) tc = terminatecondition(ls, us, n, inclmask, remfirst ? 1 : UF) # Don't need to create the body if loop is dynamic and bounded dynamicbounded = ((!loopisstatic) & loopisbounded) @@ -322,19 +393,30 @@ function lower_unrolled_dynamic( if length(loop) < UF * W Expr(:block) else - Expr(:block, add_upper_outer_reductions(ls, q, Ureduct, UF, loop, nisvectorized)) + Expr( + :block, + add_upper_outer_reductions(ls, q, Ureduct, UF, loop, nisvectorized) + ) end else - Expr(:block, add_upper_outer_reductions(ls, q, Ureduct, UF, loop, nisvectorized)) + Expr( + :block, + add_upper_outer_reductions(ls, q, Ureduct, UF, loop, nisvectorized) + ) end if add_cleanup cleanup_expr = Expr(blockhead) - blockhead === :block || - push!(cleanup_expr.args, terminatecondition(ls, us, n, inclmask, UF_cleanup)) + blockhead === :block || push!( + cleanup_expr.args, + terminatecondition(ls, us, n, inclmask, UF_cleanup) + ) us_cleanup = nisunrolled ? UnrollSpecification(us, UF_cleanup, u₂) : UnrollSpecification(us, u₁, UF_cleanup) - push!(cleanup_expr.args, lower_block(ls, us_cleanup, n, inclmask, UF_cleanup)) + push!( + cleanup_expr.args, + lower_block(ls, us_cleanup, n, inclmask, UF_cleanup) + ) push!(_q.args, cleanup_expr) end UFt > 0 && push!(_q.args, remblock) @@ -357,13 +439,19 @@ function lower_unrolled_dynamic( UF = rem_uf >> 1 UFt = rem_uf - UF ust = - nisunrolled ? UnrollSpecification(us, UFt, u₂) : UnrollSpecification(us, u₁, UFt) + nisunrolled ? UnrollSpecification(us, UFt, u₂) : + UnrollSpecification(us, u₁, UFt) newblock = lower_block(ls, ust, n, remmask, UFt) # comparison = unrollremcomparison(ls, loop, UFt, n, nisvectorized, remfirst) comparison = terminatecondition(ls, us, n, inclmask, UFt) UFt = 1 UF += 1 - iseven(rem_uf) - Expr(:block, q, Expr(iseven(rem_uf) ? :while : :if, comparison, newblock), remblock) + Expr( + :block, + q, + Expr(iseven(rem_uf) ? :while : :if, comparison, newblock), + remblock + ) else Expr(:block, q, remblock) end @@ -372,7 +460,8 @@ function lower_unrolled_dynamic( iforelseif = :if while true ust = - nisunrolled ? UnrollSpecification(us, UFt, u₂) : UnrollSpecification(us, u₁, UFt) + nisunrolled ? UnrollSpecification(us, UFt, u₂) : + UnrollSpecification(us, u₁, UFt) newblock = lower_block(ls, ust, n, remmask, UFt) if (UFt ≥ UF - 1 + nisvectorized) || UFt == Ureduct || loopisstatic if isone(num_loops(ls)) && isone(UFt) && isone(Ureduct) @@ -381,10 +470,14 @@ function lower_unrolled_dynamic( push!(remblock.args, newblock) break end - comparison = unrollremcomparison(ls, loop, UFt, n, nisvectorized, remfirst) + comparison = + unrollremcomparison(ls, loop, UFt, n, nisvectorized, remfirst) if isone(num_loops(ls)) && isone(UFt) remblocknew = Expr(:if, comparison, newblock) - push!(remblock.args, Expr(:block, Expr(:let, definemask(loop), remblocknew))) + push!( + remblock.args, + Expr(:block, Expr(:let, definemask(loop), remblocknew)) + ) remblock = remblocknew else remblocknew = Expr(iforelseif, comparison, newblock) @@ -408,7 +501,7 @@ function lower_unrolled_dynamic( ls, order[u₁loopnum], order[us.u₂loopnum], - vectorized, + vectorized ) Expr(:block, pre, Expr(:let, sl, q), post) else @@ -421,7 +514,7 @@ function unrollremcomparison( UFt::Int, n::Int, nisvectorized::Bool, - remfirst::Bool, + remfirst::Bool ) termind = ls.lssm.terminators[n] if iszero(termind) @@ -430,7 +523,12 @@ function unrollremcomparison( pointerremcomparison(ls, termind, UFt, n, nisvectorized, remfirst, loop) end end -function loopvarremcomparison(loop::Loop, UFt::Int, nisvectorized::Bool, remfirst::Bool) +function loopvarremcomparison( + loop::Loop, + UFt::Int, + nisvectorized::Bool, + remfirst::Bool +) loopsym = loop.itersymbol loopstep = loop.step if nisvectorized @@ -442,7 +540,7 @@ function loopvarremcomparison(loop::Loop, UFt::Int, nisvectorized::Bool, remfirs :call, GlobalRef(Base, :<), loopsym, - gethint(first(loop)) + UFt * gethint(loopstep) - 1, + gethint(first(loop)) + UFt * gethint(loopstep) - 1 ) elseif isknown(last(loop)) if isknown(loopstep) @@ -450,21 +548,21 @@ function loopvarremcomparison(loop::Loop, UFt::Int, nisvectorized::Bool, remfirs :call, GlobalRef(Base, :>), loopsym, - gethint(last(loop)) - UFt * gethint(loopstep), + gethint(last(loop)) - UFt * gethint(loopstep) ) elseif isone(UFt) Expr( :call, GlobalRef(Base, :>), loopsym, - subexpr(gethint(last(loop)), getsym(loopstep)), + subexpr(gethint(last(loop)), getsym(loopstep)) ) else Expr( :call, GlobalRef(Base, :>), loopsym, - subexpr(gethint(last(loop)), mulexpr(getsym(loopstep), UFt)), + subexpr(gethint(last(loop)), mulexpr(getsym(loopstep), UFt)) ) end else @@ -473,21 +571,26 @@ function loopvarremcomparison(loop::Loop, UFt::Int, nisvectorized::Bool, remfirs :call, GlobalRef(Base, :>), loopsym, - Expr(:call, lv(:vsub_nsw), getsym(last(loop)), UFt * gethint(loopstep)), + Expr(:call, lv(:vsub_nsw), getsym(last(loop)), UFt * gethint(loopstep)) ) elseif isone(UFt) Expr( :call, GlobalRef(Base, :>), loopsym, - Expr(:call, lv(:vsub_nsw), getsym(last(loop)), getsym(loopstep)), + Expr(:call, lv(:vsub_nsw), getsym(last(loop)), getsym(loopstep)) ) else Expr( :call, GlobalRef(Base, :>), loopsym, - Expr(:call, lv(:vsub_nsw), getsym(last(loop)), mulexpr(getsym(loopstep), UFt)), + Expr( + :call, + lv(:vsub_nsw), + getsym(last(loop)), + mulexpr(getsym(loopstep), UFt) + ) ) end end @@ -499,7 +602,7 @@ function pointerremcomparison( n::Int, nisvectorized::Bool, remfirst::Bool, - loop::Loop, + loop::Loop ) lssm = ls.lssm termar = lssm.incrementedptrs[n][termind] @@ -508,14 +611,24 @@ function pointerremcomparison( ptroff = vptr_offset(ptr) if remfirst cmp = GlobalRef(VectorizationBase, :vlt) - Expr(:call, cmp, ptroff, pointermax(ls, ptrdef, n, 1 - UFt, nisvectorized, loop), ptr) + Expr( + :call, + cmp, + ptroff, + pointermax(ls, ptrdef, n, 1 - UFt, nisvectorized, loop), + ptr + ) else cmp = GlobalRef(VectorizationBase, :vge) Expr(:call, cmp, ptroff, maxsym(ptr, UFt), ptr) end end -@generated function of_same_size(::Type{T}, ::Type{S}, ::StaticInt{R}) where {T,S,R} +@generated function of_same_size( + ::Type{T}, + ::Type{S}, + ::StaticInt{R} +) where {T,S,R} sizeof_S = sizeof(S) sizeof_T = sizeof(T) if T <: Integer @@ -546,7 +659,8 @@ end of_same_size( T, S, - VectorizationBase.register_size() ÷ VectorizationBase.simd_integer_register_size(), + VectorizationBase.register_size() ÷ + VectorizationBase.simd_integer_register_size() ) end function outer_reduction_zero( @@ -554,7 +668,7 @@ function outer_reduction_zero( u₁u::Bool, Umax::Int, reduct_class::Float64, - rs::Union{Expr,StaticInt}, + rs::Union{Expr,StaticInt} ) isifelse = instruction(op).instr === :ifelse reduct_zero = if isifelse @@ -565,7 +679,8 @@ function outer_reduction_zero( end # Tsym = outer_reduct_init_typename(op) # Tsym = ELTYPESYMBOL - Tsym = Expr(:call, lv(:of_same_size), outer_reduct_init_typename(op), ELTYPESYMBOL) + Tsym = + Expr(:call, lv(:of_same_size), outer_reduct_init_typename(op), ELTYPESYMBOL) if isvectorized(op) if Umax == 1 || !u₁u if reduct_zero === :zero @@ -578,12 +693,19 @@ function outer_reduction_zero( lv(:_vbroadcast), VECTORWIDTHSYMBOL, Expr(:call, lv(reduct_zero), Tsym), - rs, + rs ) end else if reduct_zero === :zero - Expr(:call, lv(:zero_vecunroll), staticexpr(Umax), VECTORWIDTHSYMBOL, Tsym, rs) + Expr( + :call, + lv(:zero_vecunroll), + staticexpr(Umax), + VECTORWIDTHSYMBOL, + Tsym, + rs + ) elseif isifelse Expr( :call, @@ -591,7 +713,7 @@ function outer_reduction_zero( staticexpr(Umax), VECTORWIDTHSYMBOL, reduct_zero, - rs, + rs ) else Expr( @@ -600,7 +722,7 @@ function outer_reduction_zero( staticexpr(Umax), VECTORWIDTHSYMBOL, Expr(:call, reduct_zero, Tsym), - rs, + rs ) end end @@ -618,7 +740,7 @@ function initialize_outer_reductions!( op::Operation, _Umax::Int, us::UnrollSpecification, - rs::Union{Expr,StaticInt}, + rs::Union{Expr,StaticInt} ) @unpack u₁, u₂ = us Umax = u₂ == -1 ? _Umax : u₁ @@ -628,9 +750,15 @@ function initialize_outer_reductions!( getloop(ls, us.u₁loopnum).itersymbol, getloop(ls, us.u₂loopnum).itersymbol, getloop(ls, us.vloopnum).itersymbol, - ls, + ls )#, u₂) - z = outer_reduction_zero(op, u₁u, Umax, reduction_instruction_class(instruction(op)), rs) + z = outer_reduction_zero( + op, + u₁u, + Umax, + reduction_instruction_class(instruction(op)), + rs + ) mvar = variable_name(op, -1) if (u₂ == -1) push!(q.args, Expr(:(=), Symbol(mvar, '_', _Umax), z)) @@ -645,7 +773,11 @@ function initialize_outer_reductions!( end nothing end -function initialize_outer_reductions!(q::Expr, ls::LoopSet, Umax::Union{Int,StaticInt}) +function initialize_outer_reductions!( + q::Expr, + ls::LoopSet, + Umax::Union{Int,StaticInt} +) rs = staticexpr(reg_size(ls)) us = ls.unrollspecification for or ∈ ls.outer_reductions @@ -655,18 +787,22 @@ end initialize_outer_reductions!(ls::LoopSet, Umax::Int) = initialize_outer_reductions!(ls.preamble, ls, Umax) function add_upper_comp_check(unrolledloop, loopbuffer) - if isstaticloop(unrolledloop) Expr(:call, Base.GlobalRef(Base, :≥), length(unrolledloop), loopbuffer) elseif isknown(first(unrolledloop)) if isone(first(unrolledloop)) - Expr(:call, Base.GlobalRef(Base, :≥), getsym(last(unrolledloop)), loopbuffer) + Expr( + :call, + Base.GlobalRef(Base, :≥), + getsym(last(unrolledloop)), + loopbuffer + ) else Expr( :call, Base.GlobalRef(Base, :≥), getsym(last(unrolledloop)), - addexpr(loopbuffer, gethint(first(unrolledloop)) - 1), + addexpr(loopbuffer, gethint(first(unrolledloop)) - 1) ) end elseif isknown(last(unrolledloop)) @@ -677,9 +813,9 @@ function add_upper_comp_check(unrolledloop, loopbuffer) :call, lv(:vsub_nsw), gethint(last(unrolledloop)) + 1, - getsym(first(unrolledloop)), + getsym(first(unrolledloop)) ), - loopbuffer, + loopbuffer ) else# both are given by symbols Expr( @@ -689,9 +825,9 @@ function add_upper_comp_check(unrolledloop, loopbuffer) :call, lv(:vsub_nsw), getsym(last(unrolledloop)), - Expr(:call, lv(:vsub_nsw), getsym(first(unrolledloop)), staticexpr(1)), + Expr(:call, lv(:vsub_nsw), getsym(first(unrolledloop)), staticexpr(1)) ), - loopbuffer, + loopbuffer ) end end @@ -701,7 +837,7 @@ function add_upper_outer_reductions( Ulow::Int, Uhigh::Int, unrolledloop::Loop, - reductisvectorized::Bool, + reductisvectorized::Bool ) ifq = Expr(:block) ifqlet = Expr(:block) @@ -737,7 +873,10 @@ function add_upper_outer_reductions( end end ncomparison = if reductisvectorized - add_upper_comp_check(unrolledloop, mulexpr(VECTORWIDTHSYMBOL, Uhigh, step(unrolledloop))) + add_upper_comp_check( + unrolledloop, + mulexpr(VECTORWIDTHSYMBOL, Uhigh, step(unrolledloop)) + ) elseif isknown(step(unrolledloop)) add_upper_comp_check(unrolledloop, Uhigh * gethint(step(unrolledloop))) else @@ -776,8 +915,13 @@ function reduce_expr!(q::Expr, ls::LoopSet, U::Int) Expr( :(=), var, - Expr(:call, reduction_scalar_combine(op), Symbol(mvar, "##onevec##"), var), - ), + Expr( + :call, + reduction_scalar_combine(op), + Symbol(mvar, "##onevec##"), + var + ) + ) ) else reductexpr = ifelse_reduction(:IfElseReduced, op) do opv @@ -785,7 +929,11 @@ function reduce_expr!(q::Expr, ls::LoopSet, U::Int) end push!( q.args, - Expr(:(=), var, Expr(:call, reductexpr, Symbol(mvar, "##onevec##"), var)), + Expr( + :(=), + var, + Expr(:call, reductexpr, Symbol(mvar, "##onevec##"), var) + ) ) end else @@ -801,7 +949,7 @@ function reinit_push_preblockpost!( post::Expr, z::Expr, s::Symbol, - reduct::Symbol, + reduct::Symbol ) push!(letblock.args, Expr(:(=), s, z)) tempsym = gensym(s) # placeholder @@ -816,7 +964,7 @@ function reinit_and_update_tiled_outer_reduct!( ls::LoopSet, u₁loopsym::Symbol, u₂loopsym::Symbol, - vloopsym::Symbol, + vloopsym::Symbol ) rs = staticexpr(reg_size(ls)) usorig = ls.unrollspecification @@ -840,14 +988,29 @@ function reinit_and_update_tiled_outer_reduct!( post, z, Symbol(mvar, '_', usorig.u₁), - reduct, + reduct ) else # it's u₂unrolled for u ∈ 0:Umax-1 - reinit_push_preblockpost!(letblock, pre, block, post, z, Symbol(mvar, u), reduct) + reinit_push_preblockpost!( + letblock, + pre, + block, + post, + z, + Symbol(mvar, u), + reduct + ) end end - initialize_outer_reductions!(letblock, ls, ls.operations[or], ureduct(ls), usorig, rs) + initialize_outer_reductions!( + letblock, + ls, + ls.operations[or], + ureduct(ls), + usorig, + rs + ) end pre, post end @@ -918,8 +1081,12 @@ function init_remblock(unrolledloop::Loop, lssm::LoopStartStopManager, n::Int)#u if iszero(termind) rangesym = unrolledloop.rangesym if rangesym === Symbol("") - condition = - Expr(:call, lv(:cmpend), unrolledloop.itersymbol, staticloopexpr(unrolledloop)) + condition = Expr( + :call, + lv(:cmpend), + unrolledloop.itersymbol, + staticloopexpr(unrolledloop) + ) else condition = Expr(:call, lv(:cmpend), unrolledloop.itersymbol, rangesym) end @@ -927,7 +1094,13 @@ function init_remblock(unrolledloop::Loop, lssm::LoopStartStopManager, n::Int)#u termar = lssm.incrementedptrs[n][termind] ptr = vptr(termar) ptroff = vptr_offset(ptr) - condition = Expr(:call, GlobalRef(VectorizationBase, :vlt), ptroff, maxsym(ptr, 0), ptr) + condition = Expr( + :call, + GlobalRef(VectorizationBase, :vlt), + ptroff, + maxsym(ptr, 0), + ptr + ) end Expr(:if, condition) end @@ -967,7 +1140,12 @@ function definemask(loop::Loop) maskexpr(addexpr(lenexpr, 1)) end end -function define_eltype_vec_width!(q::Expr, ls::LoopSet, vectorized, ortypdefined::Bool) +function define_eltype_vec_width!( + q::Expr, + ls::LoopSet, + vectorized, + ortypdefined::Bool +) push!(q.args, Expr(:(=), ELTYPESYMBOL, determine_eltype(ls, ortypdefined))) push!(q.args, Expr(:(=), VECTORWIDTHSYMBOL, determine_width(ls, vectorized))) nothing @@ -998,15 +1176,13 @@ function setup_preamble!(ls::LoopSet, us::UnrollSpecification, Ureduct::Int) getloop(ls, us.vloopnum), u₁, u₂, - -1, + -1 ) lower_compute!(ls.preamble, op, ls, ua, false) end end end -function lsexpr(ls::LoopSet, q) - Expr(:block, ls.preamble, q) -end +lsexpr(ls::LoopSet, q) = Expr(:block, ls.preamble, q) function isanouterreduction(ls::LoopSet, op::Operation) opname = name(op) @@ -1058,8 +1234,8 @@ function calc_Ureduct!(ls::LoopSet, us::UnrollSpecification) elseif !((u₁ui == Int(u₁u)) & (u₂ui == Int(u₁u))) throw( ArgumentError( - "Doesn't currenly handle differently unrolled reductions yet, please file an issue with an example.", - ), + "Doesn't currenly handle differently unrolled reductions yet, please file an issue with an example." + ) ) end end @@ -1079,16 +1255,27 @@ function lower_unrollspec(ls::LoopSet) Ureduct = calc_Ureduct!(ls, us) setup_preamble!(ls, us, Ureduct) initgesps = add_loop_start_stop_manager!(ls) - q = Expr(:let, initgesps, lower_unrolled_dynamic(ls, us, num_loops(ls), false)) + q = + Expr(:let, initgesps, lower_unrolled_dynamic(ls, us, num_loops(ls), false)) q = gc_preserve(ls, Expr(:block, q)) reduce_expr!(q, ls, Ureduct) lsexpr(ls, q) end -function lower(ls::LoopSet, order, u₁loop, u₂loop, vectorized, u₁, u₂, inline::Bool) +function lower( + ls::LoopSet, + order, + u₁loop, + u₂loop, + vectorized, + u₁, + u₂, + inline::Bool +) cacheunrolled!(ls, u₁loop, u₂loop, vectorized) fillorder!(ls, order, u₁loop, u₂loop, u₂, vectorized) - ls.unrollspecification = UnrollSpecification(ls, u₁loop, u₂loop, vectorized, u₁, u₂) + ls.unrollspecification = + UnrollSpecification(ls, u₁loop, u₂loop, vectorized, u₁, u₂) q = lower_unrollspec(ls) inline && pushfirst!(q.args, Expr(:meta, :inline)) q @@ -1096,8 +1283,18 @@ end function lower(ls::LoopSet, inline::Int = -1) fill_offset_memop_collection!(ls) - order, u₁loop, u₂loop, vectorized, u₁, u₂, c, shouldinline = choose_order_cost(ls) - lower(ls, order, u₁loop, u₂loop, vectorized, u₁, u₂, inlinedecision(inline, shouldinline)) + order, u₁loop, u₂loop, vectorized, u₁, u₂, c, shouldinline = + choose_order_cost(ls) + lower( + ls, + order, + u₁loop, + u₂loop, + vectorized, + u₁, + u₂, + inlinedecision(inline, shouldinline) + ) end function lower(ls::LoopSet, u₁::Int, u₂::Int, v::Int, inline::Int) fill_offset_memop_collection!(ls) @@ -1108,13 +1305,15 @@ function lower(ls::LoopSet, u₁::Int, u₂::Int, v::Int, inline::Int) copyto!(ls.loop_order.bestorder, order) elseif u₁ > 0 u₂ = -1 - order, vectorized, c = choose_unroll_order(ls, Inf, store_load_deps(operations(ls)), v) + order, vectorized, c = + choose_unroll_order(ls, Inf, store_load_deps(operations(ls)), v) u₁loop = first(order) u₂loop = Symbol("##undefined##") shouldinline = true copyto!(ls.loop_order.bestorder, order) else - order, u₁loop, u₂loop, vectorized, u₁, u₂, c, shouldinline = choose_order_cost(ls, v) + order, u₁loop, u₂loop, vectorized, u₁, u₂, c, shouldinline = + choose_order_cost(ls, v) end doinline = inlinedecision(inline, shouldinline) lower(ls, order, u₁loop, u₂loop, vectorized, u₁, u₂, doinline) @@ -1143,16 +1342,14 @@ isunrolled_sym(op, u₁loop, u₂loop) It returns `true`/`false` for each loop, indicating whether they're unrolled. If there is a third argument, it will avoid unrolling that symbol along reductions if said symbol is part of the reduction chain. - """ function isunrolled_sym( op::Operation, u₁loop::Symbol, u₂loop::Symbol, vloop::Symbol, - (u₁ild, u₂ild)::Tuple{Bool,Bool} = (isu₁unrolled(op), isu₂unrolled(op)), + (u₁ild, u₂ild)::Tuple{Bool,Bool} = (isu₁unrolled(op), isu₂unrolled(op)) ) - (accesses_memory(op) | isloopvalue(op)) && return (u₁ild, u₂ild) if isconstant(op) if length(loopdependencies(op)) == 0 @@ -1193,7 +1390,7 @@ function isunrolled_sym( u₁loop::Symbol, u₂loop::Symbol, vloop::Symbol, - ls::LoopSet, + ls::LoopSet ) us = ls.unrollspecification isunrolled_sym(op, u₁loop, u₂loop, vloop, us) @@ -1203,12 +1400,13 @@ function isunrolled_sym( u₁loop::Symbol, u₂loop::Symbol, vloop::Symbol, - us::UnrollSpecification, + us::UnrollSpecification ) @unpack u₁, u₂ = us u₁u = (u₁ > 1) & isu₁unrolled(op) u₂u = (u₂ > 1) & isu₂unrolled(op) - ((u₂ > 1) | accesses_memory(op)) ? isunrolled_sym(op, u₁loop, u₂loop, vloop, (u₁u, u₂u)) : + ((u₂ > 1) | accesses_memory(op)) ? + isunrolled_sym(op, u₁loop, u₂loop, vloop, (u₁u, u₂u)) : (isunrolled_sym(op, u₁loop, u₁u), false) end @@ -1218,7 +1416,11 @@ function isunrolled_sym(op::Operation, u₁loop::Symbol, us::UnrollSpecification u₁u = (us.u₁ > 1) & isu₁unrolled(op) isunrolled_sym(op, u₁loop, u₁u) end -function isunrolled_sym(op::Operation, u₁loop::Symbol, u₁u::Bool = isu₁unrolled(op)) +function isunrolled_sym( + op::Operation, + u₁loop::Symbol, + u₁u::Bool = isu₁unrolled(op) +) u₁u || (isconstant(op) & (u₁loop ∈ reducedchildren(op))) end @@ -1227,14 +1429,13 @@ function isunrolled_sym( u₁loop::Symbol, u₂loop::Symbol, vloop::Symbol, - u₂max::Int, + u₂max::Int ) - ((u₂max > 1) | accesses_memory(op)) ? isunrolled_sym(op, u₁loop, u₂loop, vloop) : + ((u₂max > 1) | accesses_memory(op)) ? + isunrolled_sym(op, u₁loop, u₂loop, vloop) : (isunrolled_sym(op, u₁loop), false) end - - function variable_name(op::Operation, suffix::Int) mvar = mangledvar(op) suffix == -1 ? mvar : Symbol(mvar, suffix, :_) @@ -1246,7 +1447,7 @@ function variable_name_and_unrolled( u₂loop::Symbol, vloop::Symbol, u₂iter::Int, - ls::LoopSet, + ls::LoopSet ) u₁op, u₂op = isunrolled_sym(op, u₁loop, u₂loop, vloop, ls) mvar = u₂op ? variable_name(op, u₂iter) : mangledvar(op) diff --git a/src/codegen/operation_evaluation_order.jl b/src/codegen/operation_evaluation_order.jl index c0af709fb..02ab7b905 100644 --- a/src/codegen/operation_evaluation_order.jl +++ b/src/codegen/operation_evaluation_order.jl @@ -29,7 +29,7 @@ function isnopidentity( u₁loop::Symbol, u₂loop::Symbol, vectorized::Symbol, - u₂max::Int, + u₂max::Int ) parents_op = parents(op) if iscompute(op) && instruction(op).instr === :identity @@ -42,7 +42,8 @@ function isnopidentity( Base.iterate(parents_op, state) === nothing || return false name(opp) === name(op) || return false # @show op opp isu₁unrolled(op), isu₁unrolled(opp), isu₂unrolled(op), isu₂unrolled(opp) - (isu₁unrolled(op) == isu₁unrolled(opp)) & (isu₂unrolled(op) == isu₂unrolled(opp)) + (isu₁unrolled(op) == isu₁unrolled(opp)) & + (isu₂unrolled(op) == isu₂unrolled(opp)) else false end @@ -53,7 +54,7 @@ function set_upstream_family!( op::Operation, val::T, ld::Vector{Symbol}, - id::Int, + id::Int ) where {T} adal[identifier(op)] == val && return # must already have been set if ld != loopdependencies(op) || id == identifier(op) @@ -68,7 +69,7 @@ function search_for_reductinit!( op::Operation, opswap::Operation, var::Symbol, - loopdeps::Vector{Symbol}, + loopdeps::Vector{Symbol} ) for (i, opp) ∈ enumerate(parents(op)) if (name(opp) === var) && @@ -95,7 +96,7 @@ function addoptoorder!( u₁loop::Symbol, u₂loop::Symbol, vectorized::Symbol, - u₂max::Int, + u₂max::Int ) lo = ls.loop_order id = identifier(op) @@ -112,7 +113,7 @@ function addoptoorder!( u₁loop, u₂loop, vectorized, - u₂max, + u₂max ) end included_vars[id] || return nothing @@ -133,14 +134,20 @@ function addoptoorder!( # @show op, after_loop # isloopvalue(op) || push!(lo[isunrolled,istiled,after_loop,_n], op) # all(opp -> iszero(length(reduceddependencies(opp))), parents(op)) && - set_upstream_family!(place_after_loop, op, false, loopdependencies(op), identifier(op)) # parents that have already been included are not moved, so no need to check included_vars to filter + set_upstream_family!( + place_after_loop, + op, + false, + loopdependencies(op), + identifier(op) + ) # parents that have already been included are not moved, so no need to check included_vars to filter nothing end function replace_reduct_init!( ls::LoopSet, op::Operation, opsub::Operation, - opcheck::Operation, + opcheck::Operation ) deleteat!(parents(op), 2) op.variable = opcheck.variable @@ -154,7 +161,7 @@ function nounrollreduction( op::Operation, u₁loop::Symbol, u₂loop::Symbol, - vectorized::Symbol, + vectorized::Symbol ) reduceddeps = reduceddependencies(op) (vectorized ∉ reduceddeps) && (u₁loop ∉ reduceddeps) && (u₂loop ∉ reduceddeps) @@ -163,7 +170,7 @@ function load_short_static_reduction_first!( ls::LoopSet, u₁loop::Symbol, u₂loop::Symbol, - vectorized::Symbol, + vectorized::Symbol ) for op ∈ operations(ls) iscompute(op) || continue @@ -188,8 +195,12 @@ function load_short_static_reduction_first!( opsub = parents(op)[2] length(children(opsub)) == 1 || continue opsearch = parents(op)[1] - opcheck = - search_for_reductinit!(opsearch, opsub, name(opsearch), loopdependencies(op)) + opcheck = search_for_reductinit!( + opsearch, + opsub, + name(opsearch), + loopdependencies(op) + ) opcheck === opsearch || replace_reduct_init!(ls, op, opsub, opcheck) end elseif (instruction(op).instr === :add_fast) && @@ -199,11 +210,17 @@ function load_short_static_reduction_first!( (length(vecloop) ≤ 16) && nounrollreduction(op, u₁loop, u₂loop, vectorized) opsub = parents(op)[2] - ((length(reduceddependencies(opsub)) == 0) & (length(children(opsub)) == 1)) || - continue + ( + (length(reduceddependencies(opsub)) == 0) & + (length(children(opsub)) == 1) + ) || continue opsearch = parents(op)[1] - opcheck = - search_for_reductinit!(opsearch, opsub, name(opsearch), loopdependencies(op)) + opcheck = search_for_reductinit!( + opsearch, + opsub, + name(opsearch), + loopdependencies(op) + ) opcheck === opsearch || replace_reduct_init!(ls, op, opsub, opcheck) end end @@ -216,7 +233,7 @@ function fillorder!( u₁loop::Symbol, u₂loop::Symbol, u₂max::Int, - vectorized::Symbol, + vectorized::Symbol ) load_short_static_reduction_first!(ls, u₁loop, u₂loop, vectorized) lo = ls.loop_order @@ -248,7 +265,7 @@ function fillorder!( u₁loop, u₂loop, vectorized, - u₂max, + u₂max ) end end diff --git a/src/codegen/split_loops.jl b/src/codegen/split_loops.jl index 3440704a1..af1eafba8 100644 --- a/src/codegen/split_loops.jl +++ b/src/codegen/split_loops.jl @@ -1,12 +1,11 @@ - function add_operation!( ls_new::LoopSet, included::Vector{Int}, ls::LoopSet, op::Operation, ids::Vector{Int}, - issecond::Bool, + issecond::Bool ) newid = included[identifier(op)] iszero(newid) || return operations(ls_new)[newid] @@ -32,7 +31,7 @@ function add_operation!( reduceddependencies(opc), parentsnew, opc.ref, - reducedchildren(opc), + reducedchildren(opc) ) addsetv!(ls_new.includedactualarrays, vptr(opc.ref)) push!(operations(ls_new), opnew) @@ -40,7 +39,14 @@ function add_operation!( for i ∈ 2:length(parentsopc) push!( parentsnew, - add_operation!(ls_new, included, ls, parentsopc[i], ids, issecond), + add_operation!( + ls_new, + included, + ls, + parentsopc[i], + ids, + issecond + ) ) end included[identifier(opp)] = identifier(opnew) @@ -62,7 +68,7 @@ function add_operation!( reduceddependencies(op), vparents, op.ref, - reducedchildren(op), + reducedchildren(op) ) accesses_memory(op) && addsetv!(ls_new.includedactualarrays, vptr(op.ref)) push!(operations(ls_new), opnew) @@ -70,12 +76,10 @@ function add_operation!( opnew end -function append_if_included!(vnew, vold, included) - for (i, v) ∈ vold +append_if_included!(vnew, vold, included) = for (i, v) ∈ vold id = included[i] iszero(id) || push!(vnew, (id, v)) end -end function split_loopset(ls::LoopSet, ids::Vector{Int}, issecond::Bool) ls_new = LoopSet(:LoopVectorization) @@ -95,7 +99,11 @@ function split_loopset(ls::LoopSet, ids::Vector{Int}, issecond::Bool) append_if_included!(ls_new.preamble_symint, ls.preamble_symint, included) append_if_included!(ls_new.preamble_symfloat, ls.preamble_symfloat, included) append_if_included!(ls_new.preamble_zeros, ls.preamble_zeros, included) - append_if_included!(ls_new.preamble_funcofeltypes, ls.preamble_funcofeltypes, included) + append_if_included!( + ls_new.preamble_funcofeltypes, + ls.preamble_funcofeltypes, + included + ) for i ∈ ls.outer_reductions id = included[i] iszero(id) || push!(ls_new.outer_reductions, id) @@ -147,29 +155,61 @@ function lower_and_split_loops(ls::LoopSet, inline::Int) for (ind, i) ∈ enumerate(split_candidates) split_1[1] = i ls_1 = split_loopset(ls, split_1, false) - order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, cost_1, shouldinline_1 = - choose_order_cost(ls_1) + order_1, + unrolled_1, + tiled_1, + vectorized_1, + U_1, + T_1, + cost_1, + shouldinline_1 = choose_order_cost(ls_1) remaining_ops[1:ind-1] .= @view(split_candidates[1:ind-1]) remaining_ops[ind:end] .= @view(split_candidates[ind+1:end]) ls_2 = split_loopset(ls, remaining_ops, true) - order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2, shouldinline_2 = - choose_order_cost(ls_2) + order_2, + unrolled_2, + tiled_2, + vectorized_2, + U_2, + T_2, + cost_2, + shouldinline_2 = choose_order_cost(ls_2) # U_1 = T_1 = U_2 = T_2 = 2 - if cost_1 + cost_2 + looplenpen * (looplengthprod(ls_1) + looplengthprod(ls_2)) ≤ + if cost_1 + + cost_2 + + looplenpen * (looplengthprod(ls_1) + looplengthprod(ls_2)) ≤ muladd(0.9, cost_fused, ls_looplen) ls_2_lowered = if length(remaining_ops) > 1 inline = iszero(inline) ? (shouldinline_1 % Int) : inline lower_and_split_loops(ls_2, inline) else doinline = inlinedecision(inline, shouldinline_1 | shouldinline_2) - lower(ls_2, order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, doinline) + lower( + ls_2, + order_2, + unrolled_2, + tiled_2, + vectorized_2, + U_2, + T_2, + doinline + ) end return Expr( :block, ls.preamble, - lower(ls_1, order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, false), + lower( + ls_1, + order_1, + unrolled_1, + tiled_1, + vectorized_1, + U_1, + T_1, + false + ), ls_2_lowered, - nothing, + nothing ) end length(split_candidates) == 2 && break @@ -183,6 +223,6 @@ function lower_and_split_loops(ls::LoopSet, inline::Int) vectorized_fused, U_fused, T_fused, - doinline, + doinline ) end diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl index ed25d0453..b984f2c92 100644 --- a/src/condense_loopset.jl +++ b/src/condense_loopset.jl @@ -1,4 +1,5 @@ -@enum IndexType::UInt8 NotAnIndex = 0 LoopIndex = 1 ComputedIndex = 2 SymbolicIndex = 3 +@enum IndexType::UInt8 NotAnIndex = 0 LoopIndex = 1 ComputedIndex = 2 SymbolicIndex = + 3 Base.:|(u::Unsigned, it::IndexType) = u | UInt8(it) Base.:(==)(u::Unsigned, it::IndexType) = (u % UInt8) == UInt8(it) @@ -44,7 +45,10 @@ function rebuild_fields(offset::Int, ::Type{T}) where {T} elseif fieldcount(TF) ≡ 0 push!(call.args, Expr(:call, getfield, :t, (offset += 1))) elseif TF <: DataType - push!(call.args, Expr(:call, lv(:gettype), Expr(:call, getfield, :t, (offset += 1)))) + push!( + call.args, + Expr(:call, lv(:gettype), Expr(:call, getfield, :t, (offset += 1))) + ) else arg, offset = rebuild_fields(offset, TF) push!(call.args, arg) @@ -78,7 +82,8 @@ struct ArrayRefStruct{array,ptr} offsets::UInt128 strides::UInt128 end -array_and_ptr(@nospecialize(ar::ArrayRefStruct{a,p})) where {a,p} = (a::Symbol, p::Symbol) +array_and_ptr(@nospecialize(ar::ArrayRefStruct{a,p})) where {a,p} = + (a::Symbol, p::Symbol) # array(@nospecialize(ar::ArrayRefStruct{a,p})) where {a,p} = a::Symbol # ptr(@nospecialize(ar::ArrayRefStruct{a,p})) where {a,p} = p::Symbol @@ -92,7 +97,7 @@ function ArrayRefStruct( ls::LoopSet, mref::ArrayReferenceMeta, arraysymbolinds::Vector{Symbol}, - ids::Vector{Int}, + ids::Vector{Int} ) index_types = zero(UInt128) indices = zero(UInt128) @@ -128,7 +133,12 @@ function ArrayRefStruct( # end end end - ArrayRefStruct{mref.ref.array,mref.ptr}(index_types, indices, offsets, strides) + ArrayRefStruct{mref.ref.array,mref.ptr}( + index_types, + indices, + offsets, + strides + ) end """ @@ -159,7 +169,8 @@ function findmatchingarray(ls::LoopSet, mref::ArrayReferenceMeta) end 0x0000 end -filled_8byte_chunks(u::T) where {T<:Unsigned} = sizeof(T) - (leading_zeros(u) >>> 3) +filled_8byte_chunks(u::T) where {T<:Unsigned} = + sizeof(T) - (leading_zeros(u) >>> 3) function shifted_loopset(ls::LoopSet, loopsyms::Vector{Symbol}) ld = zero(UInt128) # leading_zeros(ld) >> 2 yields the number of loopdeps @@ -169,9 +180,12 @@ function shifted_loopset(ls::LoopSet, loopsyms::Vector{Symbol}) end ld end -loopdeps_uint(ls::LoopSet, op::Operation) = shifted_loopset(ls, loopdependencies(op)) -reduceddeps_uint(ls::LoopSet, op::Operation) = shifted_loopset(ls, reduceddependencies(op)) -childdeps_uint(ls::LoopSet, op::Operation) = shifted_loopset(ls, reducedchildren(op)) +loopdeps_uint(ls::LoopSet, op::Operation) = + shifted_loopset(ls, loopdependencies(op)) +reduceddeps_uint(ls::LoopSet, op::Operation) = + shifted_loopset(ls, reduceddependencies(op)) +childdeps_uint(ls::LoopSet, op::Operation) = + shifted_loopset(ls, reducedchildren(op)) function parents_uint(oppv::AbstractVector{Operation}) p = zero(UInt128) for parent ∈ oppv @@ -218,7 +232,7 @@ function OperationStruct!( varnames::Vector{Symbol}, ids::Vector{Int}, ls::LoopSet, - op::Operation, + op::Operation ) ld = loopdeps_uint(ls, op) rd = reduceddeps_uint(ls, op) @@ -239,8 +253,9 @@ end Zero():static_step(r):(maybestaticlast(r)-maybestaticfirst(r)) @inline zerorangestart(r::CartesianIndices) = CartesianIndices(map(zerorangestart, r.indices)) -@inline zerorangestart(r::ArrayInterface.OptionallyStaticUnitRange{StaticInt{1}}) = - CloseOpen(maybestaticlast(r)) +@inline zerorangestart( + r::ArrayInterface.OptionallyStaticUnitRange{StaticInt{1}} +) = CloseOpen(maybestaticlast(r)) function loop_boundary!(q::Expr, loop::Loop, shouldindbyind::Bool) if isstaticloop(loop) || loop.rangesym === Symbol("") @@ -289,7 +304,7 @@ function argmeta_and_consts_description(ls::LoopSet, arraysymbolinds) tuple_expr(ls.preamble_symint), tuple_expr(ls.preamble_symfloat), tuple_expr(ls.preamble_zeros), - tuple_expr(ls.preamble_funcofeltypes), + tuple_expr(ls.preamble_funcofeltypes) ) end @inline vdata(v::Vec) = getfield(v, :data) @@ -315,7 +330,10 @@ function loopset_return_value(ls::LoopSet, ::Val{extract}) where {extract} for or ∈ ls.outer_reductions op = ops[or] if extract - push!(ret.args, Expr(:call, :vdata, Symbol(mangledvar(op), "##onevec##"))) + push!( + ret.args, + Expr(:call, :vdata, Symbol(mangledvar(op), "##onevec##")) + ) else push!(ret.args, Symbol(mangledvar(ops[or]), "##onevec##")) end @@ -359,40 +377,46 @@ end val(x) = Expr(:call, Expr(:curly, :Val, x)) @inline gespf1(x, i) = gesp(x, i) -@inline gespf1(x::StridedPointer{T,1}, i::Tuple{I}) where {T,I<:Union{Integer,StaticInt}} = - gesp(x, i) +@inline gespf1( + x::StridedPointer{T,1}, + i::Tuple{I} +) where {T,I<:Union{Integer,StaticInt}} = gesp(x, i) @inline gespf1( x::StridedBitPointer{T,1}, - i::Tuple{I}, + i::Tuple{I} ) where {T,I<:Union{Integer,StaticInt}} = gesp(x, i) @inline gespf1(x::StridedPointer{T,1}, i::Tuple{Zero}) where {T} = x @inline gespf1(x::StridedBitPointer{T,1}, i::Tuple{Zero}) where {T} = x @generated function gespf1( x::AbstractStridedPointer{T,N,C,B,R}, - i::Tuple{I}, + i::Tuple{I} ) where {T,N,I<:Union{Integer,StaticInt},C,B,R} ri = argmin(R) quote $(Expr(:meta, :inline)) - p, li = VectorizationBase.tdot(x, (vsub_nsw(getfield(i, 1), one($I)),), strides(x)) + p, li = VectorizationBase.tdot( + x, + (vsub_nsw(getfield(i, 1), one($I)),), + strides(x) + ) ptr = gep(p, li) si = ArrayInterface.StrideIndex{1,$(R[ri],),$(C === 1 ? 1 : 0)}( (getfield(strides(x), $ri),), - (Zero(),), + (Zero(),) ) stridedpointer(ptr, si, StaticInt{$(B === 1 ? 1 : 0)}()) end end @generated function gespf1( x::AbstractStridedPointer{T,N,C,B,R}, - ::Tuple{VectorizationBase.NullStep}, + ::Tuple{VectorizationBase.NullStep} ) where {T,N,C,B,R} ri = argmin(R) quote $(Expr(:meta, :inline)) si = ArrayInterface.StrideIndex{1,$(R[ri],),$(C === 1 ? 1 : 0)}( (getfield(strides(x), $ri),), - (getfield(offsets(x), $ri),), + (getfield(offsets(x), $ri),) ) stridedpointer(pointer(x), si, StaticInt{$(B == 1 ? 1 : 0)}()) end @@ -407,7 +431,7 @@ function should_zerorangestart( ls::LoopSet, allarrayrefs::Vector{ArrayReferenceMeta}, name_to_array_map::Vector{Vector{Int}}, - isrooted::Vector{Bool}, + isrooted::Vector{Bool} ) loops = ls.loops shouldindbyind = fill(false, length(loops)) @@ -423,7 +447,8 @@ function should_zerorangestart( baseref = allarrayrefs[first(namev)] # firstcontainsind relies on stripping of duplicate inds in parsing firstcontainsind = findfirstcontaining(baseref, ind) - basestride = firstcontainsind == 0 ? 0 : getstrides(baseref)[firstcontainsind] + basestride = + firstcontainsind == 0 ? 0 : getstrides(baseref)[firstcontainsind] allsame = true # The idea here is that if any ref to the same array doesn't have `ind`, # we can't offset that dimension because different inds will clash. @@ -431,8 +456,10 @@ function should_zerorangestart( # to be consistent, and check that all arrays are valid first. for j ∈ @view(namev[2:end]) ref = allarrayrefs[j] - if (firstcontainsind ≠ findfirstcontaining(ref, ind)) || - ((firstcontainsind ≠ 0) && (basestride ≠ getstrides(ref)[firstcontainsind])) + if (firstcontainsind ≠ findfirstcontaining(ref, ind)) || ( + (firstcontainsind ≠ 0) && + (basestride ≠ getstrides(ref)[firstcontainsind]) + ) allsame = false break end @@ -445,17 +472,22 @@ function should_zerorangestart( end return shouldindbyind end -function check_shouldindbyind(ls::LoopSet, ind::Symbol, shouldindbyind::Vector{Bool}) +function check_shouldindbyind( + ls::LoopSet, + ind::Symbol, + shouldindbyind::Vector{Bool} +) for (i, loop) ∈ enumerate(ls.loops) loop.itersymbol === ind && return shouldindbyind[i] end true end - @inline densewrapper(sp, A) = sp -@inline densewrapper(sp::AbstractStridedPointer{T,N}, A::AbstractArray{T,N}) where {T,N} = - _densewrapper(sp, VectorizationBase.val_dense_dims(A)) +@inline densewrapper( + sp::AbstractStridedPointer{T,N}, + A::AbstractArray{T,N} +) where {T,N} = _densewrapper(sp, VectorizationBase.val_dense_dims(A)) @inline _densewrapper(sp, ::Nothing) = sp @inline _densewrapper(sp::AbstractStridedPointer, ::Val{D}) where {D} = VectorizationBase.DensePointerWrapper{D}(sp) @@ -501,7 +533,7 @@ function add_grouped_strided_pointer!(extra_args::Expr, ls::LoopSet) allarrayrefs, k, name_to_array_map, - unique_to_name_and_op_map, + unique_to_name_and_op_map ) push!(gespsummaries, (k, gespindsummary)) found = true @@ -512,7 +544,8 @@ function add_grouped_strided_pointer!(extra_args::Expr, ls::LoopSet) push!(preserve, presbufsym(ref.ref.array)) end roots = getroots(ls) - shouldindbyind = should_zerorangestart(ls, allarrayrefs, name_to_array_map, roots) + shouldindbyind = + should_zerorangestart(ls, allarrayrefs, name_to_array_map, roots) for (k, gespindsummary) ∈ gespsummaries ref = allarrayrefs[k] gespinds = calcgespinds( @@ -521,7 +554,7 @@ function add_grouped_strided_pointer!(extra_args::Expr, ls::LoopSet) gespindsummary, shouldindbyind, name_to_array_map[first(first(unique_to_name_and_op_map[k]))], - unique_to_name_and_op_map, + unique_to_name_and_op_map ) push!( tgarrays.args, @@ -529,8 +562,8 @@ function add_grouped_strided_pointer!(extra_args::Expr, ls::LoopSet) :call, lv(:densewrapper), Expr(:call, lv(:gespf1), vptr(ref), gespinds), - name(ref), - ), + name(ref) + ) ) end push!(gsp.args, tgarrays) @@ -557,11 +590,25 @@ end ::StaticInt{RS}, ::StaticInt{AR}, ::StaticInt{NT}, - ::StaticInt{CLS}, + ::StaticInt{CLS} ) where {CNFARG,W,RS,AR,CLS,NT} inline, u₁, u₂, v, BROADCAST, thread, warncheckarg, safe = CNFARG nt = min(thread % UInt, NT % UInt) - t = Expr(:tuple, inline, u₁, u₂, v, BROADCAST, W, RS, AR, CLS, nt, warncheckarg, safe) + t = Expr( + :tuple, + inline, + u₁, + u₂, + v, + BROADCAST, + W, + RS, + AR, + CLS, + nt, + warncheckarg, + safe + ) length(CNFARG) == 7 && push!(t.args, CNFARG[7]) Expr(:call, Expr(:curly, :Val, t)) end @@ -572,13 +619,15 @@ end register_size(), available_registers(), num_cores(), #FIXME - cache_linesize(), + cache_linesize() ) end function find_samename_constparent(op::Operation, opname::Symbol) for opp ∈ parents(op) - (((isconstant(opp) && instruction(opp) == LOOPCONSTANT) && (name(opp) === opname))) && - return opp + (( + (isconstant(opp) && instruction(opp) == LOOPCONSTANT) && + (name(opp) === opname) + )) && return opp opptemp = find_samename_constparent(opp, opname) opptemp === opp || return opptemp end @@ -595,8 +644,6 @@ function remove_outer_reducts!(roots::Vector{Bool}, ls::LoopSet) end end - - function split_ifelse!( ls::LoopSet, preserve::Vector{Symbol}, @@ -608,7 +655,7 @@ function split_ifelse!( thread::UInt, warncheckarg::Int, safe::Bool, - debug::Bool, + debug::Bool ) roots[k] = false op = operations(ls)[k] @@ -667,7 +714,7 @@ function split_ifelse!( thread, warncheckarg, safe, - debug, + debug )) else $(generate_call_split( @@ -680,7 +727,7 @@ function split_ifelse!( thread, warncheckarg, safe, - debug, + debug )) end ) @@ -694,7 +741,7 @@ function generate_call( thread::UInt, warncheckarg::Int, safe::Bool, - debug::Bool, + debug::Bool ) extra_args = Expr(:tuple) fill_children!(ls) @@ -709,7 +756,7 @@ function generate_call( thread, warncheckarg, safe, - debug, + debug ) end function generate_call_split( @@ -722,7 +769,7 @@ function generate_call_split( thread::UInt, warncheckarg::Int, safe::Bool, - debug::Bool, + debug::Bool ) for (k, op) ∈ enumerate(operations(ls)) parents_op = parents(op) @@ -740,7 +787,7 @@ function generate_call_split( thread, warncheckarg, safe, - debug, + debug ) end end @@ -754,7 +801,7 @@ function generate_call_split( thread, warncheckarg, safe, - debug, + debug ) end @@ -769,7 +816,7 @@ function generate_call_types( thread::UInt, warncheckarg::Int, safe::Bool, - debug::Bool, + debug::Bool ) # good place to check for split operation_descriptions = Expr(:tuple) @@ -794,7 +841,10 @@ function generate_call_types( for (j, ref) ∈ enumerate(ls.refs_aliasing_syms) # duplicate_ref[j] ≠ 0 && continue duplicate_ref[j] && continue - push!(arrayref_descriptions.args, ArrayRefStruct(ls, ref, arraysymbolinds, ids)) + push!( + arrayref_descriptions.args, + ArrayRefStruct(ls, ref, arraysymbolinds, ids) + ) end argmeta = argmeta_and_consts_description(ls, arraysymbolinds) loop_bounds = loop_boundaries(ls, shouldindbyind) @@ -818,7 +868,8 @@ function generate_call_types( end manyarg = !debug && (argcestimate > 16) func = - debug ? lv(:_turbo_loopset_debug) : (manyarg ? lv(:_turbo_manyarg!) : lv(:_turbo_!)) + debug ? lv(:_turbo_loopset_debug) : + (manyarg ? lv(:_turbo_manyarg!) : lv(:_turbo_!)) q = Expr( :call, func, @@ -826,7 +877,7 @@ function generate_call_types( val(operation_descriptions), val(arrayref_descriptions), val(argmeta), - val(loop_syms), + val(loop_syms) ) vecwidthdefq = if debug push!(q.args, Expr(:tuple, lbarg, extra_args)) @@ -835,7 +886,11 @@ function generate_call_types( vargsym = gensym(:vargsym) push!( q.args, - Expr(:call, GlobalRef(Base, :Val), Expr(:call, GlobalRef(Base, :typeof), vargsym)), + Expr( + :call, + GlobalRef(Base, :Val), + Expr(:call, GlobalRef(Base, :typeof), vargsym) + ) ) if manyarg push!(q.args, Expr(:call, lv(:flatten_to_tuple), vargsym)) @@ -869,7 +924,6 @@ end """ check_args(::Vararg{AbstractArray}) - LoopVectorization will optimize an `@turbo` loop if `check_args` on each on the indexed abstract arrays returns true. It returns true for `AbstractArray{T}`s when `check_type(T) == true` and the array or its parent is a `StridedArray` or `AbstractRange`. @@ -886,7 +940,8 @@ end # @info "`LoopVectorization.check_args(::$(typeof(x))) == false`, therefore compiling a probably slow `@inbounds @fastmath` fallback loop." maxlog=1 false end -@inline check_args(A, B, C::Vararg{Any,K}) where {K} = check_args(A) && check_args(B, C...) +@inline check_args(A, B, C::Vararg{Any,K}) where {K} = + check_args(A) && check_args(B, C...) @inline check_args(::AbstractRange{T}) where {T} = check_type(T) @inline check_args(::UpTri) = false @inline check_args(::LoTri) = false @@ -916,7 +971,7 @@ end struct RetVec2Int end (::RetVec2Int)(_) = Vec{2,Int} """ - can_turbo(f::Function, ::Val{NARGS}) +can_turbo(f::Function, ::Val{NARGS}) Check whether a given function with a specified number of arguments can be used inside a `@turbo` loop. @@ -962,10 +1017,18 @@ function check_turbo_safe(ls::LoopSet) q end -make_fast(q) = - Expr(:macrocall, Symbol("@fastmath"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), q) -make_crashy(q) = - Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), q) +make_fast(q) = Expr( + :macrocall, + Symbol("@fastmath"), + LineNumberNode(@__LINE__, Symbol(@__FILE__)), + q +) +make_crashy(q) = Expr( + :macrocall, + Symbol("@inbounds"), + LineNumberNode(@__LINE__, Symbol(@__FILE__)), + q +) @inline vecmemaybe(x::NativeTypes) = x @inline vecmemaybe(x::VectorizationBase._Vec) = Vec(x) @@ -998,24 +1061,39 @@ end # call, preserve = generate_call_split(ls, (inline,u₁,u₂), thread % UInt, false) # setup_call_ret!(ls, call, preserve) # end -setup_outerreduct_preserve_mangler(op::Operation) = Symbol(mangledvar(op), "##onevec##") +setup_outerreduct_preserve_mangler(op::Operation) = + Symbol(mangledvar(op), "##onevec##") -function outer_reduction_to_scalar_reduceq!(q::Expr, op::Operation, var = name(op)) +function outer_reduction_to_scalar_reduceq!( + q::Expr, + op::Operation, + var = name(op) +) instr = instruction(op) out = setup_outerreduct_preserve_mangler(op) if instr.instr ≢ :ifelse - Expr(:call, reduction_scalar_combine(op), Expr(:call, lv(:vecmemaybe), out), var) + Expr( + :call, + reduction_scalar_combine(op), + Expr(:call, lv(:vecmemaybe), out), + var + ) else opinstr = ifelse_reduction(:IfElseReduced, op) do opv opvname = name(opv) oporig = gensym(opvname) pushfirst!(q.args, Expr(:(=), oporig, opvname)) - Expr(:call, lv(:vecmemaybe), setup_outerreduct_preserve_mangler(opv)), (oporig,) + Expr(:call, lv(:vecmemaybe), setup_outerreduct_preserve_mangler(opv)), + (oporig,) end Expr(:call, opinstr, Expr(:call, lv(:vecmemaybe), out), var) end end -function setup_outerreduct_preserve(ls::LoopSet, call::Expr, preserve::Vector{Symbol}) +function setup_outerreduct_preserve( + ls::LoopSet, + call::Expr, + preserve::Vector{Symbol} +) iszero(length(ls.outer_reductions)) && return gc_preserve(call, preserve) retv = loopset_return_value(ls, Val(false)) q = Expr(:block, gc_preserve(Expr(:(=), retv, call), preserve)) @@ -1033,7 +1111,14 @@ function setup_call_final(ls::LoopSet, q::Expr) return ls.preamble end function setup_call_debug(ls::LoopSet) - generate_call(ls, (false, zero(Int8), zero(Int8), zero(Int8)), zero(UInt), 1, true, true) + generate_call( + ls, + (false, zero(Int8), zero(Int8), zero(Int8)), + zero(UInt), + 1, + true, + true + ) end function setup_call( ls::LoopSet, @@ -1046,7 +1131,7 @@ function setup_call( v::Int8, thread::Int, warncheckarg::Int, - safe::Bool, + safe::Bool ) # We outline/inline at the macro level by creating/not creating an anonymous function. # The old API instead was based on inlining or not inline the generated function, but diff --git a/src/constructors.jl b/src/constructors.jl index 80340e92f..121f8c290 100644 --- a/src/constructors.jl +++ b/src/constructors.jl @@ -23,7 +23,7 @@ function add_ci_call!( syms::Vector{Symbol}, i::Int, @nospecialize(valarg) = nothing, - @nospecialize(mod) = nothing, + @nospecialize(mod) = nothing ) call = if f isa Core.SSAValue Expr(:call, syms[f.id]) @@ -52,14 +52,15 @@ function substitute_broadcast( v::Int8, threads::Int, warncheckarg::Int, - safe::Bool, + safe::Bool ) ci = first(Meta.lower(LoopVectorization, q).args).code nargs = length(ci) - 1 lb = Expr(:block) syms = Vector{Symbol}(undef, nargs) configarg = (inline, u₁, u₂, v, true, threads, warncheckarg, safe) - unroll_param_tup = Expr(:call, lv(:avx_config_val), :(Val{$configarg}()), staticexpr(0)) + unroll_param_tup = + Expr(:call, lv(:avx_config_val), :(Val{$configarg}()), staticexpr(0)) for n ∈ 1:nargs _ciₙ = ci[n] if _ciₙ isa Symbol @@ -72,9 +73,25 @@ function substitute_broadcast( if ciₙ.head === :(=) push!(lb.args, Expr(:(=), f, syms[((ciₙargs[2])::Core.SSAValue).id])) elseif isglobalref(f, Base, :materialize!) - add_ci_call!(lb, lv(:vmaterialize!), ciₙargs, syms, n, unroll_param_tup, mod) + add_ci_call!( + lb, + lv(:vmaterialize!), + ciₙargs, + syms, + n, + unroll_param_tup, + mod + ) elseif isglobalref(f, Base, :materialize) - add_ci_call!(lb, lv(:vmaterialize), ciₙargs, syms, n, unroll_param_tup, mod) + add_ci_call!( + lb, + lv(:vmaterialize), + ciₙargs, + syms, + n, + unroll_param_tup, + mod + ) else add_ci_call!(lb, f, ciₙargs, syms, n) end @@ -87,7 +104,6 @@ function substitute_broadcast( esc(Expr(:let, lb, Expr(:block, ret))) end - function LoopSet(q::Expr, mod::Symbol = :Main) ls = LoopSet(mod) check_inputs!(q, ls.prepreamble) @@ -113,7 +129,7 @@ function check_macro_kwarg( v::Int8, threads::Int, warncheckarg::Int, - safe::Bool, + safe::Bool ) ((arg.head === :(=)) && (length(arg.args) == 2)) || throw(ArgumentError("macro kwarg should be of the form `argname = value`.")) @@ -128,7 +144,9 @@ function check_macro_kwarg( u₁ = convert(Int8, value.args[1])::Int8 u₂ = convert(Int8, value.args[2])::Int8 else - throw(ArgumentError("Don't know how to process argument in `unroll=$value`.")) + throw( + ArgumentError("Don't know how to process argument in `unroll=$value`.") + ) end elseif kw === :vectorize v = convert(Int8, value) @@ -140,7 +158,9 @@ function check_macro_kwarg( elseif value isa Integer threads = max(1, convert(Int, value)::Int) else - throw(ArgumentError("Don't know how to process argument in `thread=$value`.")) + throw( + ArgumentError("Don't know how to process argument in `thread=$value`.") + ) end elseif kw === :warn_check_args warncheckarg = convert(Int, value)::Int @@ -149,8 +169,8 @@ function check_macro_kwarg( else throw( ArgumentError( - "Received unrecognized keyword argument $kw. Recognized arguments include:\n`inline`, `unroll`, `check_empty`, and `thread`.", - ), + "Received unrecognized keyword argument $kw. Recognized arguments include:\n`inline`, `unroll`, `check_empty`, and `thread`." + ) ) end inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe @@ -164,11 +184,21 @@ function process_args( v::Int8 = zero(Int8), threads::Int = 1, warncheckarg::Int = 1, - safe::Bool = true, + safe::Bool = true ) for arg ∈ args inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe = - check_macro_kwarg(arg, inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe) + check_macro_kwarg( + arg, + inline, + check_empty, + u₁, + u₂, + v, + threads, + warncheckarg, + safe + ) end inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe end @@ -223,8 +253,10 @@ function replace_single_enumerate!(q, prepreamble, i = nothing) indsym = itersyms.args[1]::Symbol _replace_looprange!(q, i, indsym, iter) elseif itersyms isa Symbol # if itersyms are not unbox in loop range - throw(ArgumentError("`for $itersyms in enumerate($r)` is not supported, - please use `for ($(itersyms)_i, $(itersyms)_v) in enumerate($r)` instead.")) + throw( + ArgumentError("`for $itersyms in enumerate($r)` is not supported, + please use `for ($(itersyms)_i, $(itersyms)_v) in enumerate($r)` instead.") + ) else throw(ArgumentError("Don't know how to handle expression `$itersyms`.")) end @@ -240,12 +272,37 @@ function turbo_macro(mod, src, q, args...) q = macroexpand(mod, q) if q.head === :for ls = LoopSet(q, mod) - inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe = process_args(args) - esc(setup_call(ls, q, src, inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe)) + inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe = + process_args(args) + esc( + setup_call( + ls, + q, + src, + inline, + check_empty, + u₁, + u₂, + v, + threads, + warncheckarg, + safe + ) + ) else inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe = - process_args(args, inline = true) - substitute_broadcast(q, Symbol(mod), inline, u₁, u₂, v, threads, warncheckarg, safe) + process_args(args; inline = true) + substitute_broadcast( + q, + Symbol(mod), + inline, + u₁, + u₂, + v, + threads, + warncheckarg, + safe + ) end end """ @@ -267,10 +324,10 @@ The macro models the set of nested loops, and chooses an ordering of the three l Current limitations: -1. It assumes that loop iterations are independent. -2. It does not perform bounds checks. -3. It assumes that each loop iterates at least once. (Use `@turbo check_empty=true` to lift this assumption.) -4. That there is only one loop at each level of the nest. + 1. It assumes that loop iterations are independent. + 2. It does not perform bounds checks. + 3. It assumes that each loop iterates at least once. (Use `@turbo check_empty=true` to lift this assumption.) + 4. That there is only one loop at each level of the nest. It may also apply to broadcasts: @@ -295,7 +352,7 @@ Advanced users can customize the implementation of the `@turbo`-annotated block using keyword arguments: ```julia -@turbo inline=false unroll=2 thread=4 body +@turbo inline = false unroll = 2 thread = 4 body ``` where `body` is the code of the block (e.g., `for ... end`). @@ -355,13 +412,22 @@ Note that later arguments take precendence. Meant for convenience, as `@tturbo` is shorter than `@turbo thread=true`. """ macro tturbo(args...) - turbo_macro(__module__, __source__, last(args), :(thread = true), Base.front(args)...) + turbo_macro( + __module__, + __source__, + last(args), + :(thread = true), + Base.front(args)... + ) end function def_outer_reduct_types!(ls::LoopSet) for or ∈ ls.outer_reductions op = operations(ls)[or] - pushpreamble!(ls, Expr(:(=), outer_reduct_init_typename(op), typeof_expr(op))) + pushpreamble!( + ls, + Expr(:(=), outer_reduct_init_typename(op), typeof_expr(op)) + ) end end """ @@ -383,8 +449,17 @@ end macro _turbo(arg, q) @assert q.head === :for q = macroexpand(__module__, q) - inline, check_empty, u₁, u₂, v = - check_macro_kwarg(arg, false, false, zero(Int8), zero(Int8), zero(Int8), 1, 0, true) + inline, check_empty, u₁, u₂, v = check_macro_kwarg( + arg, + false, + false, + zero(Int8), + zero(Int8), + zero(Int8), + 1, + 0, + true + ) ls = LoopSet(q, __module__) set_hw!(ls) def_outer_reduct_types!(ls) diff --git a/src/getconstindexes.jl b/src/getconstindexes.jl index d23d545b3..596a107f5 100644 --- a/src/getconstindexes.jl +++ b/src/getconstindexes.jl @@ -37,7 +37,7 @@ const EXTRACTFUNS = ( :thirtysixth, :thirtyseventh, :thirtyeighth, - :last, + :last ) for (i, f) ∈ enumerate(EXTRACTFUNS) diff --git a/src/modeling/costs.jl b/src/modeling/costs.jl index 712c15bc4..8c2c2bbb5 100644 --- a/src/modeling/costs.jl +++ b/src/modeling/costs.jl @@ -1,7 +1,6 @@ lv(x) = GlobalRef(LoopVectorization, x) - """ Instruction @@ -14,13 +13,11 @@ struct Instruction end # lower(instr::Instruction) = Expr(:(.), instr.mod, QuoteNode(instr.instr)) # Base.convert(::Type{Expr}, instr::Instruction) = Expr(:(.), instr.mod, QuoteNode(instr.instr)) -function callexpr(instr::Instruction) - if instr.mod === :LoopVectorization +callexpr(instr::Instruction) = if instr.mod === :LoopVectorization Expr(:call, lv(instr.instr)) else#if instr.mod === :Main Expr(:call, instr.instr) end -end function callexpr(instr::Instruction, arg) ce = callexpr(instr) append!(ce.args, arg) @@ -93,8 +90,10 @@ end const OPAQUE_INSTRUCTION = InstructionCost(-1.0, 20, 20.0, 16) instruction_cost(instruction::Instruction) = - instruction.mod === :LoopVectorization ? COST[instruction.instr] : OPAQUE_INSTRUCTION -instruction_cost(instruction::Symbol) = get(COST, instruction, OPAQUE_INSTRUCTION) + instruction.mod === :LoopVectorization ? COST[instruction.instr] : + OPAQUE_INSTRUCTION +instruction_cost(instruction::Symbol) = + get(COST, instruction, OPAQUE_INSTRUCTION) scalar_cost(instr::Instruction) = scalar_cost(instruction_cost(instr)) vector_cost(instr::Instruction, Wshift, sizeof_T) = vector_cost(instruction_cost(instr), Wshift, sizeof_T) @@ -106,10 +105,8 @@ vector_cost(instr::Instruction, Wshift, sizeof_T) = # cost( instruction_cost(instruction), Wshift, sizeof_T ) # end - # Just a semi-reasonable assumption; should not be that sensitive to anything other than loads - # Comments on setindex! # 1. Not a part of dependency chains, so not really twice as expensive as getindex? # 2. getindex loads a register, not setindex!, but we place cost on setindex! @@ -263,7 +260,7 @@ const COST = Dict{Symbol,InstructionCost}( :vmovsldup => InstructionCost(1, 1.0), :vmovshdup => InstructionCost(1, 1.0), :exponent => InstructionCost(8, 1.0), - :significand => InstructionCost(8, 1.0), + :significand => InstructionCost(8, 1.0) ) for f in EXTRACTFUNS @@ -288,7 +285,8 @@ Base.convert(::Type{Instruction}, instr::Symbol) = Instruction(instr) function instruction(f::Symbol) # f === :ifelse && return Instruction(:LoopVectorization, :ifelse) # @assert f ∈ keys(COST) - f ∈ keys(COST) ? Instruction(:LoopVectorization, f) : Instruction(Symbol(""), f) + f ∈ keys(COST) ? Instruction(:LoopVectorization, f) : + Instruction(Symbol(""), f) end # instruction(f::Symbol, m::Symbol) = f ∈ keys(COST) ? Instruction(:LoopVectorization, f) : Instruction(m, f) Instruction(instr::Symbol) = instruction(instr) @@ -319,11 +317,15 @@ end @inline (ier::IfElseReducer)(a::VecUnroll) = VecUnroll(VectorizationBase.fmap(ier, VectorizationBase.data(a))) @inline (ier::IfElseReducer)(a::VecUnroll, b::VecUnroll) = VecUnroll( - VectorizationBase.fmap(ier, VectorizationBase.data(a), VectorizationBase.data(b)), + VectorizationBase.fmap( + ier, + VectorizationBase.data(a), + VectorizationBase.data(b) + ) ) - -@inline (ier::IfElseReduced)(x::NativeTypes, y::NativeTypes) = ifelse(ier.f(x, y), x, y) +@inline (ier::IfElseReduced)(x::NativeTypes, y::NativeTypes) = + ifelse(ier.f(x, y), x, y) @inline (ier::IfElseReduced)(x::AbstractSIMD{W}, y::AbstractSIMD{W}) where {W} = ifelse(ier.f(x, y), x, y) @inline function (ier::IfElseReduced)(x::AbstractSIMD, y::AbstractSIMD) @@ -338,17 +340,21 @@ end ifelse(f(r, y), r, y) end - @inline (ier::IfElseReduceTo)(a::NativeTypes, ::NativeTypes) = a @inline (ier::IfElseReduceTo)(a::AbstractSIMD, ::NativeTypes) = VectorizationBase.ifelse_reduce(ier.f, a) -@inline (ier::IfElseReduceTo)(a::AbstractSIMD{W}, ::AbstractSIMD{W}) where {W} = a +@inline (ier::IfElseReduceTo)(a::AbstractSIMD{W}, ::AbstractSIMD{W}) where {W} = + a @inline function (ier::IfElseReduceTo)(a::AbstractSIMD, b::AbstractSIMD) x, y = VectorizationBase.splitvector(a) # halve recursively ier(ifelse(ier.f(x, y), x, y), b) end @inline (ier::IfElseReduceTo)(a::VecUnroll, b::VecUnroll) = VecUnroll( - VectorizationBase.fmap(ier, VectorizationBase.data(a), VectorizationBase.data(b)), + VectorizationBase.fmap( + ier, + VectorizationBase.data(a), + VectorizationBase.data(b) + ) ) @inline (iec::IfElseCollapser)(a) = @@ -383,19 +389,21 @@ end @inline _first_ifelse_reduce_mirror(f::F, a, b) where {F} = getfield(VectorizationBase.ifelse_reduce_mirror(f, a, b), 1) -@inline (ier::IfElseReducerMirror)(a) = _first_ifelse_reduce_mirror(ier.f, a, ier.a) +@inline (ier::IfElseReducerMirror)(a) = + _first_ifelse_reduce_mirror(ier.f, a, ier.a) @inline function _ifelse_reduce_mirror(f::F, a, b, c, d) where {F} r, rm = VectorizationBase.ifelse_reduce_mirror(f, b, d) ifelse(f(c, rm), a, r) end -@inline (ier::IfElseReducerMirror)(a, b) = _ifelse_reduce_mirror(ier.f, a, b, ier.a, ier.b) +@inline (ier::IfElseReducerMirror)(a, b) = + _ifelse_reduce_mirror(ier.f, a, b, ier.a, ier.b) @inline (ier::IfElseReducerMirror)(a::VecUnroll) = VecUnroll( VectorizationBase.fmap( _first_ifelse_reduce_mirror, ier.f, VectorizationBase.data(a), - VectorizationBase.data(ier.a), - ), + VectorizationBase.data(ier.a) + ) ) @inline function (ier::IfElseReducerMirror)(a::VecUnroll, b::VecUnroll) VecUnroll( @@ -405,8 +413,8 @@ end VectorizationBase.data(a), VectorizationBase.data(b), VectorizationBase.data(ier.a), - VectorizationBase.data(ier.b), - ), + VectorizationBase.data(ier.b) + ) ) end @@ -414,8 +422,10 @@ end IfElseReducedMirror{F,A,Nothing}(f, a, nothing) @inline (ier::IfElseReducedMirror)(x::NativeTypes, y::NativeTypes) = ifelse(ier.f(ier.a, ier.b), x, y) -@inline (ier::IfElseReducedMirror)(x::AbstractSIMD{W}, y::AbstractSIMD{W}) where {W} = - ifelse(ier.f(ier.a, ier.b), x, y) +@inline (ier::IfElseReducedMirror)( + x::AbstractSIMD{W}, + y::AbstractSIMD{W} +) where {W} = ifelse(ier.f(ier.a, ier.b), x, y) @inline function _reduce_mirror(f::F, x, y, a, b) where {F} r, _ = IfElseReducedMirror(f, a, b)(x, y) ifelse(f(r, y), r, y) @@ -429,8 +439,8 @@ end getfield(x, :data), getfield(y, :data), getfield(ier.a, :data), - getfield(ier.b, :data), - ), + getfield(ier.b, :data) + ) ) @inline function (ier::IfElseReducedMirror)(x::AbstractSIMD, y::NativeTypes) f = ier.f @@ -439,11 +449,13 @@ end ifelse(f(rm, ier.b), r, y) end - @inline (ier::IfElseReduceToMirror)(a::NativeTypes, ::NativeTypes) = a @inline (ier::IfElseReduceToMirror)(a::AbstractSIMD, ::NativeTypes) = VectorizationBase.ifelse_reduce_mirror(ier.f, a, ier.a) -@inline (ier::IfElseReduceToMirror)(a::AbstractSIMD{W}, ::AbstractSIMD{W}) where {W} = a +@inline (ier::IfElseReduceToMirror)( + a::AbstractSIMD{W}, + ::AbstractSIMD{W} +) where {W} = a @inline function (ier::IfElseReduceToMirror)(a::AbstractSIMD, b::AbstractSIMD) x, y = VectorizationBase.splitvector(a) # halve recursively w, z = VectorizationBase.splitvector(ier.a) # halve recursively @@ -452,7 +464,11 @@ end IfElseReduceToMirror(f, ifelse(fwz, w, z))(ifelse(fwz, x, y), b) end @inline (ier::IfElseReduceToMirror)(a::VecUnroll, b::VecUnroll) = VecUnroll( - VectorizationBase.fmap(ier, VectorizationBase.data(a), VectorizationBase.data(b)), + VectorizationBase.fmap( + ier, + VectorizationBase.data(a), + VectorizationBase.data(b) + ) ) # @inline (iec::IfElseCollapserMirror)(a) = getfield(VectorizationBase.ifelse_collapse_mirror(iec.f, a, iec.a), 1, false) @@ -525,10 +541,11 @@ const REDUCTION_CLASS = Dict{Symbol,Float64}( :max_fast => MAX, :min_fast => MIN, :vfmaddsub => ADDITIVE_IN_REDUCTIONS, - :vfmsubadd => ADDITIVE_IN_REDUCTIONS, + :vfmsubadd => ADDITIVE_IN_REDUCTIONS ) reduction_instruction_class(instr::Symbol) = get(REDUCTION_CLASS, instr, NaN) -reduction_instruction_class(instr::Instruction) = reduction_instruction_class(instr.instr) +reduction_instruction_class(instr::Instruction) = + reduction_instruction_class(instr.instr) function reduction_to_single_vector(x::Float64) if x == ADDITIVE_IN_REDUCTIONS :collapse_add @@ -546,8 +563,7 @@ function reduction_to_single_vector(x::Float64) throw("Reduction not found.") end end -function reduce_to_onevecunroll(x::Float64) - if x == ADDITIVE_IN_REDUCTIONS +reduce_to_onevecunroll(x::Float64) = if x == ADDITIVE_IN_REDUCTIONS :+ elseif x == MULTIPLICATIVE_IN_REDUCTIONS :* @@ -562,9 +578,7 @@ function reduce_to_onevecunroll(x::Float64) else throw("Reduction not found.") end -end -function reduce_number_of_vectors(x::Float64) - if x == ADDITIVE_IN_REDUCTIONS +reduce_number_of_vectors(x::Float64) = if x == ADDITIVE_IN_REDUCTIONS :contract_add elseif x == MULTIPLICATIVE_IN_REDUCTIONS :contract_mul @@ -579,9 +593,7 @@ function reduce_number_of_vectors(x::Float64) else throw("Reduction not found.") end -end -function reduction_to_scalar(x::Float64) - if x == ADDITIVE_IN_REDUCTIONS +reduction_to_scalar(x::Float64) = if x == ADDITIVE_IN_REDUCTIONS :vsum elseif x == MULTIPLICATIVE_IN_REDUCTIONS :vprod @@ -596,7 +608,6 @@ function reduction_to_scalar(x::Float64) else throw("Reduction not found.") end -end function reduction_scalar_combine(x::Float64) # x == 1.0 ? :reduced_add : x == 2.0 ? :reduced_prod : x == 3.0 ? :reduced_any : x == 4.0 ? :reduced_all : x == 5.0 ? :reduced_max : x == 6.0 ? :reduced_min : throw("Reduction not found.") if x == ADDITIVE_IN_REDUCTIONS @@ -653,7 +664,6 @@ function reduction_zero_class(x::Symbol)::Float64 end reduction_zero(x) = reduction_zero(reduction_instruction_class(x)) - function isreductcombineinstr(instr::Symbol) instr ∈ ( :reduced_add, @@ -663,7 +673,7 @@ function isreductcombineinstr(instr::Symbol) :reduced_max, :reduced_min, :reduce_to_max, - :reduce_to_min, + :reduce_to_min ) end isreductcombineinstr(instr::Instruction) = isreductcombineinstr(instr.instr) @@ -765,7 +775,7 @@ const FUNCTIONSYMBOLS = IdDict{Type{<:Function},Instruction}( typeof(ifelse) => :ifelse, typeof(identity) => :identity, typeof(conj) => :identity,#conj, - typeof(÷) => :vdiv_fast, + typeof(÷) => :vdiv_fast # typeof(zero) => :zero, # typeof(one) => :one, # typeof(axes) => :axes, diff --git a/src/modeling/determinestrategy.jl b/src/modeling/determinestrategy.jl index ae4576234..5cc04d68c 100644 --- a/src/modeling/determinestrategy.jl +++ b/src/modeling/determinestrategy.jl @@ -19,8 +19,8 @@ function check_linear_parents(ls::LoopSet, op::Operation, s::Symbol) :add_fast, :(-), :vsub, - :sub_fast, - ), + :sub_fast + ) ) || return false for opp ∈ parents(op) check_linear_parents(ls, opp, s) || return false @@ -61,7 +61,13 @@ function unitstride(ls::LoopSet, op::Operation, s::Symbol) end true end -function cannot_shuffle(op::Operation, u₁::Symbol, u₂::Symbol, contigind::Symbol, indices) # assumes isvectorized and !unitstride +function cannot_shuffle( + op::Operation, + u₁::Symbol, + u₂::Symbol, + contigind::Symbol, + indices +) # assumes isvectorized and !unitstride !(( !rejectcurly(op) && ( ( @@ -77,7 +83,7 @@ function cost( (u₁, u₂)::Tuple{Symbol,Symbol}, vloopsym::Symbol, Wshift::Int, - size_T::Int = op.elementbytes, + size_T::Int = op.elementbytes ) isconstant(op) && return 0.0, 0, 1.0#Float64(length(loopdependencies(op)) > 0) isloopvalue(op) && return 0.0, 0, 0.0 @@ -92,14 +98,15 @@ function cost( elseif iscompute(op) && ( Base.sym_in( instruction(op).instr, - (:vadd_nsw, :vsub_nsw, :(+), :(-), :add_fast, :sub_fast), + (:vadd_nsw, :vsub_nsw, :(+), :(-), :add_fast, :sub_fast) ) && all(opp -> (isloopvalue(opp)), parents(op)) )# || (reg_count(ls) == 32) && (instruction(op).instr === :ifelse)) # all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op)) return 0.0, 0, 0.0 end opisvectorized = isvectorized(op) - srt, sl, srp = opisvectorized ? vector_cost(instr, Wshift, size_T) : scalar_cost(instr) + srt, sl, srp = + opisvectorized ? vector_cost(instr, Wshift, size_T) : scalar_cost(instr) if accesses_memory(op) # either vbroadcast/reductionstore, vmov(a/u)pd, or gather/scatter if opisvectorized @@ -111,8 +118,10 @@ function cost( # cannot shuffle false means reject curly # either false means shuffle dont_shuffle = - (Wshift > 3) || - (rejectinterleave(op) && (cannot_shuffle(op, u₁, u₂, contigind, indices))) + (Wshift > 3) || ( + rejectinterleave(op) && + (cannot_shuffle(op, u₁, u₂, contigind, indices)) + ) if dont_shuffle # offset = 0.0 # gather/scatter, alignment doesn't matter r = 1 << shifter @@ -146,9 +155,7 @@ end # Base._return_type() -function biggest_type_size(ls::LoopSet) - maximum(elsize, operations(ls)) -end +biggest_type_size(ls::LoopSet) = maximum(elsize, operations(ls)) function hasintersection(a, b) for aᵢ ∈ a, bᵢ ∈ b aᵢ === bᵢ && return true @@ -182,9 +189,10 @@ function evaluate_cost_unroll( order::Vector{Symbol}, vloopsym::Symbol, max_cost::Float64 = typemax(Float64), - sld::Vector{Vector{Symbol}} = store_load_deps(operations(ls)), + sld::Vector{Vector{Symbol}} = store_load_deps(operations(ls)) ) - included_vars = fill!(resize!(ls.included_vars, length(operations(ls))), false) + included_vars = + fill!(resize!(ls.included_vars, length(operations(ls))), false) nested_loop_syms = Symbol[]#Set{Symbol}() total_cost = 0.0 iter = 1.0 @@ -206,10 +214,12 @@ function evaluate_cost_unroll( # it must also be a subset of defined symbols loopdependencies(op) ⊆ nested_loop_syms || continue # hasintersection(reduceddependencies(op), nested_loop_syms) && return Inf - (isassigned(sld, id) && any(s -> (s ∉ sld[id]), nested_loop_syms)) && return Inf + (isassigned(sld, id) && any(s -> (s ∉ sld[id]), nested_loop_syms)) && + return Inf included_vars[id] = true # TODO: use actual unrolls here? - c = first(cost(ls, op, (Symbol(""), Symbol("")), vloopsym, Wshift, size_T)) + c = + first(cost(ls, op, (Symbol(""), Symbol("")), vloopsym, Wshift, size_T)) total_cost += iter * c 0.9total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest end @@ -227,13 +237,14 @@ function depchain_cost!( Wshift::Int, size_T::Int, rt::Float64 = 0.0, - sl::Int = 0, + sl::Int = 0 ) skip[identifier(op)] = true # depth first search for opp ∈ parents(op) skip[identifier(opp)] && continue - rt, sl = depchain_cost!(ls, skip, opp, unrolled, vloopsym, Wshift, size_T, rt, sl) + rt, sl = + depchain_cost!(ls, skip, opp, unrolled, vloopsym, Wshift, size_T, rt, sl) end # Basically assuming memory and compute don't conflict, but everything else does # Ie, ignoring the fact that integer and floating point operations likely don't either @@ -284,7 +295,8 @@ function unroll_no_reductions(ls, order, vloopsym) rpc = 0 # register pressure independent of unroll factor for op ∈ operations(ls) isu₁unrolled(op) || continue - rt, sl, rpop = cost(ls, op, (unrolled, Symbol("")), vloopsym, Wshift, size_T) + rt, sl, rpop = + cost(ls, op, (unrolled, Symbol("")), vloopsym, Wshift, size_T) if iscompute(op) compute_rt += rt compute_l += sl @@ -305,7 +317,11 @@ function unroll_no_reductions(ls, order, vloopsym) # if compute_rt > 40 # max(VectorizationBase.nextpow2( min( 4, round(Int, compute_rt / memory_rt) ) ), 1) # else - clamp(round(Int, compute_l / compute_rt), 1, Core.ifelse(compute_rt > 80, 2, 4)) + clamp( + round(Int, compute_l / compute_rt), + 1, + Core.ifelse(compute_rt > 80, 2, 4) + ) # end elseif iszero(load_rt) iszero(store_rt) ? 4 : max(1, min(4, round(Int, 2compute_rt / store_rt))) @@ -335,7 +351,8 @@ function unroll_no_reductions(ls, order, vloopsym) else reg_constraint = max(1, remaining_reg ÷ max(1, round(Int, rpp))) end - maybe_demote_unroll(ls, clamp(u, 1, reg_constraint), unrolled, vloopsym), unrolled + maybe_demote_unroll(ls, clamp(u, 1, reg_constraint), unrolled, vloopsym), + unrolled # rt = max(compute_rt, load_rt + store_rt) # # (iszero(rt) ? 4 : max(1, roundpow2( min( 4, round(Int, 16 / rt) ) ))), unrolled # (iszero(rt) ? 4 : max(1, VectorizationBase.nextpow2( min( 4, round(Int, 8 / rt) ) ))), unrolled @@ -344,7 +361,7 @@ function determine_unroll_factor( ls::LoopSet, order::Vector{Symbol}, unrolled::Symbol, - vloopsym::Symbol, + vloopsym::Symbol ) cacheunrolled!(ls, unrolled, Symbol(""), vloopsym) size_T = biggest_type_size(ls) @@ -361,7 +378,15 @@ function determine_unroll_factor( store_recip_throughput = 0.0 for op ∈ operations(ls) if isreduction(op) - rt, sl = depchain_cost!(ls, visited_nodes, op, unrolled, vloopsym, Wshift, size_T) + rt, sl = depchain_cost!( + ls, + visited_nodes, + op, + unrolled, + vloopsym, + Wshift, + size_T + ) if isouterreduction(ls, op) ≠ -1 || unrolled ∉ reduceddependencies(op) latency = max(sl, latency) end @@ -403,12 +428,17 @@ function demote_unroll_factor(ls::LoopSet, UF, loop::Loop) UF end -function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vloopsym::Symbol) +function determine_unroll_factor( + ls::LoopSet, + order::Vector{Symbol}, + vloopsym::Symbol +) num_reductions = count_reductions(ls) # The strategy is to use an unroll factor of 1, unless there appears to be loop carried dependencies (ie, num_reductions > 0) # The assumption here is that unrolling provides no real benefit, unless it is needed to enable OOO execution by breaking up these dependency chains loopindexesbit = ls.loopindexesbit - if iszero(length(loopindexesbit)) || ((!loopindexesbit[getloopid(ls, vloopsym)])) + if iszero(length(loopindexesbit)) || + ((!loopindexesbit[getloopid(ls, vloopsym)])) if iszero(num_reductions) return unroll_no_reductions(ls, order, vloopsym) else @@ -418,7 +448,8 @@ function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vloopsym::S return 8 ÷ ls.vector_width, vloopsym else # handle `BitArray` loops with reductions rttemp, ltemp = determine_unroll_factor(ls, order, vloopsym, vloopsym) - UF = min(8, VectorizationBase.nextpow2(max(1, round(Int, ltemp / (rttemp))))) + UF = + min(8, VectorizationBase.nextpow2(max(1, round(Int, ltemp / (rttemp))))) UFfactor = 8 ÷ ls.vector_width cld(UF, UFfactor) * UFfactor, vloopsym # UF2 = cld(UF, UFfactor)*UFfactor, vloopsym @@ -431,7 +462,7 @@ function determine_unroll_factor( ls::LoopSet, order::Vector{Symbol}, vloopsym::Symbol, - num_reductions::Int, + num_reductions::Int ) innermost_loop = last(order) rt = Inf @@ -442,7 +473,10 @@ function determine_unroll_factor( reject_reorder(ls, unrolled, false) && continue rttemp, ltemp = determine_unroll_factor(ls, order, unrolled, vloopsym) rtcomptemp = - rttemp + (0.01 * ((vloopsym === unrolled) + (unrolled === innermost_loop) - latency)) + rttemp + ( + 0.01 * + ((vloopsym === unrolled) + (unrolled === innermost_loop) - latency) + ) if rtcomptemp < rtcomp rt = rttemp rtcomp = rtcomptemp @@ -455,12 +489,18 @@ function determine_unroll_factor( if lrtratio ≥ 7.0 UF = 8 else - UF = VectorizationBase.nextpow2(round(Int, clamp(lrtratio, 1.0, 4.0), RoundUp)) + UF = + VectorizationBase.nextpow2(round(Int, clamp(lrtratio, 1.0, 4.0), RoundUp)) end UF = maybe_demote_unroll(ls, UF, best_unrolled, vloopsym) UF, best_unrolled end -function maybe_demote_unroll(ls::LoopSet, UF::Int, unrollsym::Symbol, vloopsym::Symbol)::Int +function maybe_demote_unroll( + ls::LoopSet, + UF::Int, + unrollsym::Symbol, + vloopsym::Symbol +)::Int if unrollsym === vloopsym return demote_unroll_factor(ls, UF, vloopsym) else @@ -513,12 +553,13 @@ function solve_unroll_lagrange( u₂L, u₁step::Int, u₂step::Int, - atleast31registers::Bool, + atleast31registers::Bool ) X₁, X₂, X₃, X₄ = X[1], X[2], X[3], X[4] # If we don't have opmask registers, masks probably occupy a vector register (e.g., on CPUs with AVX but not AVX512) R₁, R₂, R₃, R₄ = R[1], R[2], R[3], R[4] - iszero(R₃) || return solve_unroll_iter(X, R, u₁L, u₂L, u₁step:u₁step:10, u₂step:u₂step:10) + iszero(R₃) || + return solve_unroll_iter(X, R, u₁L, u₂L, u₁step:u₁step:10, u₂step:u₂step:10) RR = R₄ a = R₂^2 * X₃ - R₁ * X₄ * R₂ - R₁ * X₂ * RR b = R₁ * X₄ * RR - R₁ * X₄ * RR - 2X₃ * RR * R₂ @@ -530,9 +571,18 @@ function solve_unroll_lagrange( u₁float_finite = isfinite(u₁float) u₂float_finite = isfinite(u₂float) if !(u₁float_finite & u₂float_finite) # brute force - u₁high = Core.ifelse(iszero(X₃), u₁step, Core.ifelse(atleast31registers, 8, 6)) - u₂high = Core.ifelse(iszero(X₂), u₂step, Core.ifelse(atleast31registers, 8, 6)) - return solve_unroll_iter(X, R, u₁L, u₂L, u₁step:u₁step:u₁high, u₂step:u₂step:u₂high) + u₁high = + Core.ifelse(iszero(X₃), u₁step, Core.ifelse(atleast31registers, 8, 6)) + u₂high = + Core.ifelse(iszero(X₂), u₂step, Core.ifelse(atleast31registers, 8, 6)) + return solve_unroll_iter( + X, + R, + u₁L, + u₂L, + u₁step:u₁step:u₁high, + u₂step:u₂step:u₂high + ) end u₁low = floor(Int, u₁float) u₂low = max(u₂step, floor(Int, 0.8u₂float)) # must be at least 1 @@ -555,7 +605,7 @@ function solve_unroll_lagrange( u₁L, u₂L, reverse(u₁low:u₁step:u₁high), - reverse(u₂low:u₂step:u₂high), + reverse(u₂low:u₂step:u₂high) ) end @@ -585,10 +635,11 @@ function solve_unroll( u₂L, u₁step, u₂step, - atleast31registers::Bool, + atleast31registers::Bool ) # iszero(first(R)) && return -1,-1,Inf #solve_smalltilesize(X, R, u₁max, u₂max) - u₁, u₂, cost = solve_unroll_lagrange(X, R, u₁L, u₂L, u₁step, u₂step, atleast31registers) + u₁, u₂, cost = + solve_unroll_lagrange(X, R, u₁L, u₂L, u₁step, u₂step, atleast31registers) # u₂ -= u₂ & 1 # u₁ = min(u₁, u₂) u₁_too_large = u₁ > u₁max @@ -608,9 +659,7 @@ function solve_unroll( end u₁, u₂, cost end -function maybedemotesize(U::Int, N::Int) - num_iterations(N, num_iterations(N, U)) -end +maybedemotesize(U::Int, N::Int) = num_iterations(N, num_iterations(N, U)) function maybedemotesize(u₂::Int, N::Int, U::Int, Uloop::Loop, maxu₂base::Int) u₂ > 1 || return 1 u₂ == N && return u₂ @@ -631,7 +680,7 @@ function solve_unroll( reg_pressure::AbstractVector{Float64}, W::Int, vloopsym::Symbol, - rounduᵢ::Int, + rounduᵢ::Int ) (u₁step, u₂step) = if rounduᵢ == 1 # max is to safeguard against some weird arch I've never heard of. (clamp(cache_lnsze(ls) ÷ reg_size(ls), 1, 4), 1) @@ -657,7 +706,7 @@ function solve_unroll( u₂loop, u₁step, u₂step, - reg_count(ls) ≥ 31, + reg_count(ls) ≥ 31 ) end @@ -672,7 +721,7 @@ function solve_unroll( u₂loop::Loop, u₁step::Int, u₂step::Int, - atleast31registers::Bool, + atleast31registers::Bool ) maxu₂base = maxu₁base = atleast31registers ? 10 : 6#8 maxu₂ = maxu₂base#8 @@ -724,7 +773,7 @@ function solve_unroll( u₂Lf, u₁step, u₂step, - atleast31registers, + atleast31registers ) # heuristic to more evenly divide small numbers of iterations if isstaticloop(u₂loop) @@ -765,7 +814,12 @@ function loopdepindices(ls::LoopSet, op::Operation) end loopdepsret end -function stride_penalty(ls::LoopSet, op::Operation, order::Vector{Symbol}, loopfreqs) +function stride_penalty( + ls::LoopSet, + op::Operation, + order::Vector{Symbol}, + loopfreqs +) loopdeps = loopdepindices(ls, op) opstrides = Vector{Int}(undef, length(loopdeps)) # very minor stride assumption here, because we don't really want to base optimization decisions on it... @@ -931,7 +985,11 @@ function maxnegativeoffset(ls::LoopSet, op::Operation, u::Symbol) end mno, id end -function maxnegativeoffset(ls::LoopSet, op::Operation, unrollsyms::UnrollSymbols) +function maxnegativeoffset( + ls::LoopSet, + op::Operation, + unrollsyms::UnrollSymbols +) @unpack u₁loopsym, u₂loopsym, vloopsym = unrollsyms mno = typemin(Int) i = 0 @@ -960,7 +1018,7 @@ function load_elimination_cost_factor!( iters, unrollsyms::UnrollSymbols, Wshift, - size_T, + size_T ) @unpack u₁loopsym, u₂loopsym, vloopsym = unrollsyms if !iszero(first(isoptranslation(ls, op, unrollsyms))) @@ -1052,7 +1110,7 @@ function add_constant_offset_load_elmination_cost!( u₂reduces::Bool, Wshift::Int, size_T::Int, - opisininnerloop::Bool, + opisininnerloop::Bool ) @unpack u₁loopsym, u₂loopsym, vloopsym = unrollsyms offset, uid = maxnegativeoffset(ls, op, unrollsyms) @@ -1096,8 +1154,8 @@ function add_constant_offset_load_elmination_cost!( end end -function update_cost_vec!(costs, cost, u₁reduces, u₂reduces) - @inbounds if u₁reduces & u₂reduces +update_cost_vec!(costs, cost, u₁reduces, u₂reduces) = @inbounds if u₁reduces & + u₂reduces costs[4] += cost elseif u₂reduces # cost decreased by unrolling u₂loop costs[2] += cost @@ -1106,9 +1164,7 @@ function update_cost_vec!(costs, cost, u₁reduces, u₂reduces) else # no cost decrease; cost must be repeated costs[1] += cost end -end -function update_reg_pres!(rp, cost, u₁reduces, u₂reduces) - @inbounds if u₁reduces# & u₂reduces +update_reg_pres!(rp, cost, u₁reduces, u₂reduces) = @inbounds if u₁reduces# & u₂reduces rp[4] -= cost elseif u₂reduces # cost decreased by unrolling u₂loop rp[2] += cost @@ -1117,7 +1173,6 @@ function update_reg_pres!(rp, cost, u₁reduces, u₂reduces) else # no cost decrease; cost must be repeated rp[1] += cost end -end function child_dependent_u₁u₂(op::Operation) u₁ = u₂ = false for opc ∈ children(op) @@ -1130,13 +1185,20 @@ function evaluate_cost_tile( ls::LoopSet, order::Vector{Symbol}, unrollsyms::UnrollSymbols, - anyisbit::Bool = false, + anyisbit::Bool = false ) nops = length(operations(ls)) iters = Vector{Float64}(undef, nops) reduced_by_unrolling = Array{Bool}(undef, 2, 2, nops) fill_children!(ls) - evaluate_cost_tile!(iters, reduced_by_unrolling, ls, order, unrollsyms, anyisbit) + evaluate_cost_tile!( + iters, + reduced_by_unrolling, + ls, + order, + unrollsyms, + anyisbit + ) end function evaluate_cost_tile!( iters::Vector{Float64}, @@ -1146,7 +1208,7 @@ function evaluate_cost_tile!( unrollsyms::UnrollSymbols, anyisbit::Bool, sld::Vector{Vector{Symbol}} = store_load_deps(operations(ls)), - holdopinreg::Vector{Bool} = holdopinregister(ls), + holdopinreg::Vector{Bool} = holdopinregister(ls) ) N = length(order) @assert N ≥ 2 "Cannot tile merely $N loops!" @@ -1231,7 +1293,8 @@ function evaluate_cost_tile!( return 0, 0, Inf, false included_vars[id] = true if isconstant(op) - depends_on_u₁, depends_on_u₂ = isunrolled_sym(op, u₁loopsym, u₂loopsym, vloopsym) + depends_on_u₁, depends_on_u₂ = + isunrolled_sym(op, u₁loopsym, u₂loopsym, vloopsym) reduced_by_unrolling[1, 1, id] = !depends_on_u₁ reduced_by_unrolling[2, 1, id] = !depends_on_u₂ else @@ -1276,7 +1339,7 @@ function evaluate_cost_tile!( u₂reducesrp, Wshift, size_T, - opisininnerloop, + opisininnerloop ) continue elseif load_elimination_cost_factor!( @@ -1288,7 +1351,7 @@ function evaluate_cost_tile!( iters[id], unrollsyms, Wshift, - size_T, + size_T ) continue end @@ -1336,7 +1399,8 @@ function evaluate_cost_tile!( # @inbounds ((cost_vec[4] > 0) || ((cost_vec[2] > 0) & (cost_vec[3] > 0))) || return 0,0,Inf,false # reg_pres[4] == remaining_registers costpenalty = - ((reg_pressure[1] + reg_pressure[2] + reg_pressure[3]) > reg_pressure[4]) ? 2 : 1 + ((reg_pressure[1] + reg_pressure[2] + reg_pressure[3]) > reg_pressure[4]) ? + 2 : 1 u₁v = vloopsym === u₁loopsym u₂v = vloopsym === u₂loopsym visbit = anyisbit && ls.loopindexesbit[getloopid(ls, vloopsym)] @@ -1360,18 +1424,27 @@ function evaluate_cost_tile!( 1, 1, length(getloop(ls, u₁loopsym)), - length(getloop(ls, u₂loopsym)), + length(getloop(ls, u₂loopsym)) ) else - u₁, u₂, ucost = - solve_unroll(ls, u₁loopsym, u₂loopsym, cost_vec, reg_pressure, W, vloopsym, round_uᵢ) + u₁, u₂, ucost = solve_unroll( + ls, + u₁loopsym, + u₂loopsym, + cost_vec, + reg_pressure, + W, + vloopsym, + round_uᵢ + ) end outer_reduct_penalty = length(ls.outer_reductions) * (u₁ + isodd(u₁)) favor_bigger_u₂ = u₁ - u₂ # favor_smaller_vloopsym = (u₁v ? u₁ : -u₁) + (u₂v ? u₂ : -u₂) favor_smaller_vectorized = (u₁v ⊻ u₂v) ? (u₁v ? u₁ - u₂ : u₂ - u₁) : 0 favor_u₁_vectorized = -0.2u₁v - favoring_heuristics = favor_bigger_u₂ + 0.5favor_smaller_vectorized + favor_u₁_vectorized + favoring_heuristics = + favor_bigger_u₂ + 0.5favor_smaller_vectorized + favor_u₁_vectorized costpenalty = costpenalty * ucost + stride_penalty(ls, order) + @@ -1461,7 +1534,11 @@ function LoopOrders(ls::LoopSet) else reductsyms, nonreductsyms = outer_reduct_loopordersplit(ls) end - LoopOrders(nonreductsyms, reductsyms, Vector{Symbol}(undef, length(ls.loopsymbols))) + LoopOrders( + nonreductsyms, + reductsyms, + Vector{Symbol}(undef, length(ls.loopsymbols)) + ) end nonreductview(lo::LoopOrders) = view(lo.buff, 1:length(lo.syms_nr)) @@ -1503,7 +1580,7 @@ swap!(x::AbstractVector, i::Int, j::Int) = (x[j], x[i]) = (x[i], x[j]) function swap!( dest::AbstractVector{Symbol}, src::AbstractVector{Symbol}, - offs::AbstractVector{Int}, + offs::AbstractVector{Int} ) copyto!(dest, src) for i ∈ eachindex(offs) @@ -1539,7 +1616,7 @@ function choose_unroll_order( ls::LoopSet, lowest_cost::Float64 = Inf, sld::Vector{Vector{Symbol}} = store_load_deps(operations(ls)), - v::Int = 0, + v::Int = 0 ) iszero(length(offsetloadcollection(ls).opidcollectionmap)) && fill_offset_memop_collection!(ls) @@ -1565,8 +1642,6 @@ function choose_unroll_order( end end - - """ This function searches for unrolling combinations that will cause LoopVectorization to generate invalid code. @@ -1577,7 +1652,6 @@ But separate vectors for a reduced loop are not needed. Separate vectors will he so you want to unroll at least one of the loops. However, reductions demand combining all the separate vectors, and each vector also eats a valuable register, so it's best to avoid excessive numbers these accumulation vectors. - If a reduced op depends on both unrolled loops (u1 and u2), it will check over which of these it is reduced. If... neither: cannot avoid unrolling it along both one of them: don't unroll the reduced loop @@ -1632,7 +1706,7 @@ inlinedecision(inline::Int, shouldinline::Bool) = function choose_tile( ls::LoopSet, sld::Vector{Vector{Symbol}} = store_load_deps(operations(ls)), - v::Int = 0, + v::Int = 0 ) iszero(length(offsetloadcollection(ls).opidcollectionmap)) && fill_offset_memop_collection!(ls) @@ -1673,7 +1747,7 @@ function choose_tile( UnrollSymbols(newu₁, newu₂, new_vec), anyisbit, sld, - holdopinreg, + holdopinreg ) # if cost_temp < lowest_cost # leads to 4 vmovapds if cost_temp ≤ lowest_cost # lead to 2 vmovapds @@ -1724,7 +1798,8 @@ function choose_order_cost(ls::LoopSet, v::Int = 0) resize!(ls.loop_order, length(ls.loopsymbols)) sld = store_load_deps(operations(ls)) if (num_loops(ls) > 1) && (length(ls.operations) ≤ 100) - torder, tunroll, ttile, tvec, tU, tT, tc, shouldinline = choose_tile(ls, sld, v) + torder, tunroll, ttile, tvec, tU, tT, tc, shouldinline = + choose_tile(ls, sld, v) else torder = names(ls) # dummy tunroll = ttile = tvec = Symbol("##undefined##") # dummy diff --git a/src/modeling/graphs.jl b/src/modeling/graphs.jl index 5f38d4a07..8d41cf726 100644 --- a/src/modeling/graphs.jl +++ b/src/modeling/graphs.jl @@ -31,9 +31,12 @@ UnPack.unpack(ua::UnrollArgs, ::Val{:u₂loopsym}) = getfield(getfield(ua, :u₂loop), :itersymbol) UnPack.unpack(ua::UnrollArgs, ::Val{:vloopsym}) = getfield(getfield(ua, :vloop), :itersymbol) -UnPack.unpack(ua::UnrollArgs, ::Val{:u₁step}) = getfield(getfield(ua, :u₁loop), :step) -UnPack.unpack(ua::UnrollArgs, ::Val{:u₂step}) = getfield(getfield(ua, :u₂loop), :step) -UnPack.unpack(ua::UnrollArgs, ::Val{:vstep}) = getfield(getfield(ua, :vloop), :step) +UnPack.unpack(ua::UnrollArgs, ::Val{:u₁step}) = + getfield(getfield(ua, :u₁loop), :step) +UnPack.unpack(ua::UnrollArgs, ::Val{:u₂step}) = + getfield(getfield(ua, :u₂loop), :step) +UnPack.unpack(ua::UnrollArgs, ::Val{:vstep}) = + getfield(getfield(ua, :vloop), :step) struct UnrollSpecification u₁loopnum::Int @@ -52,7 +55,8 @@ end # UnrollSpecification(u₁loopnum, u₂loopnum, vloopnum, u₁, u₂) # end isunrolled1(us::UnrollSpecification, n::Int) = us.u₁loopnum == n -isunrolled2(us::UnrollSpecification, n::Int) = !isunrolled1(us, n) && us.u₂loopnum == n +isunrolled2(us::UnrollSpecification, n::Int) = + !isunrolled1(us, n) && us.u₂loopnum == n isvectorized(us::UnrollSpecification, n::Int) = us.vloopnum == n function unrollfactor(us::UnrollSpecification, n::Int) @unpack u₁loopnum, u₂loopnum, u₁, u₂ = us @@ -67,9 +71,11 @@ function pushexpr!(ex::Expr, mk::MaybeKnown) nothing end pushexpr!(ex::Expr, x::Union{Symbol,Expr}) = (push!(ex.args, x); nothing) -pushexpr!(ex::Expr, x::Integer) = (push!(ex.args, staticexpr(convert(Int, x))); nothing) +pushexpr!(ex::Expr, x::Integer) = + (push!(ex.args, staticexpr(convert(Int, x))); nothing) pushexpr!(ex::Expr, @nospecialize(x::StaticInt)) = (push!(ex.args, x); nothing) -MaybeKnown(x::Integer) = MaybeKnown(convert(Int, x), Symbol("##UNDEFINED##"), true) +MaybeKnown(x::Integer) = + MaybeKnown(convert(Int, x), Symbol("##UNDEFINED##"), true) MaybeKnown(x::Integer, default::Int) = MaybeKnown(x) MaybeKnown(x::Symbol, default::Int) = MaybeKnown(default, x, false) @@ -86,7 +92,7 @@ function Loop( stop::Union{Int,Symbol}, step::Union{Int,Symbol}, rangename::Symbol, - lensym::Symbol, + lensym::Symbol ) Loop( itersymbol, @@ -94,7 +100,7 @@ function Loop( MaybeKnown(stop, 1024), MaybeKnown(step, 1), rangename, - lensym, + lensym ) end startstopΔ(loop::Loop) = gethint(last(loop)) - gethint(first(loop)) @@ -110,8 +116,6 @@ Base.step(l::Loop) = getfield(l, :step) isstaticloop(l::Loop) = isknown(first(l)) & isknown(last(l)) & isknown(step(l)) unitstep(l::Loop) = isone(step(l)) - - function startloop(loop::Loop, itersymbol, staticinit::Bool = false) start = first(loop) if isknown(start) @@ -142,7 +146,12 @@ addexpr(a, b) = arithmeticexpr(+, :vadd_nsw, a, b) subexpr(a, b) = arithmeticexpr(-, :vsub_nsw, a, b) mulexpr(a, b) = arithmeticexpr(*, :vmul_nsw, a, b) lazymulexpr(a, b) = arithmeticexpr(*, :lazymul, a, b) -function arithmeticexpr(op, f, a::Union{Integer,MaybeKnown}, b::Union{Integer,MaybeKnown}) +function arithmeticexpr( + op, + f, + a::Union{Integer,MaybeKnown}, + b::Union{Integer,MaybeKnown} +) if isknown(a) & isknown(b) return staticexpr(op(gethint(a), gethint(b))) else @@ -213,7 +222,8 @@ function addexpr(ex, incr::Integer) pushexpr!(expr, convert(Int, incr)) expr end -staticmulincr(ptr, incr) = Expr(:call, lv(:staticmul), Expr(:call, :eltype, ptr), incr) +staticmulincr(ptr, incr) = + Expr(:call, lv(:staticmul), Expr(:call, :eltype, ptr), incr) @inline cmpend(i::Int, r::AbstractCloseOpen) = i < getfield(r, :upper) @inline cmpend(i::Int, r::AbstractUnitRange) = i ≤ last(r) @@ -237,14 +247,19 @@ function staticloopexpr(loop::Loop) s = step(loop) l = last(loop) if isone(s) - Expr(:call, GlobalRef(Base, :(:)), staticexpr(gethint(f)), staticexpr(gethint(l))) + Expr( + :call, + GlobalRef(Base, :(:)), + staticexpr(gethint(f)), + staticexpr(gethint(l)) + ) else Expr( :call, GlobalRef(Base, :(:)), staticexpr(gethint(f)), staticexpr(gethint(s)), - staticexpr(gethint(l)), + staticexpr(gethint(l)) ) end end @@ -256,7 +271,12 @@ function vec_looprange(loop::Loop, UF::Int, mangledname) vec_looprange(UF, mangledname, loop.rangesym, fast) end end -function vec_looprange(UF::Int, mangledname, r::Union{Expr,Symbol}, zerostart::Bool) +function vec_looprange( + UF::Int, + mangledname, + r::Union{Expr,Symbol}, + zerostart::Bool +) cmp = zerostart ? lv(:vcmpendzs) : lv(:vcmpend) if isone(UF) Expr(:call, cmp, mangledname, r, VECTORWIDTHSYMBOL) @@ -285,7 +305,7 @@ function terminatecondition( n::Int, mangledname::Symbol, inclmask::Bool, - UF::Int = unrollfactor(us, n), + UF::Int = unrollfactor(us, n) ) if !isvectorized(us, n) looprange(loop, UF, mangledname) @@ -301,7 +321,7 @@ function incrementloopcounter( n::Int, mangledname::Symbol, UF::Int, - l::Loop, + l::Loop ) incr = step(l) if isknown(incr) @@ -310,7 +330,12 @@ function incrementloopcounter( incrementloopcounter(us, n, mangledname, UF, getsym(incr)) end end -function incrementloopcounter(us::UnrollSpecification, n::Int, mangledname::Symbol, UF::Int) +function incrementloopcounter( + us::UnrollSpecification, + n::Int, + mangledname::Symbol, + UF::Int +) if isvectorized(us, n) if isone(UF) Expr(:(=), mangledname, addexpr(VECTORWIDTHSYMBOL, mangledname)) @@ -318,7 +343,7 @@ function incrementloopcounter(us::UnrollSpecification, n::Int, mangledname::Symb Expr( :(=), mangledname, - addexpr(mulexpr(VECTORWIDTHSYMBOL, staticexpr(UF)), mangledname), + addexpr(mulexpr(VECTORWIDTHSYMBOL, staticexpr(UF)), mangledname) ) end else @@ -330,16 +355,23 @@ function incrementloopcounter( n::Int, mangledname::Symbol, UF::Int, - incr::Symbol, + incr::Symbol ) if isvectorized(us, n) if isone(UF) - Expr(:(=), mangledname, addexpr(mulexpr(VECTORWIDTHSYMBOL, incr), mangledname)) + Expr( + :(=), + mangledname, + addexpr(mulexpr(VECTORWIDTHSYMBOL, incr), mangledname) + ) else Expr( :(=), mangledname, - addexpr(mulexpr(mulexpr(VECTORWIDTHSYMBOL, staticexpr(UF)), incr), mangledname), + addexpr( + mulexpr(mulexpr(VECTORWIDTHSYMBOL, staticexpr(UF)), incr), + mangledname + ) ) end else @@ -347,7 +379,13 @@ function incrementloopcounter( end end -function incrementloopcounter!(q, us::UnrollSpecification, n::Int, UF::Int, l::Loop) +function incrementloopcounter!( + q, + us::UnrollSpecification, + n::Int, + UF::Int, + l::Loop +) incr = step(l) if isknown(incr) incrementloopcounter!(q, us, n, UF * gethint(incr)) @@ -366,7 +404,13 @@ function incrementloopcounter!(q, us::UnrollSpecification, n::Int, UF::Int) push!(q.args, staticexpr(UF)) end end -function incrementloopcounter!(q, us::UnrollSpecification, n::Int, UF::Int, incr::Symbol) +function incrementloopcounter!( + q, + us::UnrollSpecification, + n::Int, + UF::Int, + incr::Symbol +) if isvectorized(us, n) if isone(UF) push!(q.args, mulexpr(VECTORWIDTHSYMBOL, incr)) @@ -404,12 +448,13 @@ function Base.resize!(lo::LoopOrder, N::Int) end Base.size(lo::LoopOrder) = (2, 2, 2, length(lo.loopnames)) Base.@propagate_inbounds Base.getindex(lo::LoopOrder, i::Int) = lo.oporder[i] -Base.@propagate_inbounds Base.getindex(lo::LoopOrder, i::Vararg{Int,K}) where {K} = - lo.oporder[LinearIndices(size(lo))[i...]] +Base.@propagate_inbounds Base.getindex( + lo::LoopOrder, + i::Vararg{Int,K} +) where {K} = lo.oporder[LinearIndices(size(lo))[i...]] @enum NumberType::Int8 HardInt HardFloat IntOrFloat INVALID - struct LoopStartStopManager terminators::Vector{Int} incrementedptrs::Vector{Vector{ArrayReferenceMeta}} @@ -466,16 +511,16 @@ function UnrollArgs( u₁::Int, unrollsyms::UnrollSymbols, u₂max::Int, - suffix::Int, + suffix::Int ) @unpack u₁loopsym, u₂loopsym, vloopsym = unrollsyms u₁loop = getloop(ls, u₁loopsym) - u₂loop = u₂loopsym === Symbol("##undefined##") ? u₁loop : getloop(ls, u₂loopsym) + u₂loop = + u₂loopsym === Symbol("##undefined##") ? u₁loop : getloop(ls, u₂loopsym) vloop = getloop(ls, vloopsym) UnrollArgs(u₁loop, u₂loop, vloop, u₁, u₂max, suffix) end - function cost_vec_buf(ls::LoopSet) cv = @view(ls.cost_vec[:, 2]) @inbounds for i ∈ 1:4 @@ -509,7 +554,12 @@ end available_registers() = ifelse(has_opmask_registers(), register_count(), register_count() - One()) function set_hw!(ls::LoopSet) - set_hw!(ls, Int(register_size()), Int(available_registers()), Int(cache_linesize())) + set_hw!( + ls, + Int(register_size()), + Int(available_registers()), + Int(cache_linesize()) + ) end reg_size(ls::LoopSet) = ls.register_size reg_count(ls::LoopSet) = ls.register_count @@ -609,27 +659,25 @@ function LoopSet(mod::Symbol) end """ - Used internally to create symbols unique for this loopset. - This is used so that identical loops will create identical `_turbo_!` calls in the macroexpansions, hopefully reducing recompilation. - """ +Used internally to create symbols unique for this loopset. +This is used so that identical loops will create identical `_turbo_!` calls in the macroexpansions, hopefully reducing recompilation. +""" gensym!(ls::LoopSet, s) = Symbol("###$(s)###$(ls.symcounter += 1)###") -function fill_children!(ls::LoopSet) - for op ∈ operations(ls) +fill_children!(ls::LoopSet) = for op ∈ operations(ls) empty!(children(op)) for opp ∈ parents(op) @assert children(opp) !== NOPARENTS push!(children(opp), op) end end -end function rejectinterleave!( ls::LoopSet, op::Operation, u₁loop::Symbol, u₂loop::Symbol, vloopsym::Symbol, - vloop::Loop, + vloop::Loop ) setunrolled!(ls, op, u₁loop, u₂loop, vloopsym) if accesses_memory(op) @@ -647,7 +695,12 @@ function rejectinterleave!( end end end -function cacheunrolled!(ls::LoopSet, u₁loop::Symbol, u₂loop::Symbol, vloopsym::Symbol) +function cacheunrolled!( + ls::LoopSet, + u₁loop::Symbol, + u₂loop::Symbol, + vloopsym::Symbol +) vloop = getloop(ls, vloopsym) for op ∈ operations(ls) rejectinterleave!(ls, op, u₁loop, u₂loop, vloopsym, vloop) @@ -658,7 +711,7 @@ function setunrolled!( op::Operation, u₁loopsym::Symbol, u₂loopsym::Symbol, - vectorized::Symbol, + vectorized::Symbol ) u₁::Bool = u₂::Bool = v::Bool = false for ld ∈ loopdependencies(op) @@ -735,9 +788,10 @@ end # looprangesym(ls::LoopSet, s::Symbol) = getloop(ls, s).rangesym """ - getop only works while construction a LoopSet object. You cannot use it while lowering. - """ -getop(ls::LoopSet, var::Number, elementbytes) = add_constant!(ls, var, elementbytes) +getop only works while construction a LoopSet object. You cannot use it while lowering. +""" +getop(ls::LoopSet, var::Number, elementbytes) = + add_constant!(ls, var, elementbytes) function getop(ls::LoopSet, var::Symbol, elementbytes::Int) get!(ls.opdict, var) do add_constant!(ls, var, elementbytes) @@ -777,7 +831,7 @@ function Operation( dependencies, reduced_deps, parents, - ref = NOTAREFERENCE, + ref = NOTAREFERENCE ) Operation( length(operations(ls)), @@ -788,7 +842,7 @@ function Operation( dependencies, reduced_deps, parents, - ref, + ref ) end function Operation( @@ -797,14 +851,24 @@ function Operation( elementbytes, instr, optype, - mpref::ArrayReferenceMetaPosition, + mpref::ArrayReferenceMetaPosition ) - Operation(length(operations(ls)), variable, elementbytes, instr, optype, mpref) + Operation( + length(operations(ls)), + variable, + elementbytes, + instr, + optype, + mpref + ) end operations(ls::LoopSet) = ls.operations -function getconstvalues(ls::LoopSet, opparents::Vector{Operation})::Tuple{Bool,Vector{Any}} +function getconstvalues( + ls::LoopSet, + opparents::Vector{Operation} +)::Tuple{Bool,Vector{Any}} vals = sizehint!(Any[], length(opparents)) for i ∈ eachindex(opparents) pushconstvalue!(vals, ls, opparents[i]) && return true, vals @@ -812,7 +876,11 @@ function getconstvalues(ls::LoopSet, opparents::Vector{Operation})::Tuple{Bool,V false, vals end -function add_constant_compute!(ls::LoopSet, op::Operation, var::Symbol)::Operation +function add_constant_compute!( + ls::LoopSet, + op::Operation, + var::Symbol +)::Operation op.node_type = constant instr = instruction(op) opparents = parents(op) @@ -827,8 +895,8 @@ function add_constant_compute!(ls::LoopSet, op::Operation, var::Symbol)::Operati :vfmadd_fast, :vfnmadd_fast, :vfmsub_fast, - :vfnmsub_fast, - ), + :vfnmsub_fast + ) ) getconstfailed, vals = getconstvalues(ls, opparents) if !getconstfailed @@ -855,25 +923,25 @@ function add_constant_compute!(ls::LoopSet, op::Operation, var::Symbol)::Operati return add_constant!( ls, T((big(vals[1]) * big(vals[2]) + big(vals[3]))), - 8, + 8 )::Operation elseif f === :vfnmadd_fast return add_constant!( ls, T(big(vals[3]) - big(vals[1]) * big(vals[2])), - 8, + 8 )::Operation elseif f === :vfmsub_fast return add_constant!( ls, T((big(vals[1]) * big(vals[2]) - big(vals[3]))), - 8, + 8 )::Operation elseif f === :vfnmsub_fast return add_constant!( ls, T(-(big(vals[1]) * big(vals[2]) + big(vals[3]))), - 8, + 8 )::Operation end end @@ -942,20 +1010,21 @@ add_loop_bound!( itersym::Symbol, bound::Union{Integer,Symbol}, upper::Bool, - step::Bool, + step::Bool )::MaybeKnown = MaybeKnown(bound, upper ? 1024 : 1) function add_loop_bound!( ls::LoopSet, itersym::Symbol, bound::Expr, upper::Bool, - step::Bool, + step::Bool )::MaybeKnown makestatic!(bound) N = gensym!( ls, - string(itersym) * - (upper ? "_loop_upper_bound" : (step ? "_loop_step" : "_loop_lower_bound")), + string(itersym) * ( + upper ? "_loop_upper_bound" : (step ? "_loop_step" : "_loop_lower_bound") + ) ) pushprepreamble!(ls, Expr(:(=), N, bound)) MaybeKnown(N, upper ? 1024 : 1) @@ -965,7 +1034,7 @@ function range_loop!( itersym::Symbol, l::MaybeKnown, u::MaybeKnown, - s::MaybeKnown, + s::MaybeKnown ) rangename = gensym!(ls, "range") lenname = gensym!(ls, "length") @@ -976,7 +1045,11 @@ function range_loop!( pushprepreamble!(ls, Expr(:(=), rangename, range)) pushprepreamble!( ls, - Expr(:(=), lenname, Expr(:call, GlobalRef(ArrayInterface, :static_length), rangename)), + Expr( + :(=), + lenname, + Expr(:call, GlobalRef(ArrayInterface, :static_length), rangename) + ) ) Loop(itersym, l, u, s, rangename, lenname) end @@ -1008,7 +1081,10 @@ function oneto_loop!(ls::LoopSet, r::Expr, itersym::Symbol)::Loop lensym = N = gensym!(ls, "loop" * string(itersym)) rangename = gensym!(ls, "range") pushprepreamble!(ls, Expr(:(=), N, otN)) - pushprepreamble!(ls, Expr(:(=), rangename, Expr(:call, :(:), staticexpr(1), N))) + pushprepreamble!( + ls, + Expr(:(=), rangename, Expr(:call, :(:), staticexpr(1), N)) + ) MaybeKnown(N, 1024) end Loop(itersym, l, u, s, rangename, lensym) @@ -1016,19 +1092,26 @@ end @inline _reverse(r) = maybestaticlast(r):-static_step(r):maybestaticfirst(r) @inline canonicalize_range(r::OptionallyStaticUnitRange) = r -@inline function canonicalize_range(r::OptionallyStaticRange, ::StaticInt{S}) where {S} +@inline function canonicalize_range( + r::OptionallyStaticRange, + ::StaticInt{S} +) where {S} ifelse(ArrayInterface.gt(StaticInt{S}(), Zero()), r, _reverse(r)) end -@inline canonicalize_range(r::OptionallyStaticRange, s::Integer) = s > 0 ? r : _reverse(r) +@inline canonicalize_range(r::OptionallyStaticRange, s::Integer) = + s > 0 ? r : _reverse(r) @inline canonicalize_range(r::AbstractCloseOpen) = r -@inline canonicalize_range(r::AbstractUnitRange) = maybestaticfirst(r):maybestaticlast(r) -@inline canonicalize_range(r::OptionallyStaticRange) = canonicalize_range(r, static_step(r)) +@inline canonicalize_range(r::AbstractUnitRange) = + maybestaticfirst(r):maybestaticlast(r) +@inline canonicalize_range(r::OptionallyStaticRange) = + canonicalize_range(r, static_step(r)) @inline canonicalize_range(r::AbstractRange) = canonicalize_range(maybestaticfirst(r):static_step(r):maybestaticlast(r)) @inline canonicalize_range(r::StepRange{T,T}) where {T<:Base.BitInteger} = r @inline canonicalize_range(r::CartesianIndices) = CartesianIndices(map(canonicalize_range, r.indices)) -@inline canonicalize_range(r::Base.OneTo{U}) where {U<:Unsigned} = One():(last(r)%Int) +@inline canonicalize_range(r::Base.OneTo{U}) where {U<:Unsigned} = + One():(last(r)%Int) function canonicalize_range(x) throw( @@ -1049,8 +1132,8 @@ function canonicalize_range(x) ... end ``` -""", - ), +""" + ) ) end @@ -1058,7 +1141,7 @@ function misc_loop!( ls::LoopSet, r::Union{Expr,Symbol}, itersym::Symbol, - staticstepone::Bool, + staticstepone::Bool )::Loop rangename = gensym!(ls, "looprange" * string(itersym)) lenname = gensym!(ls, "looplen" * string(itersym)) @@ -1067,26 +1150,41 @@ function misc_loop!( Expr( :(=), rangename, - Expr(:call, lv(:canonicalize_range), :(@inbounds $(makestatic!(r)))), - ), + Expr(:call, lv(:canonicalize_range), :(@inbounds $(makestatic!(r)))) + ) ) pushprepreamble!( ls, - Expr(:(=), lenname, Expr(:call, GlobalRef(ArrayInterface, :static_length), rangename)), + Expr( + :(=), + lenname, + Expr(:call, GlobalRef(ArrayInterface, :static_length), rangename) + ) ) L = add_loop_bound!( ls, itersym, Expr(:call, lv(:maybestaticfirst), rangename), false, - false, + false + ) + U = add_loop_bound!( + ls, + itersym, + Expr(:call, lv(:maybestaticlast), rangename), + true, + false ) - U = - add_loop_bound!(ls, itersym, Expr(:call, lv(:maybestaticlast), rangename), true, false) if staticstepone Loop(itersym, L, U, MaybeKnown(1), rangename, lenname) else - S = add_loop_bound!(ls, itersym, Expr(:call, lv(:static_step), rangename), false, true) + S = add_loop_bound!( + ls, + itersym, + Expr(:call, lv(:static_step), rangename), + false, + true + ) Loop(itersym, L, U, S, rangename, lenname) end end @@ -1114,8 +1212,13 @@ function indices_loop!(ls::LoopSet, r::Expr, itersym::Symbol)::Loop Expr( :(=), axsym, - Expr(:call, GlobalRef(ArrayInterface, :axes), a_s, staticexpr(dims::Int)), - ), + Expr( + :call, + GlobalRef(ArrayInterface, :axes), + a_s, + staticexpr(dims::Int) + ) + ) ) if n > 1 axsym_prev = axessyms[n-1] @@ -1128,9 +1231,13 @@ function indices_loop!(ls::LoopSet, r::Expr, itersym::Symbol)::Loop :call, GlobalRef(Base, :(==)), Expr(:call, GlobalRef(ArrayInterface, :static_first), axsym), - Expr(:call, GlobalRef(ArrayInterface, :static_first), axsym_prev), - ), - ), + Expr( + :call, + GlobalRef(ArrayInterface, :static_first), + axsym_prev + ) + ) + ) ) pushprepreamble!( ls, @@ -1141,9 +1248,13 @@ function indices_loop!(ls::LoopSet, r::Expr, itersym::Symbol)::Loop :call, GlobalRef(Base, :(==)), Expr(:call, GlobalRef(ArrayInterface, :static_last), axsym), - Expr(:call, GlobalRef(ArrayInterface, :static_last), axsym_prev), - ), - ), + Expr( + :call, + GlobalRef(ArrayInterface, :static_last), + axsym_prev + ) + ) + ) ) end end @@ -1166,8 +1277,13 @@ function indices_loop!(ls::LoopSet, r::Expr, itersym::Symbol)::Loop Expr( :(=), axsym, - Expr(:call, GlobalRef(ArrayInterface, :axes), a_s, staticexpr(mdim)), - ), + Expr( + :call, + GlobalRef(ArrayInterface, :axes), + a_s, + staticexpr(mdim) + ) + ) ) if n > 1 axsym_prev = axessyms[n-1] @@ -1180,9 +1296,13 @@ function indices_loop!(ls::LoopSet, r::Expr, itersym::Symbol)::Loop :call, GlobalRef(Base, :(==)), Expr(:call, GlobalRef(ArrayInterface, :static_first), axsym), - Expr(:call, GlobalRef(ArrayInterface, :static_first), axsym_prev), - ), - ), + Expr( + :call, + GlobalRef(ArrayInterface, :static_first), + axsym_prev + ) + ) + ) ) pushprepreamble!( ls, @@ -1193,9 +1313,13 @@ function indices_loop!(ls::LoopSet, r::Expr, itersym::Symbol)::Loop :call, GlobalRef(Base, :(==)), Expr(:call, GlobalRef(ArrayInterface, :static_last), axsym), - Expr(:call, GlobalRef(ArrayInterface, :static_last), axsym_prev), - ), - ), + Expr( + :call, + GlobalRef(ArrayInterface, :static_last), + axsym_prev + ) + ) + ) ) end end @@ -1207,8 +1331,8 @@ function indices_loop!(ls::LoopSet, r::Expr, itersym::Symbol)::Loop end """ - This function creates a loop, while switching from 1 to 0 based indices - """ +This function creates a loop, while switching from 1 to 0 based indices +""" function register_single_loop!(ls::LoopSet, looprange::Expr) itersym = (looprange.args[1])::Symbol r = looprange.args[2] @@ -1277,7 +1401,9 @@ function instruction!(ls::LoopSet, x::Expr) end # if x.head ≢ :(->) instr = last(x.args).value - isa(instr, Symbol) && instr ∈ keys(COST) && return Instruction(:LoopVectorization, instr) + isa(instr, Symbol) && + instr ∈ keys(COST) && + return Instruction(:LoopVectorization, instr) # end instr = gensym!(ls, "f") pushprepreamble!(ls, Expr(:(=), instr, x)) @@ -1292,13 +1418,12 @@ function instruction!(ls::LoopSet, f::F) where {F<:Function} end end - function maybe_const_compute!( ls::LoopSet, LHS::Symbol, op::Operation, elementbytes::Int, - position::Int, + position::Int ) # return op if iscompute(op) && iszero(length(loopdependencies(op))) @@ -1308,21 +1433,22 @@ function maybe_const_compute!( ls.loopsymbols[1:position], gensym!(ls, instruction(op).instr), elementbytes, - :numericconstant, + :numericconstant ) else # op.dependencies = ls.loopsymbols[1:position] op end end -strip_op_linenumber_nodes(q::Expr) = only(filter(x -> !isa(x, LineNumberNode), q.args)) +strip_op_linenumber_nodes(q::Expr) = + only(filter(x -> !isa(x, LineNumberNode), q.args)) function add_operation!( ls::LoopSet, LHS::Symbol, RHS::Symbol, elementbytes::Int, - position::Int, + position::Int ) add_constant!(ls, RHS, ls.loopsymbols[1:position], LHS, elementbytes) end @@ -1331,7 +1457,7 @@ function add_comparison!( LHS::Symbol, RHS::Expr, elementbytes::Int, - position::Int, + position::Int ) Nargs = length(RHS.args) @assert (Nargs ≥ 5) & isodd(Nargs) @@ -1340,34 +1466,41 @@ function add_comparison!( gensym!(ls, "leftcmp"), RHS.args[1], elementbytes, - position, + position )::Operation p2 = add_assignment!( ls, gensym!(ls, "middlecmp"), RHS.args[3], elementbytes, - position, + position )::Operation cmpname = Nargs == 3 ? LHS : gensym!(ls, "cmp") - cmp = add_compute!(ls, cmpname, RHS.args[2], Operation[p1, p2], elementbytes)::Operation + cmp = add_compute!( + ls, + cmpname, + RHS.args[2], + Operation[p1, p2], + elementbytes + )::Operation for i ∈ 5:2:Nargs pnew = add_assignment!( ls, gensym!(ls, "rightcmp"), RHS.args[i], elementbytes, - position, + position )::Operation cmpchain = add_compute!( ls, gensym!(ls, "cmpchain"), RHS.args[i-1], Operation[p2, pnew], - elementbytes, + elementbytes )::Operation cmpname = Nargs == i ? LHS : gensym!(ls, "cmp") - cmp = add_compute!(ls, cmpname, :&, [cmp, cmpchain], elementbytes)::Operation + cmp = + add_compute!(ls, cmpname, :&, [cmp, cmpchain], elementbytes)::Operation p2 = pnew end return cmp @@ -1377,7 +1510,7 @@ function add_operation!( LHS::Symbol, RHS::Expr, elementbytes::Int, - position::Int, + position::Int ) if RHS.head === :ref add_load_ref!(ls, LHS, RHS, elementbytes) @@ -1393,12 +1526,15 @@ function add_operation!( ls.loopsymbols[1:position], LHS, elementbytes, - :numericconstant, + :numericconstant ) if f === :zero push!(ls.preamble_zeros, (identifier(op), IntOrFloat)) else - push!(ls.preamble_funcofeltypes, (identifier(op), reduction_zero_class(f))) + push!( + ls.preamble_funcofeltypes, + (identifier(op), reduction_zero_class(f)) + ) end op else @@ -1408,7 +1544,13 @@ function add_operation!( elseif RHS.head === :if add_if!(ls, LHS, RHS, elementbytes, position) elseif RHS.head === :block - add_operation!(ls, LHS, strip_op_linenumber_nodes(RHS), elementbytes, position) + add_operation!( + ls, + LHS, + strip_op_linenumber_nodes(RHS), + elementbytes, + position + ) elseif RHS.head === :(.) c = gensym!(ls, "getproperty") pushprepreamble!(ls, Expr(:(=), c, RHS)) @@ -1430,12 +1572,17 @@ function add_operation!( RHS::Expr, LHS_ref::ArrayReferenceMetaPosition, elementbytes::Int, - position::Int, + position::Int ) if RHS.head === :ref# || (RHS.head === :call && first(RHS.args) === :getindex) array, rawindices = ref_from_expr!(ls, RHS) - RHS_ref = - array_reference_meta!(ls, array, rawindices, elementbytes, gensym!(ls, LHS_sym)) + RHS_ref = array_reference_meta!( + ls, + array, + rawindices, + elementbytes, + gensym!(ls, LHS_sym) + ) op = add_load!(ls, RHS_ref, elementbytes) add_compute!(ls, LHS_sym, :identity, [op], elementbytes) # pushfirst!(LHS_ref.parents, iop) @@ -1451,13 +1598,16 @@ function add_operation!( ls.loopsymbols[1:position], LHS_sym, elementbytes, - :numericconstant, + :numericconstant ) # op = add_constant!(ls, c, Symbol[], LHS_sym, elementbytes, :numericconstant) if f === :zero push!(ls.preamble_zeros, (identifier(op), IntOrFloat)) else - push!(ls.preamble_funcofeltypes, (identifier(op), reduction_zero_class(f))) + push!( + ls.preamble_funcofeltypes, + (identifier(op), reduction_zero_class(f)) + ) end op else @@ -1466,7 +1616,13 @@ function add_operation!( elseif RHS.head === :if add_if!(ls, LHS_sym, RHS, elementbytes, position, LHS_ref) elseif RHS.head === :block - add_operation!(ls, LHS_sym, strip_op_linenumber_nodes(RHS), elementbytes, position) + add_operation!( + ls, + LHS_sym, + strip_op_linenumber_nodes(RHS), + elementbytes, + position + ) elseif RHS.head === :(.) c = gensym!(ls, "getproperty") pushpreamble!(ls, Expr(:(=), c, RHS)) @@ -1487,7 +1643,7 @@ function prepare_rhs_for_storage!( array, rawindices, elementbytes::Int, - position::Int, + position::Int )::Operation RHS isa Symbol && return add_store!(ls, RHS, array, rawindices, elementbytes) mpref = array_reference_meta!(ls, array, rawindices, elementbytes) @@ -1498,12 +1654,19 @@ function prepare_rhs_for_storage!( mpref.parents = cachedparents op = add_store!(ls, mpref, elementbytes) if lrhs ∈ keys(ls.opdict) - ls.syms_aliasing_refs[findfirst(==(mpref.mref), ls.refs_aliasing_syms)] = lrhs + ls.syms_aliasing_refs[findfirst(==(mpref.mref), ls.refs_aliasing_syms)] = + lrhs end return op end -function unpack_tuple!(ls::LoopSet, LHS::Expr, RHS, elementbytes::Int, position::Int) +function unpack_tuple!( + ls::LoopSet, + LHS::Expr, + RHS, + elementbytes::Int, + position::Int +) if Meta.isexpr(RHS, :tuple) for i ∈ eachindex(LHS.args) add_assignment!(ls, LHS.args[i], RHS.args[i], elementbytes, position) @@ -1517,7 +1680,7 @@ function unpack_tuple!(ls::LoopSet, LHS::Expr, RHS, elementbytes::Int, position: lhstemp, add_operation!(ls, lhstemp, RHS, elementbytes, position), elementbytes, - position, + position )] unpack_tuple!(ls, LHS, vparents, elementbytes, position) end @@ -1527,7 +1690,7 @@ function unpack_tuple!( LHS::Expr, vparents::Vector{Operation}, elementbytes::Int, - position::Int, + position::Int ) for i ∈ eachindex(LHS.args) f = EXTRACTFUNS[i] @@ -1551,14 +1714,20 @@ function unpack_tuple!( throw( LoopError( "Unpacking the above expression in the left hand side was not understood/supported.", - lhsi, - ), + lhsi + ) ) end first(vparents) end -function add_assignment!(ls::LoopSet, LHS, RHS, elementbytes::Int, position::Int) +function add_assignment!( + ls::LoopSet, + LHS, + RHS, + elementbytes::Int, + position::Int +) if LHS isa Symbol if RHS isa Expr maybe_const_compute!( @@ -1566,7 +1735,7 @@ function add_assignment!(ls::LoopSet, LHS, RHS, elementbytes::Int, position::Int LHS, add_operation!(ls, LHS, RHS, elementbytes, position), elementbytes, - position, + position ) else add_constant!(ls, RHS, ls.loopsymbols[1:position], LHS, elementbytes) @@ -1579,7 +1748,14 @@ function add_assignment!(ls::LoopSet, LHS, RHS, elementbytes::Int, position::Int # need to check if LHS appears in RHS # assign RHS to lrhs array, rawindices = ref_from_expr!(ls, LHS) - prepare_rhs_for_storage!(ls, RHS, array, rawindices, elementbytes, position) + prepare_rhs_for_storage!( + ls, + RHS, + array, + rawindices, + elementbytes, + position + ) else add_store_ref!(ls, RHS, LHS, elementbytes) # is this necessary? (Extension API?) end @@ -1589,8 +1765,8 @@ function add_assignment!(ls::LoopSet, LHS, RHS, elementbytes::Int, position::Int throw( LoopError( "LHS not understood; only `:ref`s and `:tuple`s are currently supported.", - LHS, - ), + LHS + ) ) end else @@ -1603,7 +1779,7 @@ function push_op!( ex::Expr, elementbytes::Int, position::Int, - mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing, + mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing )::Operation if ex.head === :call finex = first(ex.args)::Symbol @@ -1615,7 +1791,7 @@ function push_op!( array, rawindices, elementbytes, - position, + position ) else throw(LoopError("Don't know how to handle expression.", finex)) @@ -1655,7 +1831,7 @@ function Base.push!( ex::Expr, elementbytes::Int, position::Int, - mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing, + mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing ) if ex.head === :block add_block!(ls, ex, elementbytes, position) @@ -1672,7 +1848,7 @@ function UnrollSpecification( u₂loop::Symbol, vloopsym::Symbol, u₁, - u₂, + u₂ ) order = names(ls) nu₁ = findfirst(Base.Fix2(===, u₁loop), order)::Int @@ -1682,10 +1858,10 @@ function UnrollSpecification( end """ - looplengthprod(ls::LoopSet) +looplengthprod(ls::LoopSet) - Convert to `Float64` for the sake of non-64 bit platforms. - """ +Convert to `Float64` for the sake of non-64 bit platforms. +""" function looplengthprod(ls::LoopSet) l = 1.0 for loop ∈ ls.loops @@ -1695,7 +1871,6 @@ function looplengthprod(ls::LoopSet) end # prod(Float64 ∘ length, ls.loops) - function looplength(ls::LoopSet, s::Symbol) # search_tree(parents(operations(ls)[i]), name(op)) && return true id = getloopid_or_nothing(ls, s) @@ -1722,9 +1897,13 @@ function looplength(ls::LoopSet, s::Symbol) end end -function accept_reorder_according_to_tracked_reductions(ls::LoopSet, reordered::Symbol) +function accept_reorder_according_to_tracked_reductions( + ls::LoopSet, + reordered::Symbol +) for op ∈ operations(ls) - if (reordered ∈ loopdependencies(op)) && !(iscompute(op) & iszero(length(children(op)))) + if (reordered ∈ loopdependencies(op)) && + !(iscompute(op) & iszero(length(children(op)))) for opp ∈ parents(op) (iscompute(opp) && isanouterreduction(ls, opp)) && return 0x00 end @@ -1762,7 +1941,7 @@ function check_valid_reorder_dims!(ls::LoopSet) firstoff = opiref.offsets[l] maxdiff = max( checkmismatch(ops, opidsᵢ, l, firstoff, 2:length(opidsᵢ)), - checkmismatch(ops, opidsⱼ, l, firstoff, 1:length(opidsⱼ)), + checkmismatch(ops, opidsⱼ, l, firstoff, 1:length(opidsⱼ)) ) if maxdiff ≥ (isknown(step(loopk)) ? abs(gethint(step(loopk))) : 1) validreorder[k] = 0x00#0x01 @@ -1782,7 +1961,7 @@ function checkmismatch( opids::Vector{Int}, l::Int, firstoff::Int8, - checkrange::UnitRange{Int}, + checkrange::UnitRange{Int} ) maxabsdiff = 0 for m ∈ checkrange @@ -1798,7 +1977,8 @@ function fill_offset_memop_collection!(ls::LoopSet) omop = offsetloadcollection(ls) ops = operations(ls) num_ops = length(ops) - @unpack opids, opidcollectionmap, batchedcollections, batchedcollectionmap = omop + @unpack opids, opidcollectionmap, batchedcollections, batchedcollectionmap = + omop length(opidcollectionmap) == 0 || return resize!(opidcollectionmap, num_ops) fill!(opidcollectionmap, (0, 0)) @@ -1871,7 +2051,7 @@ function fill_offset_memop_collection!(ls::LoopSet) collen = length(collectionⱼ) collen ≤ 1 && continue # we have multiple, easiest to process if we sort them - sort!(collectionⱼ, by = last) + sort!(collectionⱼ; by = last) istart = 1 ostart = last(first(collectionⱼ)) oprev = ostart @@ -1890,7 +2070,7 @@ function fill_offset_memop_collection!(ls::LoopSet) ops, collectionⱼ, istart, - i - 1, + i - 1 ) end # restart istart and ostart @@ -1906,7 +2086,7 @@ function fill_offset_memop_collection!(ls::LoopSet) ops, collectionⱼ, istart, - collen, + collen ) end end @@ -1921,7 +2101,7 @@ function pushbatchedcollection!( ops, collectionⱼ, istart, - istop, + istop ) colview = view(collectionⱼ, istart:istop) push!(batchedcollections, colview) @@ -1933,14 +2113,15 @@ function pushbatchedcollection!( end """ - Returns `0` if the op is the declaration of the constant outerreduction variable. - Returns `n`, where `n` is the constant declarations's index among parents(op), if op is an outter reduction. - Returns `-1` if not an outerreduction. - """ +Returns `0` if the op is the declaration of the constant outerreduction variable. +Returns `n`, where `n` is the constant declarations's index among parents(op), if op is an outter reduction. +Returns `-1` if not an outerreduction. +""" function isouterreduction(ls::LoopSet, op::Operation) if isconstant(op) # equivalent to checking if length(loopdependencies(op)) == 0 instr = op.instruction - instr == LOOPCONSTANT && return Core.ifelse(length(loopdependencies(op)) == 0, 0, -1) + instr == LOOPCONSTANT && + return Core.ifelse(length(loopdependencies(op)) == 0, 0, -1) instr.mod === GLOBALCONSTANT && return -1 ops = operations(ls) for or ∈ ls.outer_reductions diff --git a/src/modeling/operations.jl b/src/modeling/operations.jl index 36b6c8ed0..51b5b9509 100644 --- a/src/modeling/operations.jl +++ b/src/modeling/operations.jl @@ -1,10 +1,9 @@ const DISCONTIGUOUS = Symbol("##DISCONTIGUOUSSUBARRAY##") const CONSTANTZEROINDEX = Symbol("##CONSTANTZEROINDEX##") -const LOOPCONSTANT = Instruction(:LoopVectorization, Symbol("LOOPCONSTANTINSTRUCTION")) +const LOOPCONSTANT = + Instruction(:LoopVectorization, Symbol("LOOPCONSTANTINSTRUCTION")) const GLOBALCONSTANT = Symbol("##GLOBAL##CONSTANT##") - - """ ArrayReference @@ -75,15 +74,27 @@ struct OffsetLoadCollection # offsets::Vector{Vector{Vector{Int8}}} opidcollectionmap::Vector{Tuple{Int,Int}} batchedcollections::Vector{ - SubArray{Tuple{Int,Int},1,Vector{Tuple{Int,Int}},Tuple{UnitRange{Int}},true}, + SubArray{ + Tuple{Int,Int}, + 1, + Vector{Tuple{Int,Int}}, + Tuple{UnitRange{Int}}, + true + } } batchedcollectionmap::Vector{Tuple{Int,Int}} function OffsetLoadCollection() new( Vector{Int}[], Tuple{Int,Int}[], - SubArray{Tuple{Int,Int},1,Vector{Tuple{Int,Int}},Tuple{UnitRange{Int}},true}[], - Tuple{Int,Int}[], + SubArray{ + Tuple{Int,Int}, + 1, + Vector{Tuple{Int,Int}}, + Tuple{UnitRange{Int}}, + true + }[], + Tuple{Int,Int}[] ) end end @@ -126,13 +137,21 @@ abstract type AbstractLoopOperation end memstore loopvalue end -"An operation setting a variable to a constant value (e.g., `a = 0.0`)" +""" +An operation setting a variable to a constant value (e.g., `a = 0.0`) +""" constant -"An operation setting a variable from a memory location (e.g., `a = A[i,j]`)" +""" +An operation setting a variable from a memory location (e.g., `a = A[i,j]`) +""" memload -"An operation computing a new value from one or more variables (e.g., `a = b + c`)" +""" +An operation computing a new value from one or more variables (e.g., `a = b + c`) +""" compute -"An operation storing a value to a memory location (e.g., `A[i,j] = a`)" +""" +An operation storing a value to a memory location (e.g., `A[i,j] = a`) +""" memstore """ `loopvalue` indicates an loop variable (`i` in `for i in ...`). These are the "parents" of `compute` @@ -223,7 +242,7 @@ mutable struct Operation <: AbstractLoopOperation reduced_deps::Vector{Symbol} = Symbol[], parents::Vector{Operation} = Operation[], ref::ArrayReferenceMeta = NOTAREFERENCE, - reduced_children::Vector{Symbol} = Symbol[], + reduced_children::Vector{Symbol} = Symbol[] ) new( identifier, @@ -237,7 +256,7 @@ mutable struct Operation <: AbstractLoopOperation Operation[], ref, Symbol("##", variable, :_), - reduced_children, + reduced_children ) end end @@ -310,7 +329,10 @@ function Base.show(io::IO, op::Operation) elseif isload(op) print(io, Expr(:(=), op.variable, ref_for_print(op))) elseif iscompute(op) - print(io, Expr(:(=), op.variable, callexpr(op.instruction, map(name, parents(op))))) + print( + io, + Expr(:(=), op.variable, callexpr(op.instruction, map(name, parents(op)))) + ) elseif isstore(op) print(io, Expr(:(=), ref_for_print(op), name(first(parents(op))))) elseif isloopvalue(op) @@ -349,7 +371,8 @@ name(x::ArrayReference) = x.array name(x::ArrayReferenceMeta) = x.ref.array name(op::Operation) = op.variable instruction(op::Operation) = op.instruction -isreductcombineinstr(op::Operation) = iscompute(op) && isreductcombineinstr(instruction(op)) +isreductcombineinstr(op::Operation) = + iscompute(op) && isreductcombineinstr(instruction(op)) """ mvar = mangledvar(op) @@ -357,12 +380,12 @@ Returns the mangled variable name, for use in the produced expressions. These names will be further processed if op is tiled and/or unrolled. ```julia - if tiled ∈ loopdependencies(op) # `suffix` is tilenumber - mvar = Symbol(op, suffix, :_) - end - if unrolled ∈ loopdependencies(op) # `u` is unroll number - mvar = Symbol(op, u) - end +if tiled ∈ loopdependencies(op) # `suffix` is tilenumber + mvar = Symbol(op, suffix, :_) +end +if unrolled ∈ loopdependencies(op) # `u` is unroll number + mvar = Symbol(op, u) +end ``` """ mangledvar(op::Operation) = op.mangledvariable @@ -383,7 +406,7 @@ function Operation( elementbytes::Int, instr, optype::OperationType, - mpref::ArrayReferenceMetaPosition, + mpref::ArrayReferenceMetaPosition ) Operation( id, @@ -394,16 +417,22 @@ function Operation( mpref.loopdependencies, mpref.reduceddeps, mpref.parents, - mpref.mref, + mpref.mref ) end -Base.:(==)(x::ArrayReferenceMetaPosition, y::ArrayReferenceMetaPosition) = x.mref == y.mref +Base.:(==)(x::ArrayReferenceMetaPosition, y::ArrayReferenceMetaPosition) = + x.mref == y.mref parents(op::ArrayReferenceMetaPosition) = op.parents # Avoid memory allocations by using this for ops that aren't references const NOTAREFERENCE = ArrayReferenceMeta(ArrayReference(Symbol(""), Symbol[]), Bool[], Symbol("")) -const NOTAREFERENCEMP = - ArrayReferenceMetaPosition(NOTAREFERENCE, NOPARENTS, Symbol[], Symbol[], Symbol("")) +const NOTAREFERENCEMP = ArrayReferenceMetaPosition( + NOTAREFERENCE, + NOPARENTS, + Symbol[], + Symbol[], + Symbol("") +) varname(::Nothing) = nothing varname(mpref::ArrayReferenceMetaPosition) = mpref.varname name(mpref::ArrayReferenceMetaPosition) = name(mpref.mref.ref) @@ -480,7 +509,8 @@ function ifelse_reduce_fun_expr(f::Symbol, op::Operation) lvcmp_instr = lv(instruction(cmp).instr) if success lvf = lv(f) - return not ? Expr(:call, lvf, :($(!) ∘ $lvcmp_instr)) : Expr(:call, lvf, lvcmp_instr) + return not ? Expr(:call, lvf, :($(!) ∘ $lvcmp_instr)) : + Expr(:call, lvf, lvcmp_instr) end options = children(cmp) for oop ∈ options @@ -488,7 +518,9 @@ function ifelse_reduce_fun_expr(f::Symbol, op::Operation) _cmp, _cmpa, _cmpb, _not, _success = find_cmp_args_from_ifelse(oop) _success || continue lvf = lv(Symbol(f, :Mirror)) - expr = not ? Expr(:call, lvf, :($(!) ∘ $lvcmp_instr)) : Expr(:call, lvf, lvcmp_instr) + expr = + not ? Expr(:call, lvf, :($(!) ∘ $lvcmp_instr)) : + Expr(:call, lvf, lvcmp_instr) push!(expr.args, name(_cmpa), name(_cmpb)) return expr end @@ -500,7 +532,8 @@ function ifelse_reduction(f::F, rsym::Symbol, op::Operation) where {F} lvcmp_instr = lv(instruction(cmp).instr) if success lvf = lv(rsym) - return not ? Expr(:call, lvf, :($(!) ∘ $lvcmp_instr)) : Expr(:call, lvf, lvcmp_instr) + return not ? Expr(:call, lvf, :($(!) ∘ $lvcmp_instr)) : + Expr(:call, lvf, lvcmp_instr) end options = children(cmp) for oop ∈ options @@ -523,17 +556,22 @@ end # end # end # No `@eval` to make the language server happy -reduction_scalar_combine(x) = reduction_scalar_combine(reduction_instruction_class(x)) +reduction_scalar_combine(x) = + reduction_scalar_combine(reduction_instruction_class(x)) reduction_scalar_combine(op::Operation)::GlobalRef = lv(reduction_scalar_combine(instruction(op))) reduction_to_scalar(x) = reduction_to_scalar(reduction_instruction_class(x)) -reduction_to_scalar(op::Operation)::GlobalRef = lv(reduction_to_scalar(instruction(op))) -reduce_number_of_vectors(x) = reduce_number_of_vectors(reduction_instruction_class(x)) +reduction_to_scalar(op::Operation)::GlobalRef = + lv(reduction_to_scalar(instruction(op))) +reduce_number_of_vectors(x) = + reduce_number_of_vectors(reduction_instruction_class(x)) reduce_number_of_vectors(op::Operation)::GlobalRef = lv(reduce_number_of_vectors(instruction(op))) -reduce_to_onevecunroll(x) = reduce_to_onevecunroll(reduction_instruction_class(x)) +reduce_to_onevecunroll(x) = + reduce_to_onevecunroll(reduction_instruction_class(x)) reduce_to_onevecunroll(op::Operation)::GlobalRef = lv(reduce_to_onevecunroll(instruction(op))) -reduction_to_single_vector(x) = reduction_to_single_vector(reduction_instruction_class(x)) +reduction_to_single_vector(x) = + reduction_to_single_vector(reduction_instruction_class(x)) reduction_to_single_vector(op::Operation)::GlobalRef = lv(reduction_to_single_vector(instruction(op))) diff --git a/src/parse/add_compute.jl b/src/parse/add_compute.jl index 2de835be8..ee1c7651e 100644 --- a/src/parse/add_compute.jl +++ b/src/parse/add_compute.jl @@ -14,7 +14,7 @@ end function mergesetdiffv!( s1::AbstractVector{T}, s2::AbstractVector{T}, - s3::AbstractVector{T}, + s3::AbstractVector{T} ) where {T} for s ∈ s2 s ∉ s3 && addsetv!(s1, s) @@ -25,7 +25,7 @@ end function setdiffv!( s3::AbstractVector{T}, s1::AbstractVector{T}, - s2::AbstractVector{T}, + s2::AbstractVector{T} ) where {T} for s ∈ s1 (s ∈ s2) || (s ∉ s3 && push!(s3, s)) @@ -35,13 +35,17 @@ function setdiffv!( s4::AbstractVector{T}, s3::AbstractVector{T}, s1::AbstractVector{T}, - s2::AbstractVector{T}, + s2::AbstractVector{T} ) where {T} for s ∈ s1 (s ∈ s2) ? (s ∉ s4 && push!(s4, s)) : (s ∉ s3 && push!(s3, s)) end end -function update_deps!(deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, parent::Operation) +function update_deps!( + deps::Vector{Symbol}, + reduceddeps::Vector{Symbol}, + parent::Operation +) mergesetv!(deps, loopdependencies(parent))#, reduceddependencies(parent)) if !(isload(parent) || isconstant(parent)) #&& !isreductcombineinstr(parent) mergesetv!(reduceddeps, reduceddependencies(parent)) @@ -53,7 +57,7 @@ function pushparent!( parents::Vector{Operation}, deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, - parent::Operation, + parent::Operation ) @assert parents !== NOPARENTS push!(parents, parent) @@ -69,7 +73,7 @@ function add_parent!( ls::LoopSet, var, elementbytes::Int, - position::Int, + position::Int ) parent = if var isa Symbol # if var === :kern_1_1 @@ -112,13 +116,17 @@ function search_tree(opv::Vector{Operation}, var::Symbol) # relies on cycles bei false end -search_tree_for_ref(ls::LoopSet, opv::Vector{Operation}, ::Nothing, var::Symbol) = - var, false +search_tree_for_ref( + ls::LoopSet, + opv::Vector{Operation}, + ::Nothing, + var::Symbol +) = var, false function search_tree_for_ref( ls::LoopSet, opv::Vector{Operation}, mpref::ArrayReferenceMetaPosition, - var::Symbol, + var::Symbol ) # relies on cycles being forbidden for opp ∈ opv if opp.ref == mpref.mref @@ -143,7 +151,7 @@ end function update_reduction_status!( parentvec::Vector{Operation}, deps::Vector{Symbol}, - parent::Symbol, + parent::Symbol ) for opp ∈ parentvec if name(opp) === parent @@ -193,7 +201,7 @@ function substitute_op_in_parents!( replacer::Operation, replacee::Operation, reduceddeps::Vector{Symbol}, - reductsym::Symbol, + reductsym::Symbol ) found = false for i ∈ eachindex(vparents) @@ -202,8 +210,13 @@ function substitute_op_in_parents!( vparents[i] = replacer found = true else - fopp = - substitute_op_in_parents!(parents(opp), replacer, replacee, reduceddeps, reductsym) + fopp = substitute_op_in_parents!( + parents(opp), + replacer, + replacee, + reduceddeps, + reductsym + ) if fopp add_reduced_deps!(opp, reduceddeps) # FIXME: https://github.com/JuliaSIMD/LoopVectorization.jl/issues/259 @@ -217,7 +230,6 @@ function substitute_op_in_parents!( found end - function add_reduction_update_parent!( vparents::Vector{Operation}, deps::Vector{Symbol}, @@ -226,7 +238,7 @@ function add_reduction_update_parent!( parent::Operation, instr::Instruction, reduction_ind::Int, - elementbytes::Int, + elementbytes::Int ) var = name(parent) # isouterreduction = iszero(length(loopdependencies(parent))) && (parent.instruction === LOOPCONSTANT) @@ -257,7 +269,7 @@ function add_reduction_update_parent!( loopdependencies(parent), reductsym, elementbytes, - :numericconstant, + :numericconstant ) if reduct_zero === :zero push!(ls.preamble_zeros, (identifier(reductinit), IntOrFloat)) @@ -278,7 +290,13 @@ function add_reduction_update_parent!( update_deps!(deps, reduceddeps, reductinit)#parent) # deps and reduced deps will not be disjoint end elseif !isouterreduction && reductinit !== parent - substitute_op_in_parents!(vparents, reductinit, parent, reduceddeps, reductsym) + substitute_op_in_parents!( + vparents, + reductinit, + parent, + reduceddeps, + reductsym + ) end update_reduction_status!(vparents, reduceddeps, name(reductinit)) # this is the op added by add_compute @@ -290,7 +308,7 @@ function add_reduction_update_parent!( compute, deps, reduceddeps, - vparents, + vparents ) isouterreduction && push!(ls.outer_reductions, identifier(op)) opout = pushop!(ls, op, var) # note this overwrites the entry in the operations dict, but not the vector @@ -314,7 +332,7 @@ function add_reduction_update_parent!( compute, childdeps, childrdeps, - childparents, + childparents ) # child = Operation( # length(operations(ls)), name(parent), elementbytes, Instruction(reductcombine,:identity), compute, childdeps, childrdeps, childparents @@ -332,7 +350,13 @@ function substitute!(ex::Expr, d::Dict{Symbol,Symbol}) end end end -function argsymbol(ls::LoopSet, arg, mpref, elementbytes::Int, position::Int)::Symbol +function argsymbol( + ls::LoopSet, + arg, + mpref, + elementbytes::Int, + position::Int +)::Symbol argsym = gensym!(ls, "anonarg") if mpref === nothing add_operation!(ls, argsym, arg, elementbytes, position) @@ -348,7 +372,7 @@ function add_anon_func!( ex::Expr, position::Int, mpref::Union{Nothing,ArrayReferenceMetaPosition}, - elementbytes::Int, + elementbytes::Int )::Operation d = Dict{Symbol,Symbol}() anonargs = f.args[1] @@ -387,7 +411,7 @@ function add_anon_func!( LHS, instruction(:identity), Operation[getop(ls, lastline)], - elementbytes, + elementbytes ) elseif Meta.isexpr(lastline, :call) add_compute!(ls, LHS, lastline, elementbytes, position, mpref) @@ -414,7 +438,7 @@ function maybe_fix_reduced_deps!( reduceddeps::Vector{Symbol}, parent::Operation, mpref::ArrayReferenceMetaPosition, - position::Int, + position::Int ) loopdeps_parent = loopdependencies(parent) reduceddeps_parent = reduceddependencies(parent) @@ -453,7 +477,7 @@ function add_compute!( ex::Expr, elementbytes::Int, position::Int, - mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing, + mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing )::Operation @assert ex.head === :call fexpr = first(ex.args) @@ -466,7 +490,14 @@ function add_compute!( arg1 = args[1] arg2 = args[2] if arg1 isa Number && convert(Float64, arg1) === -1.0 - return add_compute!(ls, var, :(2iseven($arg2) - 1), elementbytes, position, mpref) + return add_compute!( + ls, + var, + :(2iseven($arg2) - 1), + elementbytes, + position, + mpref + ) end if arg2 isa Number return add_pow!(ls, var, args[1], arg2, elementbytes, position) @@ -491,18 +522,40 @@ function add_compute!( if mpref == argref if varname(mpref) === var id = findfirst(==(mpref.mref), ls.refs_aliasing_syms) - mpref.varname = var = id === nothing ? var : ls.syms_aliasing_refs[id] + mpref.varname = + var = id === nothing ? var : ls.syms_aliasing_refs[id] reduction_ind = ind - mergesetv!(deps, loopdependencies(add_load!(ls, argref, elementbytes))) + mergesetv!( + deps, + loopdependencies(add_load!(ls, argref, elementbytes)) + ) else - pushparent!(vparents, deps, reduceddeps, add_load!(ls, argref, elementbytes)) + pushparent!( + vparents, + deps, + reduceddeps, + add_load!(ls, argref, elementbytes) + ) end else argref.varname = gensym!(ls, "tempload") - pushparent!(vparents, deps, reduceddeps, add_load!(ls, argref, elementbytes)) + pushparent!( + vparents, + deps, + reduceddeps, + add_load!(ls, argref, elementbytes) + ) end else - add_parent!(vparents, deps, reduceddeps, ls, arg, elementbytes, position) + add_parent!( + vparents, + deps, + reduceddeps, + ls, + arg, + elementbytes, + position + ) end elseif arg ∈ ls.loopsymbols loopsymop = add_loopvalue!(ls, arg, elementbytes) @@ -533,8 +586,14 @@ function add_compute!( ) && isone(length(vparents)) && (position == length(loopdependencies(only(vparents)))) - deps, reduceddeps = - maybe_fix_reduced_deps!(ls, deps, reduceddeps, only(vparents), mpref, position) + deps, reduceddeps = maybe_fix_reduced_deps!( + ls, + deps, + reduceddeps, + only(vparents), + mpref, + position + ) end # @show reduction, search_tree(vparents, var) ex var vparents mpref get(ls.opdict, var, nothing) search_tree_for_ref(ls, vparents, mpref, var) # relies on cycles being forbidden if reduction || search_tree(vparents, var) @@ -546,7 +605,7 @@ function add_compute!( vparents, reduction_ind, elementbytes, - instr, + instr ) else if mpref ≢ nothing && ( @@ -562,7 +621,7 @@ function add_compute!( vparents, reduction_ind, elementbytes, - instr, + instr ) end op = Operation( @@ -573,7 +632,7 @@ function add_compute!( compute, deps, reduceddeps, - vparents, + vparents ) return pushop!(ls, op, var) end @@ -587,7 +646,7 @@ function add_reduction!( vparents, reduction_ind, elementbytes, - instr, + instr ) parent = ls.opdict[var] setdiffv!(reduceddeps, deps, loopdependencies(parent)) @@ -606,7 +665,7 @@ function add_reduction!( compute, deps, reduceddeps, - vparents, + vparents ) pushop!(ls, op, var) else @@ -618,7 +677,7 @@ function add_reduction!( parent, instr, reduction_ind, - elementbytes, + elementbytes ) end end @@ -628,7 +687,7 @@ function add_compute!( LHS::Symbol, instr, vparents::Vector{Operation}, - elementbytes::Int, + elementbytes::Int ) deps = Symbol[] reduceddeps = Symbol[] @@ -643,7 +702,7 @@ function add_compute!( compute, deps, reduceddeps, - vparents, + vparents ) pushop!(ls, op, LHS) end @@ -654,7 +713,7 @@ function add_compute_ifelse!( cond::Operation, iftrue::Operation, iffalse::Operation, - elementbytes::Int, + elementbytes::Int ) deps = Symbol[] reduceddeps = Symbol[] @@ -676,7 +735,7 @@ function add_compute_ifelse!( iftrue, Instruction(:LoopVectorization, :ifelse), 2, - elementbytes, + elementbytes ) end elseif name(iffalse) === LHS @@ -691,7 +750,7 @@ function add_compute_ifelse!( iffalse, Instruction(:LoopVectorization, :ifelse), 3, - elementbytes, + elementbytes ) end end @@ -704,10 +763,9 @@ function add_compute_ifelse!( compute, deps, reduceddeps, - vparents, + vparents ) pushop!(ls, op, LHS) - end # adds x ^ (p::Real) @@ -717,7 +775,7 @@ function add_pow!( @nospecialize(x), p::Real, elementbytes::Int, - position::Int, + position::Int ) xop::Operation = if x isa Expr add_operation!( @@ -725,7 +783,7 @@ function add_pow!( Symbol("###xpow###$(length(operations(ls)))###"), x, elementbytes, - position, + position ) elseif x isa Symbol if x ∈ ls.loopsymbols @@ -796,9 +854,12 @@ function add_pow!( constant, NODEPENDENCY, Symbol[], - NOPARENTS, + NOPARENTS + ) + push!( + ls.preamble_funcofeltypes, + (identifier(op), MULTIPLICATIVE_IN_REDUCTIONS) ) - push!(ls.preamble_funcofeltypes, (identifier(op), MULTIPLICATIVE_IN_REDUCTIONS)) return pushop!(ls, op) elseif pint == 1#requires `pden ≠ 1`. return add_compute!(ls, var, :identity, [xop], elementbytes) @@ -817,14 +878,15 @@ function add_pow!( t = trailing_zeros(pint) + 1 pint >>= t while (t -= 1) >= 0 - xop = add_compute!(ls, gensym!(ls, "pbs"), :abs2_fast, [xop], elementbytes) + xop = + add_compute!(ls, gensym!(ls, "pbs"), :abs2_fast, [xop], elementbytes) end yop = add_compute!( ls, iszero(pint) ? var : gensym!(ls, "pbs"), :mul_fast, [xop, yop], - elementbytes, + elementbytes ) end yop diff --git a/src/parse/add_constants.jl b/src/parse/add_constants.jl index 3fafaf693..f818536ea 100644 --- a/src/parse/add_constants.jl +++ b/src/parse/add_constants.jl @@ -9,7 +9,7 @@ const CONSTANT_SYMBOLS = ( :Int32, :UInt32, :Int64, - :UInt64, + :UInt64 ) function add_constant!(ls::LoopSet, var::Symbol, elementbytes::Int) var ∈ ls.loopsymbols && return add_loopvalue!(ls, var, elementbytes) @@ -23,7 +23,7 @@ function add_constant!(ls::LoopSet, var::Symbol, elementbytes::Int) constant, NODEPENDENCY, Symbol[], - NOPARENTS, + NOPARENTS ) rop = pushop!(ls, op, var) (!globalconst && (rop === op)) && pushpreamble!(ls, op, var) @@ -38,7 +38,7 @@ function add_constant!( ls::LoopSet, var::Number, elementbytes::Int = 8, - varname = gensym!(ls, "loopconstnumber"), + varname = gensym!(ls, "loopconstnumber") ) op = Operation( length(operations(ls)), @@ -48,7 +48,7 @@ function add_constant!( constant, NODEPENDENCY, Symbol[], - NOPARENTS, + NOPARENTS ) ops = operations(ls) typ = var isa Integer ? HardInt : HardFloat @@ -109,7 +109,6 @@ function ensure_constant_lowered!(ls::LoopSet, op::Operation) pushpreamble!(ls, Expr(:(=), name(op), floatval)) return end - end for (id, typ) ∈ ls.preamble_zeros if id == opid @@ -119,7 +118,10 @@ function ensure_constant_lowered!(ls::LoopSet, op::Operation) end for (id, f) ∈ ls.preamble_funcofeltypes if id == opid - pushpreamble!(ls, Expr(:(=), name(op), Expr(:call, reduction_zero(f), Float64))) + pushpreamble!( + ls, + Expr(:(=), name(op), Expr(:call, reduction_zero(f), Float64)) + ) return end end @@ -128,7 +130,7 @@ end function ensure_constant_lowered!( ls::LoopSet, mpref::ArrayReferenceMetaPosition, - ind::Symbol, + ind::Symbol ) length(loopdependencies(mpref)) == 0 && return for (id, opp) ∈ enumerate(parents(mpref)) @@ -142,7 +144,7 @@ function add_constant_vload!( ls::LoopSet, op::Operation, mpref::ArrayReferenceMetaPosition, - elementbytes::Int, + elementbytes::Int ) temp = gensym!(ls, "intermediateconstref") use_getindex = vptr(name(mpref)) === mpref.mref.ptr @@ -165,8 +167,8 @@ function add_constant_vload!( fill(false, nindices), true, ls, - false, - ).args, + false + ).args ) else push!( @@ -177,8 +179,8 @@ function add_constant_vload!( fill(false, nindices), true, ls, - false, - ), + false + ) ) end end @@ -192,7 +194,11 @@ function add_constant_vload!( pushpreamble!(ls, op, temp) return temp end -function add_constant!(ls::LoopSet, mpref::ArrayReferenceMetaPosition, elementbytes::Int) +function add_constant!( + ls::LoopSet, + mpref::ArrayReferenceMetaPosition, + elementbytes::Int +) op = Operation( length(operations(ls)), varname(mpref), @@ -202,7 +208,7 @@ function add_constant!(ls::LoopSet, mpref::ArrayReferenceMetaPosition, elementby NODEPENDENCY, Symbol[], NOPARENTS, - mpref.mref, + mpref.mref ) add_vptr!(ls, op) temp = add_constant_vload!(ls, op, mpref, elementbytes) @@ -217,7 +223,7 @@ function add_constant!( deps::Vector{Symbol}, assignedsym::Symbol, elementbytes::Int, - f::Symbol = Symbol(""), + f::Symbol = Symbol("") ) value ∈ ls.loopsymbols && return add_loopvalue!(ls, value, elementbytes) retop = get(ls.opdict, value, nothing) @@ -230,7 +236,7 @@ function add_constant!( constant, deps, NODEPENDENCY, - NOPARENTS, + NOPARENTS ) else op = Operation( @@ -241,7 +247,7 @@ function add_constant!( compute, deps, reduceddependencies(retop), - [retop], + [retop] ) end pushop!(ls, op, assignedsym) @@ -258,7 +264,7 @@ function add_constant!( value::Number, deps::Vector{Symbol}, assignedsym::Symbol, - elementbytes::Int, + elementbytes::Int ) op = add_constant!( ls, @@ -266,7 +272,7 @@ function add_constant!( deps, assignedsym, elementbytes, - :numericconstant, + :numericconstant ) pushpreamble!(ls, op, value) op diff --git a/src/parse/add_ifelse.jl b/src/parse/add_ifelse.jl index 611d87ed2..cc3e0d106 100644 --- a/src/parse/add_ifelse.jl +++ b/src/parse/add_ifelse.jl @@ -11,7 +11,7 @@ function add_if!( RHS::Expr, elementbytes::Int, position::Int, - mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing, + mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing ) # for now, just simple 1-liners @assert length(RHS.args) == 3 "if statements without an else cannot be assigned to a variable." @@ -21,11 +21,19 @@ function add_if!( elseif mpref === nothing add_operation!(ls, gensym!(ls, "mask"), condition, elementbytes, position) else - add_operation!(ls, gensym!(ls, "mask"), condition, mpref, elementbytes, position) + add_operation!( + ls, + gensym!(ls, "mask"), + condition, + mpref, + elementbytes, + position + ) end iftrue = RHS.args[2] if iftrue isa Expr - trueop = add_operation!(ls, gensym!(ls, "iftrue"), iftrue, elementbytes, position) + trueop = + add_operation!(ls, gensym!(ls, "iftrue"), iftrue, elementbytes, position) if iftrue.head === :ref && all(ld -> ld ∈ loopdependencies(trueop), loopdependencies(condop)) && !search_tree(parents(condop), trueop) @@ -49,7 +57,13 @@ function add_if!( end end if iffalse isa Expr - falseop = add_operation!(ls, gensym!(ls, "iffalse"), iffalse, elementbytes, position) + falseop = add_operation!( + ls, + gensym!(ls, "iffalse"), + iffalse, + elementbytes, + position + ) if iffalse.head === :ref && all(ld -> ld ∈ loopdependencies(falseop), loopdependencies(condop)) && !search_tree(parents(condop), falseop) @@ -78,7 +92,7 @@ function add_andblock!( LHS, rhsop::Operation, elementbytes::Int, - position::Int, + position::Int ) if LHS isa Symbol altop = getop(ls, LHS, elementbytes) @@ -95,9 +109,10 @@ function add_andblock!( LHS, RHS::Expr, elementbytes::Int, - position::Int, + position::Int ) - rhsop = add_compute!(ls, gensym!(ls, "iftruerhs"), RHS, elementbytes, position) + rhsop = + add_compute!(ls, gensym!(ls, "iftruerhs"), RHS, elementbytes, position) add_andblock!(ls, condop, LHS, rhsop, elementbytes, position) end function add_andblock!( @@ -106,7 +121,7 @@ function add_andblock!( LHS, RHS, elementbytes::Int, - position::Int, + position::Int ) rhsop = getop(ls, RHS, elementbytes) add_andblock!(ls, condop, LHS, rhsop, elementbytes, position) @@ -116,9 +131,10 @@ function add_andblock!( condexpr::Expr, condeval::Expr, elementbytes::Int, - position::Int, + position::Int ) - condop = add_operation!(ls, gensym!(ls, "mask"), condexpr, elementbytes, position) + condop = + add_operation!(ls, gensym!(ls, "mask"), condexpr, elementbytes, position) add_andblock!(ls, condop, condeval, elementbytes, position) end function add_andblock!( @@ -126,14 +142,21 @@ function add_andblock!( condop::Operation, condeval::Expr, elementbytes::Int, - position::Int, + position::Int ) if condeval.head === :call @assert first(condeval.args) === :setindex! array, raw_indices = ref_from_setindex!(ls, condeval) ref = Expr(:ref, array) append!(ref.args, raw_indices) - return add_andblock!(ls, condop, ref, condeval.args[3], elementbytes, position) + return add_andblock!( + ls, + condop, + ref, + condeval.args[3], + elementbytes, + position + ) end @assert condeval.head === :(=) @assert length(condeval.args) == 2 @@ -151,7 +174,7 @@ function add_andblock!(ls::LoopSet, ex::Expr, elementbytes::Int, position::Int) getop(ls, condexpr, elementbytes), last(ex.args)::Expr, elementbytes, - position, + position ) end end @@ -162,7 +185,7 @@ function add_orblock!( LHS, rhsop::Operation, elementbytes::Int, - position::Int, + position::Int ) negatedcondop = negateop!(ls, condop, elementbytes) if LHS isa Symbol @@ -170,7 +193,14 @@ function add_orblock!( # return add_compute!(ls, LHS, :ifelse, [condop, altop, rhsop], elementbytes) # Placing altop second seems to let LLVM fuse operations; but as of LLVM 9.0.1 it will not if altop is first # therefore, we negate the condition and switch order so that the altop is second. - return add_compute_ifelse!(ls, LHS, negatedcondop, rhsop, altop, elementbytes) + return add_compute_ifelse!( + ls, + LHS, + negatedcondop, + rhsop, + altop, + elementbytes + ) elseif LHS isa Expr && LHS.head === :ref # negatedcondop = add_compute!(ls, gensym(:negated_mask), :~, [condop], elementbytes) return add_conditional_store!(ls, LHS, negatedcondop, rhsop, elementbytes) @@ -184,9 +214,10 @@ function add_orblock!( LHS, RHS::Expr, elementbytes::Int, - position::Int, + position::Int ) - rhsop = add_compute!(ls, gensym!(ls, "iffalserhs"), RHS, elementbytes, position) + rhsop = + add_compute!(ls, gensym!(ls, "iffalserhs"), RHS, elementbytes, position) add_orblock!(ls, condop, LHS, rhsop, elementbytes, position) end function add_orblock!( @@ -195,7 +226,7 @@ function add_orblock!( LHS, RHS, elementbytes::Int, - position::Int, + position::Int ) rhsop = getop(ls, RHS, elementbytes) add_orblock!(ls, condop, LHS, rhsop, elementbytes, position) @@ -205,15 +236,23 @@ function add_orblock!( condexpr::Expr, condeval::Expr, elementbytes::Int, - position::Int, + position::Int ) - condop = add_operation!(ls, gensym!(ls, "mask"), condexpr, elementbytes, position) + condop = + add_operation!(ls, gensym!(ls, "mask"), condexpr, elementbytes, position) if condeval.head === :call @assert first(condeval.args) === :setindex! array, raw_indices = ref_from_setindex!(ls, condeval) ref = Expr(:ref, array) append!(ref.args, raw_indices) - return add_orblock!(ls, condop, ref, condeval.args[3], elementbytes, position) + return add_orblock!( + ls, + condop, + ref, + condeval.args[3], + elementbytes, + position + ) end @assert condeval.head === :(=) @assert length(condeval.args) == 2 @@ -222,5 +261,11 @@ function add_orblock!( add_orblock!(ls, condop, LHS, RHS, elementbytes, position) end function add_orblock!(ls::LoopSet, ex::Expr, elementbytes::Int, position::Int) - add_orblock!(ls, first(ex.args)::Expr, last(ex.args)::Expr, elementbytes, position) + add_orblock!( + ls, + first(ex.args)::Expr, + last(ex.args)::Expr, + elementbytes, + position + ) end diff --git a/src/parse/add_loads.jl b/src/parse/add_loads.jl index be167176d..bf8537c7f 100644 --- a/src/parse/add_loads.jl +++ b/src/parse/add_loads.jl @@ -42,7 +42,13 @@ function add_load!(ls::LoopSet, op::Operation, actualarray::Bool = true) pushop!(ls, op, name(op)) end -function add_load!(ls::LoopSet, var::Symbol, array::Symbol, rawindices, elementbytes::Int) +function add_load!( + ls::LoopSet, + var::Symbol, + array::Symbol, + rawindices, + elementbytes::Int +) mpref = array_reference_meta!(ls, array, rawindices, elementbytes, var) add_load!(ls, mpref, elementbytes) end @@ -57,7 +63,11 @@ function load_is_constant(mpref::ArrayReferenceMetaPosition) end true end -function add_load!(ls::LoopSet, mpref::ArrayReferenceMetaPosition, elementbytes::Int) +function add_load!( + ls::LoopSet, + mpref::ArrayReferenceMetaPosition, + elementbytes::Int +) if length(mpref.loopdependencies) == 0 || load_is_constant(mpref) return add_constant!(ls, mpref, elementbytes) end @@ -71,7 +81,7 @@ function add_simple_load!( var::Symbol, ref::ArrayReference, elementbytes::Int, - actualarray::Bool = true, + actualarray::Bool = true ) loopdeps = copy(getindicesonly(ref)) mref = ArrayReferenceMeta(ref, fill(true, length(loopdeps))) @@ -83,7 +93,7 @@ function add_simple_load!( mref::ArrayReferenceMeta, loopdeps::Vector{Symbol}, elementbytes::Int, - actualarray::Bool = true, + actualarray::Bool = true ) op = Operation( length(operations(ls)), @@ -94,7 +104,7 @@ function add_simple_load!( loopdeps, NODEPENDENCY, NOPARENTS, - mref, + mref ) add_vptr!(ls, op.ref.ref.array, vptr(op), actualarray) pushop!(ls, op, var) @@ -103,7 +113,12 @@ function add_load_ref!(ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int) array, rawindices = ref_from_ref!(ls, ex) add_load!(ls, var, array, rawindices, elementbytes) end -function add_load_getindex!(ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int) +function add_load_getindex!( + ls::LoopSet, + var::Symbol, + ex::Expr, + elementbytes::Int +) array, rawindices = ref_from_getindex!(ls, ex) add_load!(ls, var, array, rawindices, elementbytes) end @@ -114,6 +129,13 @@ function add_loopvalue!(ls::LoopSet, arg::Symbol, elementbytes::Int) for op ∈ operations(ls)#check to CSE (op.variable === arg && instr == instruction(op)) && return op end - op = Operation(length(operations(ls)), arg, elementbytes, instr, loopvalue, [arg]) + op = Operation( + length(operations(ls)), + arg, + elementbytes, + instr, + loopvalue, + [arg] + ) pushop!(ls, op, arg) end diff --git a/src/parse/add_stores.jl b/src/parse/add_stores.jl index c462a7365..e0139588a 100644 --- a/src/parse/add_stores.jl +++ b/src/parse/add_stores.jl @@ -11,7 +11,7 @@ end function add_store!( ls::LoopSet, op::Operation, - add_pvar::Bool = !any(r -> r == op.ref, ls.refs_aliasing_syms), + add_pvar::Bool = !any(r -> r == op.ref, ls.refs_aliasing_syms) ) @assert isstore(op) if add_pvar @@ -25,19 +25,19 @@ function add_copystore!( ls::LoopSet, parent::Operation, mpref::ArrayReferenceMetaPosition, - elementbytes::Int, + elementbytes::Int ) - op = add_compute!(ls, gensym!(ls, "identity"), :identity, [parent], elementbytes) + op = + add_compute!(ls, gensym!(ls, "identity"), :identity, [parent], elementbytes) # pushfirst!(mpref.parents, parent) add_store!(ls, mpref, elementbytes, op) end - function add_store!( ls::LoopSet, mpref::ArrayReferenceMetaPosition, elementbytes::Int, - parent = getop(ls, varname(mpref), mpref.loopdependencies, elementbytes), + parent = getop(ls, varname(mpref), mpref.loopdependencies, elementbytes) ) isload(parent) && return add_copystore!(ls, parent, mpref, elementbytes) vparents = mpref.parents @@ -62,7 +62,13 @@ function add_store!( add_store!(ls, op, add_pvar) end -function add_store!(ls::LoopSet, var::Symbol, array::Symbol, rawindices, elementbytes::Int) +function add_store!( + ls::LoopSet, + var::Symbol, + array::Symbol, + rawindices, + elementbytes::Int +) mpref = array_reference_meta!(ls, array, rawindices, elementbytes, var) add_store!(ls, mpref, elementbytes) end @@ -70,7 +76,7 @@ function add_simple_store!( ls::LoopSet, parent::Operation, mref::ArrayReferenceMeta, - elementbytes::Int, + elementbytes::Int ) op = Operation( ls, @@ -81,7 +87,7 @@ function add_simple_store!( getindices(mref.ref), NODEPENDENCY, [parent], - mref, + mref ) add_unique_store!(ls, op) end @@ -89,7 +95,7 @@ function add_simple_store!( ls::LoopSet, var::Union{Symbol,Operation}, ref::Union{ArrayReference,ArrayReferenceMeta}, - elementbytes::Int, + elementbytes::Int ) parent = isa(var, Symbol) ? getop(ls, var, elementbytes) : var mref = @@ -104,7 +110,13 @@ end function add_store_ref!(ls::LoopSet, var, ex::Expr, elementbytes::Int) array, raw_indices = ref_from_ref!(ls, ex) mpref = array_reference_meta!(ls, array, raw_indices, elementbytes) - c = add_constant!(ls, var, loopdependencies(mpref), gensym(:storeconst), elementbytes) + c = add_constant!( + ls, + var, + loopdependencies(mpref), + gensym(:storeconst), + elementbytes + ) add_store!(ls, mpref, elementbytes, c) end @@ -115,7 +127,7 @@ function add_conditional_store!( LHS, condop::Operation, storeop::Operation, - elementbytes::Int, + elementbytes::Int ) array, rawindices = ref_from_ref!(ls, LHS) mpref = array_reference_meta!(ls, array, rawindices, elementbytes) @@ -169,7 +181,7 @@ function add_conditional_store!( ldref, reduceddependencies(storeop), storeparents, - mref, + mref ) add_unique_store!(ls, op) end diff --git a/src/parse/memory_ops_common.jl b/src/parse/memory_ops_common.jl index fe7be5538..21a30b108 100644 --- a/src/parse/memory_ops_common.jl +++ b/src/parse/memory_ops_common.jl @@ -6,7 +6,11 @@ function dottosym(x::Expr)::Symbol xa2 isa QuoteNode ? Symbol(s1, "###extractarray###", xa2.value) : Symbol(s1, "###extractarray###", xa2) end -function extract_array_symbol_from_ref!(ls::LoopSet, ex::Expr, offset1::Int)::Symbol +function extract_array_symbol_from_ref!( + ls::LoopSet, + ex::Expr, + offset1::Int +)::Symbol ar = ex.args[1+offset1] if isa(ar, Symbol) return ar @@ -20,7 +24,6 @@ function extract_array_symbol_from_ref!(ls::LoopSet, ex::Expr, offset1::Int)::Sy end end - function ref_from_expr!(ls, ex, offset1::Int, offset2::Int) ar = extract_array_symbol_from_ref!(ls, ex, offset1) ar, @view(ex.args[2+offset2:end]) @@ -38,10 +41,16 @@ function ref_from_expr!(ls::LoopSet, ex::Expr) end add_vptr!(ls::LoopSet, op::Operation) = add_vptr!(ls, op.ref) -add_vptr!(ls::LoopSet, mref::ArrayReferenceMeta) = add_vptr!(ls, mref.ref.array, vptr(mref)) +add_vptr!(ls::LoopSet, mref::ArrayReferenceMeta) = + add_vptr!(ls, mref.ref.array, vptr(mref)) # using VectorizationBase: noaliasstridedpointer presbufsym(array) = Symbol('#', array, "#preserve#buffer#") -function add_vptr!(ls::LoopSet, array::Symbol, vptrarray::Symbol, actualarray::Bool = true) +function add_vptr!( + ls::LoopSet, + array::Symbol, + vptrarray::Symbol, + actualarray::Bool = true +) if !includesarray(ls, array) push!(ls.includedarrays, array) actualarray && push!(ls.includedactualarrays, array) @@ -50,8 +59,8 @@ function add_vptr!(ls::LoopSet, array::Symbol, vptrarray::Symbol, actualarray::B Expr( :(=), Expr(:tuple, vptrarray, presbufsym(array)), - Expr(:call, lv(:stridedpointer_preserve), array), - ), + Expr(:call, lv(:stridedpointer_preserve), array) + ) ) end nothing @@ -72,7 +81,7 @@ function subset_vptr!( ind, previndices, loopindex, - D::Int, + D::Int ) subset = D == 0 str_typ = subset ? "subset" : "index" @@ -94,7 +103,8 @@ function subset_vptr!( # A[I + J, constindex], I and J may be CartesianIndices. This requires they all be of same number of dims loopdep = first(loopdependencies(ls.opdict[previndices[i+offset]])) end - constoffset = append_loop_staticdims!(valcall, getloop(ls, loopdep), constoffset) + constoffset = + append_loop_staticdims!(valcall, getloop(ls, loopdep), constoffset) end end # indm1 = ind isa Integer ? ind - 1 : Expr(:call, :-, ind, 1) @@ -118,7 +128,7 @@ function gesp_const_offset!( loopedindex::Vector{Bool}, mlt::Integer, sym, - D::Int, + D::Int ) if isone(mlt) subset_vptr!(ls, vptrarray, ninds, sym, indices, loopedindex, D) @@ -137,16 +147,24 @@ function gesp_const_offsets!( indices::Vector{Symbol}, loopedindex::Vector{Bool}, mltsyms::Vector{Tuple{Int,Symbol}}, - D::Int, + D::Int ) - length(mltsyms) > 1 && sort!(mltsyms, by = last) # if multiple have same combination of syms, make sure they match even if order is different + length(mltsyms) > 1 && sort!(mltsyms; by = last) # if multiple have same combination of syms, make sure they match even if order is different for (mlt, sym) ∈ mltsyms - vptrarray = gesp_const_offset!(ls, vptrarray, ninds, indices, loopedindex, mlt, sym, D) + vptrarray = gesp_const_offset!( + ls, + vptrarray, + ninds, + indices, + loopedindex, + mlt, + sym, + D + ) end vptrarray end - byterepresentable(x)::Bool = false byterepresentable(x::Integer)::Bool = typemin(Int8) ≤ x ≤ typemax(Int8) function _addoffset!( @@ -157,7 +175,7 @@ function _addoffset!( loopdependencies, ind, offset, - stride, + stride ) push!(indices, ind) push!(offsets, offset % Int8) @@ -186,7 +204,7 @@ function addopindex!( loopedindex::Vector{Bool}, indop::Operation, stride = one(Int8), - offset = zero(Int8), + offset = zero(Int8) ) pushparent!(parents, loopdependencies, reduceddeps, indop) push!(indices, name(indop)) @@ -205,7 +223,7 @@ function add_affine_index_expr!( mult_syms::Vector{Tuple{Int,Symbol}}, constant::Base.RefValue{Int}, stride::Int, - expr::Symbol, + expr::Symbol ) push!(mult_syms, (stride, expr)) return nothing @@ -215,7 +233,7 @@ function add_affine_index_expr!( mult_syms::Vector{Tuple{Int,Symbol}}, constant::Base.RefValue{Int}, stride::Int, - expr::Integer, + expr::Integer ) constant[] += stride * expr return nothing @@ -225,10 +243,15 @@ function add_affine_op!( mult_syms::Vector{Tuple{Int,Symbol}}, constant::Base.RefValue{Int}, stride::Int, - expr::Expr, + expr::Expr ) - parent = - add_operation!(ls, gensym!(ls, "indexpr"), expr, sizeof(Int), length(ls.loopsymbols)) + parent = add_operation!( + ls, + gensym!(ls, "indexpr"), + expr, + sizeof(Int), + length(ls.loopsymbols) + ) add_affine_index_expr!(ls, mult_syms, constant, stride, name(parent)) return nothing end @@ -239,7 +262,7 @@ function add_mul!( stride::Int, arg1, arg2, - expr, + expr ) if arg1 isa Integer add_affine_index_expr!(ls, mult_syms, constant, stride * arg1, arg2) @@ -255,9 +278,10 @@ function add_affine_index_expr!( mult_syms::Vector{Tuple{Int,Symbol}}, constant::Base.RefValue{Int}, stride::Int, - expr::Expr, + expr::Expr ) - expr.head === :call || return add_affine_op!(ls, mult_syms, constant, stride, expr) + expr.head === :call || + return add_affine_op!(ls, mult_syms, constant, stride, expr) f = expr.args[1] if f === :(*) @assert length(expr.args) == 3 @@ -278,7 +302,10 @@ function add_affine_index_expr!( end return nothing end -function affine_index_expression(ls::LoopSet, expr)::Tuple{Int,Vector{Tuple{Int,Symbol}}} +function affine_index_expression( + ls::LoopSet, + expr +)::Tuple{Int,Vector{Tuple{Int,Symbol}}} mult_syms = Tuple{Int,Symbol}[] constant = Ref(0) add_affine_index_expr!(ls, mult_syms, constant, 1, expr) @@ -296,7 +323,7 @@ function muladd_index!( loopedindex, mlt::Int, sym::Symbol, - offset::Int, + offset::Int ) muladd_index!( ls, @@ -309,7 +336,7 @@ function muladd_index!( loopedindex, mlt, getop(ls, sym, sizeof(Int)), - offset, + offset ) end function muladd_op!(ls::LoopSet, mlt::Int, sym::Symbol, offset::Int) @@ -350,7 +377,7 @@ function muladd_index!( loopedindex, mlt::Int, symop::Operation, - offset::Int, + offset::Int ) if byterepresentable(offset) & byterepresentable(mlt) addopindex!( @@ -363,7 +390,7 @@ function muladd_index!( loopedindex, symop, mlt, - offset, + offset ) else indop = muladd_op!(ls, mlt, symop, offset) @@ -375,7 +402,7 @@ function muladd_index!( offsets, strides, loopedindex, - indop, + indop ) end end @@ -403,7 +430,7 @@ function add_additive_index!( reduceddeps, offset, mlt, - D, + D ) factor = Core.ifelse((instruction(mop).instr === :sub_fast), -1, 1) if length(parents(mop)) == 2 @@ -420,7 +447,7 @@ function add_additive_index!( loopdependencies, name(sub1), offset + literalval, - mlt, + mlt ) else vptrarray = gesp_const_offset!( @@ -431,7 +458,7 @@ function add_additive_index!( loopedindex, factor * mlt, name(sub2), - D, + D ) _addoffset!( indices, @@ -441,7 +468,7 @@ function add_additive_index!( loopdependencies, name(sub1), offset, - mlt, + mlt ) end elseif isloopvalue(sub2) & isconstant(sub1) @@ -455,11 +482,19 @@ function add_additive_index!( loopdependencies, name(sub2), offset + literalval, - factor * mlt, + factor * mlt ) else - vptrarray = - gesp_const_offset!(ls, vptrarray, ninds, indices, loopedindex, mlt, name(sub1), D) + vptrarray = gesp_const_offset!( + ls, + vptrarray, + ninds, + indices, + loopedindex, + mlt, + name(sub1), + D + ) _addoffset!( indices, offsets, @@ -468,7 +503,7 @@ function add_additive_index!( loopdependencies, name(sub2), offset, - factor * mlt, + factor * mlt ) end else @@ -483,7 +518,7 @@ function add_additive_index!( loopedindex, mlt, sym, - offset, + offset ) end else @@ -498,7 +533,7 @@ function add_additive_index!( loopedindex, mlt, sym, - offset, + offset ) end vptrarray @@ -516,9 +551,8 @@ function checkforoffset!( loopdependencies::Vector{Symbol}, reduceddeps::Vector{Symbol}, ind::Expr, - D::Int, + D::Int )::Symbol - offset, mult_syms = affine_index_expression(ls, ind) let deleted = 0, N = length(mult_syms) for n ∈ 1:N @@ -563,14 +597,22 @@ function checkforoffset!( loopedindex, mlt, sym, - offset, + offset ) return vptrarray end end r = copysign(abs(offset) & 127, offset) - vptrarray = - gesp_const_offset!(ls, vptrarray, ninds, indices, loopedindex, 1, offset - r, D) + vptrarray = gesp_const_offset!( + ls, + vptrarray, + ninds, + indices, + loopedindex, + 1, + offset - r, + D + ) offset = r end # (success && byterepresentable(offset)) || return false, vptrarray @@ -589,7 +631,7 @@ function checkforoffset!( loopdependencies, sym, offset, - mlt, + mlt ) else muladd_index!( @@ -603,7 +645,7 @@ function checkforoffset!( loopedindex, mlt, sym, - offset, + offset ) end elseif !byterepresentable(mlt) @@ -618,17 +660,33 @@ function checkforoffset!( loopedindex, mlt, sym, - offset, + offset ) else mop = get(ls.opdict, sym, nothing) if mop === nothing - vptrarray = - gesp_const_offset!(ls, vptrarray, ninds, indices, loopedindex, mlt, sym, D) + vptrarray = gesp_const_offset!( + ls, + vptrarray, + ninds, + indices, + loopedindex, + mlt, + sym, + D + ) addconstindex!(indices, offsets, strides, loopedindex, offset) elseif isconstant(mop) - vptrarray = - gesp_const_offset!(ls, vptrarray, ninds, indices, loopedindex, mlt, name(mop), D) + vptrarray = gesp_const_offset!( + ls, + vptrarray, + ninds, + indices, + loopedindex, + mlt, + name(mop), + D + ) addconstindex!(indices, offsets, strides, loopedindex, offset) elseif (instruction(mop).instr === :add_fast) || (instruction(mop).instr === :sub_fast) @@ -647,7 +705,7 @@ function checkforoffset!( reduceddeps, offset, mlt, - D, + D ) else muladd_index!( @@ -661,7 +719,7 @@ function checkforoffset!( loopedindex, mlt, sym, - offset, + offset ) end end @@ -685,11 +743,28 @@ function checkforoffset!( sop = get(ls.opdict, s, nothing) if sop === nothing push!(deleteat_inds, i) - vptrarray = gesp_const_offset!(ls, vptrarray, ninds, indices, loopedindex, m, s, D) + vptrarray = gesp_const_offset!( + ls, + vptrarray, + ninds, + indices, + loopedindex, + m, + s, + D + ) elseif isconstant(sop) push!(deleteat_inds, i) - vptrarray = - gesp_const_offset!(ls, vptrarray, ninds, indices, loopedindex, m, name(sop), D) + vptrarray = gesp_const_offset!( + ls, + vptrarray, + ninds, + indices, + loopedindex, + m, + name(sop), + D + ) else # @show sop # if instruction(sop).instr === :sub_fast @@ -729,18 +804,38 @@ function checkforoffset!( loopdependencies, sym, offset, - mlt, + mlt ) end deleteat!(mult_syms, deleteat_inds) - return gesp_const_offsets!(ls, vptrarray, ninds, indices, loopedindex, mult_syms, D) + return gesp_const_offsets!( + ls, + vptrarray, + ninds, + indices, + loopedindex, + mult_syms, + D + ) end deleteat!(mult_syms, deleteat_inds) - vptrarray = gesp_const_offsets!(ls, vptrarray, ninds, indices, loopedindex, mult_syms, D) + vptrarray = gesp_const_offsets!( + ls, + vptrarray, + ninds, + indices, + loopedindex, + mult_syms, + D + ) if length(operations) == 1 _mlt = only(operation_mults) - indop = - muladd_op!(ls, Core.ifelse(byterepresentable(_mlt), 1, _mlt), only(operations), 0) + indop = muladd_op!( + ls, + Core.ifelse(byterepresentable(_mlt), 1, _mlt), + only(operations), + 0 + ) addopindex!( opparents, loopdependencies, @@ -751,7 +846,7 @@ function checkforoffset!( loopedindex, indop, Core.ifelse(byterepresentable(_mlt), _mlt % Int8, one(Int8)), - offset % Int8, + offset % Int8 ) else mlt1ind = findfirst(isone, operation_mults) @@ -770,7 +865,7 @@ function checkforoffset!( gensym!(ls, "indexaccum"), instruction(:(-)), [opbase, _op], - sizeof(Int), + sizeof(Int) ) elseif _mlt == 1 add_compute!( @@ -778,7 +873,7 @@ function checkforoffset!( gensym!(ls, "indexaccum"), instruction(:(+)), [opbase, _op], - sizeof(Int), + sizeof(Int) ) else add_compute!( @@ -786,7 +881,7 @@ function checkforoffset!( gensym!(ls, "indexaccum"), instruction(:muladd), [add_constant!(ls, _mlt, sizeof(Int)), _op, opbase], - sizeof(Int), + sizeof(Int) ) end end @@ -800,7 +895,7 @@ function checkforoffset!( loopedindex, opbase, one(Int8), - offset % Int8, + offset % Int8 ) end return vptrarray @@ -819,14 +914,15 @@ function repeated_index!( indices::Vector{Symbol}, vptr::Symbol, indnum::Int, - firstind::Int, + firstind::Int ) # Move ind to last position - vptrrepremoved = Symbol(vptr, "##ind##", firstind, "##repeated##", indnum, "##") + vptrrepremoved = + Symbol(vptr, "##ind##", firstind, "##repeated##", indnum, "##") f = Expr( :(.), Expr(:(.), :LoopVectorization, QuoteNode(:VectorizationBase)), - QuoteNode(:double_index), + QuoteNode(:double_index) ) fiv = Expr(:call, Expr(:curly, :Val, firstind - 1)) siv = Expr(:call, Expr(:curly, :Val, indnum - 1)) @@ -839,7 +935,7 @@ function array_reference_meta!( array::Symbol, rawindices, elementbytes::Int, - var::Union{Nothing,Symbol} = nothing, + var::Union{Nothing,Symbol} = nothing ) vptrarray = vptr(array) add_vptr!(ls, array, vptrarray) # now, subset @@ -860,7 +956,15 @@ function array_reference_meta!( ninds += 1 else # convert ind to reduce invalidations - vptrarray = subset_vptr!(ls, vptrarray, ninds, convert(Int, ind), indices, loopedindex, 0) + vptrarray = subset_vptr!( + ls, + vptrarray, + ninds, + convert(Int, ind), + indices, + loopedindex, + 0 + ) length(indices) == 0 && push!(indices, DISCONTIGUOUS) end elseif ind isa Expr @@ -877,7 +981,7 @@ function array_reference_meta!( loopdependencies, reduceddeps, ind, - D, + D ) ninds += 1 elseif ind isa Symbol @@ -900,7 +1004,7 @@ function array_reference_meta!( else move_to_last!( loopdependencies, - findfirst(Base.Fix2(===, ind), loopdependencies)::Int, + findfirst(Base.Fix2(===, ind), loopdependencies)::Int ) end vptrarray = repeated_index!( @@ -908,7 +1012,7 @@ function array_reference_meta!( indices, vptrarray, ninds, - ind_prev_index + (first(indices) === DISCONTIGUOUS), + ind_prev_index + (first(indices) === DISCONTIGUOUS) ) makediscontiguous!(indices) end @@ -932,7 +1036,7 @@ function array_reference_meta!( reduceddeps, 0, 1, - D, + D ) ninds += 1 else @@ -944,7 +1048,8 @@ function array_reference_meta!( push!(loopedindex, false) end else - vptrarray = subset_vptr!(ls, vptrarray, ninds, ind, indices, loopedindex, 0) + vptrarray = + subset_vptr!(ls, vptrarray, ninds, ind, indices, loopedindex, 0) length(indices) == 0 && push!(indices, DISCONTIGUOUS) end end @@ -955,21 +1060,21 @@ function array_reference_meta!( mref = ArrayReferenceMeta( ArrayReference(array, indices, offsets, strides), loopedindex, - vptrarray, + vptrarray ) ArrayReferenceMetaPosition( mref, parents, loopdependencies, reduceddeps, - var === nothing ? Symbol("") : var, + var === nothing ? Symbol("") : var ) end function tryrefconvert( ls::LoopSet, ex::Expr, elementbytes::Int, - var::Union{Nothing,Symbol} = nothing, + var::Union{Nothing,Symbol} = nothing )::Tuple{Bool,ArrayReferenceMetaPosition} ya, yinds = if ex.head === :ref ref_from_ref!(ls, ex) diff --git a/src/predicates.jl b/src/predicates.jl index 192d367c2..48291974d 100644 --- a/src/predicates.jl +++ b/src/predicates.jl @@ -23,4 +23,5 @@ hasscope(modex, modpath::Tuple{Vararg{Symbol}}) = Return true if `g` is equal to `GlobalRef(mod, name)`. """ -isglobalref(g, mod, name) = isa(g, GlobalRef) && g.mod === mod && g.name === name +isglobalref(g, mod, name) = + isa(g, GlobalRef) && g.mod === mod && g.name === name diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl index b3eef51d6..e704100d5 100644 --- a/src/reconstruct_loopset.jl +++ b/src/reconstruct_loopset.jl @@ -2,39 +2,47 @@ const NOpsType = Int#Union{Int,Vector{Int}} struct UpperBoundedInteger{N,T<:Base.BitInteger} <: Integer i::T - @inline UpperBoundedInteger{N}(i::T) where {N,T<:Base.BitInteger} = new{N,T}(i) + @inline UpperBoundedInteger{N}(i::T) where {N,T<:Base.BitInteger} = + new{N,T}(i) end @inline UpperBoundedInteger(i::T, ::StaticInt{N}) where {N,T<:Base.BitInteger} = UpperBoundedInteger{N}(i) -@inline UpperBoundedInteger(::StaticInt{M}, ::StaticInt{N}) where {N,M} = StaticInt{M}() +@inline UpperBoundedInteger(::StaticInt{M}, ::StaticInt{N}) where {N,M} = + StaticInt{M}() @inline UpperBoundedInteger{N}(::StaticInt{M}) where {N,M} = StaticInt{M}() -@inline Base.:(%)(a::UpperBoundedInteger, ::Type{T}) where {T<:Base.BitInteger} = a.i % T +@inline Base.:(%)( + a::UpperBoundedInteger, + ::Type{T} +) where {T<:Base.BitInteger} = a.i % T Base.promote_rule( ::Type{T}, - ::Type{UpperBoundedInteger{N,S}}, + ::Type{UpperBoundedInteger{N,S}} ) where {N,T<:Base.BitInteger,S} = promote_rule(T, S) Base.promote_rule( ::Type{UpperBoundedInteger{N,S}}, - ::Type{T}, + ::Type{T} ) where {N,T<:Base.BitInteger,S} = promote_rule(S, T) Base.promote_rule( ::Type{UpperBoundedInteger{N,T}}, - ::Type{T}, + ::Type{T} ) where {N,T<:Base.BitInteger} = T -Base.convert(::Type{T}, i::UpperBoundedInteger) where {T<:Number} = convert(T, i.i) +Base.convert(::Type{T}, i::UpperBoundedInteger) where {T<:Number} = + convert(T, i.i) Base.convert( ::Type{UpperBoundedInteger{N,T}}, - i::UpperBoundedInteger{N,T}, + i::UpperBoundedInteger{N,T} ) where {N,T<:Base.BitInteger} = i upper_bound(_) = typemax(Int) -upper_bound(::Type{CO}) where {T,N,S,CO<:AbstractCloseOpen{T,UpperBoundedInteger{N,S}}} = - N - 1 +upper_bound( + ::Type{CO} +) where {T,N,S,CO<:AbstractCloseOpen{T,UpperBoundedInteger{N,S}}} = N - 1 @inline Base.last(r::AbstractCloseOpen{<:Integer,<:UpperBoundedInteger}) = getfield(getfield(r, :upper), :i) - One() -@inline ArrayInterface.static_last(r::CloseOpen{<:Integer,<:UpperBoundedInteger}) = - getfield(getfield(r, :upper), :i) - One() +@inline ArrayInterface.static_last( + r::CloseOpen{<:Integer,<:UpperBoundedInteger} +) = getfield(getfield(r, :upper), :i) - One() @inline Base.length(r::AbstractCloseOpen{<:Integer,<:UpperBoundedInteger}) = getfield(getfield(r, :upper), :i) - getfield(r, :start) @inline Base.length(r::AbstractCloseOpen{Zero,<:UpperBoundedInteger}) = @@ -50,13 +58,17 @@ function Loop(ls::LoopSet, ex::Expr, sym::Symbol, f, s, l, ub::Int) pushpreamble!(ls, Expr(:(=), rangesym, ex)) pushpreamble!( ls, - Expr(:(=), lensym, Expr(:call, GlobalRef(ArrayInterface, :static_length), rangesym)), + Expr( + :(=), + lensym, + Expr(:call, GlobalRef(ArrayInterface, :static_length), rangesym) + ) ) F = if f === nothing start = gensym(ssym * "_loopstart") pushpreamble!( ls, - Expr(:(=), start, Expr(:call, %, Expr(:call, lv(:first), rangesym), Int)), + Expr(:(=), start, Expr(:call, %, Expr(:call, lv(:first), rangesym), Int)) ) MaybeKnown(start, 1) else @@ -66,7 +78,7 @@ function Loop(ls::LoopSet, ex::Expr, sym::Symbol, f, s, l, ub::Int) step = gensym(ssym * "_loopstep") pushpreamble!( ls, - Expr(:(=), step, Expr(:call, %, Expr(:call, lv(:step), rangesym), Int)), + Expr(:(=), step, Expr(:call, %, Expr(:call, lv(:step), rangesym), Int)) ) MaybeKnown(step, 1) else @@ -76,7 +88,7 @@ function Loop(ls::LoopSet, ex::Expr, sym::Symbol, f, s, l, ub::Int) stop = gensym(ssym * "_loopstop") pushpreamble!( ls, - Expr(:(=), stop, Expr(:call, %, Expr(:call, lv(:last), rangesym), Int)), + Expr(:(=), stop, Expr(:call, %, Expr(:call, lv(:last), rangesym), Int)) ) MaybeKnown(stop, min(ub, 1024)) else @@ -84,7 +96,12 @@ function Loop(ls::LoopSet, ex::Expr, sym::Symbol, f, s, l, ub::Int) end loopiteratesatleastonce!(ls, Loop(sym, F, L, S, rangesym, lensym)) end -function Loop(ls::LoopSet, ex::Expr, sym::Symbol, ::Type{R}) where {R<:AbstractRange} +function Loop( + ls::LoopSet, + ex::Expr, + sym::Symbol, + ::Type{R} +) where {R<:AbstractRange} f = ArrayInterface.known_first(R) s = ArrayInterface.known_step(R) l = ArrayInterface.known_last(R) @@ -93,13 +110,20 @@ function Loop(ls::LoopSet, ex::Expr, sym::Symbol, ::Type{R}) where {R<:AbstractR end function static_loop(sym::Symbol, L::Int, S::Int, U::Int) - Loop(sym, MaybeKnown(L, 0), MaybeKnown(U, 0), MaybeKnown(S, 0), Symbol(""), Symbol("")) + Loop( + sym, + MaybeKnown(L, 0), + MaybeKnown(U, 0), + MaybeKnown(S, 0), + Symbol(""), + Symbol("") + ) end function Loop( ::LoopSet, ::Expr, sym::Symbol, - ::Type{OptionallyStaticUnitRange{StaticInt{L},StaticInt{U}}}, + ::Type{OptionallyStaticUnitRange{StaticInt{L},StaticInt{U}}} ) where {L,U} static_loop(sym, L, 1, U) end @@ -107,7 +131,13 @@ function Loop( ::LoopSet, ::Expr, sym::Symbol, - ::Type{ArrayInterface.OptionallyStaticStepRange{StaticInt{L},StaticInt{S},StaticInt{U}}}, + ::Type{ + ArrayInterface.OptionallyStaticStepRange{ + StaticInt{L}, + StaticInt{S}, + StaticInt{U} + } + } ) where {L,S,U} static_loop(sym, L, S, U) end @@ -115,12 +145,11 @@ function Loop( ::LoopSet, ::Expr, sym::Symbol, - ::Type{CO}, + ::Type{CO} ) where {L,U,CO<:AbstractCloseOpen{StaticInt{L},StaticInt{U}}} static_loop(sym, L, 1, U - 1) end - extract_loop(l) = Expr(:call, getfield, Symbol("#loop#bounds#"), l) function add_loops!(ls::LoopSet, LPSYM, LB) @@ -148,7 +177,12 @@ function add_loops!( :($getfield($getfield($getfield(var"#loop#bounds#", $i), :indices), $k)) add_loop!( ls, - Loop(ls, axisexpr, Symbol(ssym * '#' * string(k) * '#'), T.parameters[k])::Loop, + Loop( + ls, + axisexpr, + Symbol(ssym * '#' * string(k) * '#'), + T.parameters[k] + )::Loop ) end push!(ls.loopsymbol_offsets, ls.loopsymbol_offsets[end] + N) @@ -160,7 +194,7 @@ function ArrayReferenceMeta( arraysymbolinds::Vector{Symbol}, opsymbols::Vector{Symbol}, nopsv::Vector{NOpsType}, - expandedv::Vector{Bool}, + expandedv::Vector{Bool} ) # unpack the `ArrayRefStruct` # we don't want to specialize on it, as it is typed on symbols. @@ -182,7 +216,7 @@ function ArrayReferenceMeta( arraysymbolinds, opsymbols, nopsv, - expandedv, + expandedv ) end function ArrayReferenceMeta( @@ -196,7 +230,7 @@ function ArrayReferenceMeta( arraysymbolinds::Vector{Symbol}, opsymbols::Vector{Symbol}, nopsv::Vector{NOpsType}, - expandedv::Vector{Bool}, + expandedv::Vector{Bool} ) ni = filled_8byte_chunks(index_types) index_vec = Symbol[] @@ -253,11 +287,10 @@ function ArrayReferenceMeta( ArrayReferenceMeta( ArrayReference(arrayar, index_vec, offset_vec, stride_vec), loopedindex, - ptrar, + ptrar ) end - extract_varg(i) = :($getfield(var"#vargs#", $i)) # _extract(::Type{StaticInt{N}}) where {N} = N extract_gsp!(sptrs::Expr, name::Symbol) = (push!(sptrs.args, name); nothing) @@ -275,7 +308,9 @@ function loop_indexes_bit!(ls::LoopSet, ar::ArrayReferenceMeta) ind = first(getindices(ar)) ind === DISCONTIGUOUS && return first(li) || throw( - LoopError("The contiguous index of a `BitArray` shouldn't be a complex function.")ind, + LoopError( + "The contiguous index of a `BitArray` shouldn't be a complex function." + )ind ) ls.loopindexesbit[getloopid(ls, ind)] = true nothing @@ -288,13 +323,14 @@ function add_mref!( C::Int, B::Int, sp::Vector{Int}, - name::Symbol, + name::Symbol ) where {T} @assert B ≤ 0 "Batched arrays not supported yet." _add_mref!(sptrs, ls, ar, typetosym(T), C, B, sp, name) sizeof(T) end -typetosym(::Type{T}) where {T<:NativeTypes} = (VectorizationBase.JULIA_TYPES[T])::Symbol +typetosym(::Type{T}) where {T<:NativeTypes} = + (VectorizationBase.JULIA_TYPES[T])::Symbol typetosym(T) = T function _add_mref!( sptrs::Expr, @@ -304,7 +340,7 @@ function _add_mref!( C::Int, B::Int, sp::Vector{Int}, - name::Symbol, + name::Symbol ) # maybe no change needed? -- optimize common case li = ar.loopedindex @@ -333,14 +369,15 @@ function _add_mref!( for n ∈ eachindex(sp) push!(column_major.args, n) end - sitype = Expr(:curly, lv(:StrideIndex), length(sp), column_major, (C == -1 ? -1 : 1)) + sitype = + Expr(:curly, lv(:StrideIndex), length(sp), column_major, (C == -1 ? -1 : 1)) siexpr = Expr(:call, sitype, strd_tup, offsets_tup) sptr = Expr( :call, lv(:stridedpointer), Expr(:call, lv(:pointer), tmpsp), siexpr, - staticexpr(B), + staticexpr(B) ) pushpreamble!(ls, Expr(:(=), name, sptr)) @@ -373,7 +410,7 @@ function add_mref!( ::Int, ::Int, sp::Vector{Int}, - name::Symbol, + name::Symbol ) where {T,F,S,O} extract_gsp!(sptrs, name) sizeof(T) @@ -385,12 +422,12 @@ function create_mrefs!( os::Vector{Symbol}, nopsv::Vector{NOpsType}, expanded::Vector{Bool}, - ::Type{Tuple{}}, + ::Type{Tuple{}} ) length(arf) == 0 || throw( ArgumentError( - "Length of array ref vector should be 0 if there are no stridedpointers.", - ), + "Length of array ref vector should be 0 if there are no stridedpointers." + ) ) Vector{ArrayReferenceMeta}(undef, length(arf)), Int[] end @@ -404,7 +441,10 @@ function stabilize_grouped_stridedpointer_type(C, B, R) Bv[n] = B[n] Rₙ = R[n] let L::Int = length(Rₙ) - Rv[n] = (ntuple(i -> i > L ? typemax(Int) : (Rₙ[i])::Int, Val(8))::NTuple{8,Int}, L) + Rv[n] = ( + ntuple(i -> i > L ? typemax(Int) : (Rₙ[i])::Int, Val(8))::NTuple{8,Int}, + L + ) end end Cv, Bv, Rv @@ -418,7 +458,6 @@ function create_mrefs!( expanded::Vector{Bool}, @nospecialize(_::Type{GroupedStridedPointers{P,C,B,R,I,X,O}}) ) where {P,C,B,R,I,X,O} - Cv, Bv, Rv = stabilize_grouped_stridedpointer_type(C, B, R) _create_mrefs!(ls, arf, as, os, nopsv, expanded, P.parameters, Cv, Bv, Rv) end @@ -432,9 +471,10 @@ function _create_mrefs!( P::Core.SimpleVector, C::Vector{Int}, B::Vector{Int}, - R::Vector{Tuple{NTuple{8,Int},Int}}, + R::Vector{Tuple{NTuple{8,Int},Int}} ) - mrefs::Vector{ArrayReferenceMeta} = Vector{ArrayReferenceMeta}(undef, length(arf)) + mrefs::Vector{ArrayReferenceMeta} = + Vector{ArrayReferenceMeta}(undef, length(arf)) elementbytes::Vector{Int} = Vector{Int}(undef, length(arf)) sptrs = Expr(:tuple) # pushpreamble!(ls, Expr(:(=), sptrs, :(VectorizationBase.stridedpointers(getfield(vargs, 1, false))))) @@ -443,8 +483,8 @@ function _create_mrefs!( Expr( :(=), sptrs, - :(VectorizationBase.stridedpointers(getfield(var"#vargs#", 1, false))), - ), + :(VectorizationBase.stridedpointers(getfield(var"#vargs#", 1, false))) + ) ) j = 0 rank_to_sps = Vector{Tuple{Int,Vector{Int}}}(undef, length(arf)) @@ -504,7 +544,7 @@ function expandbyoffset!( indexpand::Vector{T}, inds, offsets::Vector{Int}, - expand::Bool = true, + expand::Bool = true ) where {T<:Union{Int,Tuple{Int,<:Any}}} for _ind ∈ inds ind = T === Int ? _ind : first(_ind) @@ -524,7 +564,12 @@ function expandbyoffset!( end expandbyoffset(inds::Vector{Int}, offsets::Vector{Int}, expand::Bool) = expandbyoffset!(Int[], inds, offsets, expand) -function loopindex!(idxs::Vector{Int}, ls::LoopSet, u::Unsigned, shift::Unsigned) +function loopindex!( + idxs::Vector{Int}, + ls::LoopSet, + u::Unsigned, + shift::Unsigned +) mask = (one(shift) << shift) - one(shift) # mask to zero out all but shift-bits while u != zero(u) pushfirst!(idxs, (u % typeof(shift)) & mask) @@ -534,7 +579,12 @@ function loopindex!(idxs::Vector{Int}, ls::LoopSet, u::Unsigned, shift::Unsigned end loopindex(ls::LoopSet, u::Unsigned, shift::Unsigned) = reverse!(loopindex!(Int[], ls, u, shift)) -function loopindexoffset(ls::LoopSet, u::Unsigned, li::Bool, expand::Bool = false) +function loopindexoffset( + ls::LoopSet, + u::Unsigned, + li::Bool, + expand::Bool = false +) if li shift = 0x04 offsets = ls.loopsymbol_offsets @@ -551,13 +601,27 @@ function parents_symvec(ls::LoopSet, u::Unsigned, expand, offset) end loopdependencies(ls::LoopSet, os::OperationStruct, expand = false, offset = 0) = parents_symvec(ls, os.loopdeps, expand, offset) -reduceddependencies(ls::LoopSet, os::OperationStruct, expand = false, offset = 0) = - parents_symvec(ls, os.reduceddeps, expand, offset) -childdependencies(ls::LoopSet, os::OperationStruct, expand = false, offset = 0) = - parents_symvec(ls, os.childdeps, expand, offset) +reduceddependencies( + ls::LoopSet, + os::OperationStruct, + expand = false, + offset = 0 +) = parents_symvec(ls, os.reduceddeps, expand, offset) +childdependencies( + ls::LoopSet, + os::OperationStruct, + expand = false, + offset = 0 +) = parents_symvec(ls, os.childdeps, expand, offset) # parents(ls::LoopSet, u::UInt128) = loopindexoffset(ls, u, false) -function parents(ls::LoopSet, u₀::UInt128, u₁::UInt128, u₂::UInt128, u₃::UInt128) +function parents( + ls::LoopSet, + u₀::UInt128, + u₁::UInt128, + u₂::UInt128, + u₃::UInt128 +) idxs = Int[] u₃ == zero(u₃) || loopindex!(idxs, ls, u₃, 0x0010) u₂ == zero(u₂) || loopindex!(idxs, ls, u₂, 0x0010) @@ -584,7 +648,7 @@ function isexpanded( ls::LoopSet, ops::Vector{OperationStruct}, nopsv::Vector{NOpsType}, - i::Int, + i::Int ) nops = nopsv[i] # nops isa Vector{Int} only if accesses_memory(os), which means isexpanded must be false @@ -602,7 +666,7 @@ end function mref_elbytes( os::OperationStruct, mrefs::Vector{ArrayReferenceMeta}, - elementbytes::Vector{Int}, + elementbytes::Vector{Int} ) if isload(os) | isstore(os) mrefs[os.array], elementbytes[os.array] @@ -619,7 +683,7 @@ function add_op!( i::Int, mrefs::Vector{ArrayReferenceMeta}, opsymbol, - elementbytes::Vector{Int}, + elementbytes::Vector{Int} ) os = ops[i] mref, elbytes = mref_elbytes(os, mrefs, elementbytes) @@ -640,7 +704,7 @@ function add_op!( reduceddependencies(ls, os, true), Operation[], mref, - childdependencies(ls, os, true), + childdependencies(ls, os, true) ) push!(ls.operations, op) push!(opoffsets, opoffsets[end] + 1) @@ -660,7 +724,7 @@ function add_op!( reduceddependencies(ls, os, false, offset), Operation[], mref, - childdependencies(ls, os, false, offset), + childdependencies(ls, os, false, offset) ) push!(ls.operations, op) end @@ -675,7 +739,7 @@ function add_parents_to_op!( up₂::UInt128, up₃::UInt128, k::Int, - Δ::Int, + Δ::Int ) vparents = parents(op) ops = operations(ls) @@ -701,7 +765,11 @@ function add_parents_to_op!( end end end -function add_parents_to_ops!(ls::LoopSet, ops::Vector{OperationStruct}, constoffset) +function add_parents_to_ops!( + ls::LoopSet, + ops::Vector{OperationStruct}, + constoffset +) offsets = ls.operation_offsets for i = 1:length(offsets)-1 pos = offsets[i] @@ -723,7 +791,7 @@ function add_parents_to_ops!(ls::LoopSet, ops::Vector{OperationStruct}, constoff ops[i].parents₂, ops[i].parents₃, k, - Δ, + Δ ) end end @@ -739,13 +807,23 @@ function add_ops!( opsymbols::Vector{Symbol}, constoffset::Int, nopsv::Vector{NOpsType}, - expandedv::Vector{Bool}, + expandedv::Vector{Bool} ) # @show ls.loopsymbols ls.loopsymbol_offsets for i ∈ eachindex(ops) os = ops[i] opsymbol = opsymbols[os.symid] - add_op!(ls, instr[i], ops, nopsv, expandedv, i, mrefs, opsymbol, elementbytes) + add_op!( + ls, + instr[i], + ops, + nopsv, + expandedv, + i, + mrefs, + opsymbol, + elementbytes + ) end add_parents_to_ops!(ls, ops, constoffset) # for op ∈ operations(ls) @@ -766,7 +844,11 @@ typeeltype(::Type{VectorizationBase.FastRange{T,F,S,O}}) where {T,F,S,O} = T typeeltype(::Type{T}) where {T<:Real} = T # typeeltype(::Any) = Int8 -function add_array_symbols!(ls::LoopSet, arraysymbolinds::Vector{Symbol}, offset::Int) +function add_array_symbols!( + ls::LoopSet, + arraysymbolinds::Vector{Symbol}, + offset::Int +) for as ∈ arraysymbolinds pushpreamble!(ls, Expr(:(=), as, extract_varg((offset += 1)))) end @@ -780,8 +862,13 @@ function extract_external_functions!(ls::LoopSet, offset::Int, vargs) offset += 1 instr_new = get(FUNCTIONSYMBOLS, vargs[offset], instr) if instr_new === instr - extractf = - Expr(:call, GlobalRef(Core, :getfield), Symbol("#vargs#"), offset, false) + extractf = Expr( + :call, + GlobalRef(Core, :getfield), + Symbol("#vargs#"), + offset, + false + ) pushpreamble!(ls, Expr(:(=), instr.instr, extractf)) else op.instruction = instr_new @@ -791,12 +878,18 @@ function extract_external_functions!(ls::LoopSet, offset::Int, vargs) end offset end -outer_reduct_init_typename(op::Operation) = Symbol(mangledvar(op), "#or#init#type#") +outer_reduct_init_typename(op::Operation) = + Symbol(mangledvar(op), "#or#init#type#") function extract_outerreduct_types!(ls::LoopSet, offset::Int, vargs) # for op for or ∈ ls.outer_reductions - extractt = - Expr(:call, GlobalRef(Core, :getfield), Symbol("#vargs#"), (offset += 1), false) + extractt = Expr( + :call, + GlobalRef(Core, :getfield), + Symbol("#vargs#"), + (offset += 1), + false + ) op = operations(ls)[or] if instruction(op).instr ≢ :ifelse pushpreamble!(ls, Expr(:(=), outer_reduct_init_typename(op), extractt)) @@ -805,7 +898,11 @@ function extract_outerreduct_types!(ls::LoopSet, offset::Int, vargs) pushpreamble!(ls, Expr(:(=), opextractbase, extractt)) pushpreamble!( ls, - Expr(:(=), outer_reduct_init_typename(op), Expr(:call, lv(:typeof), opextractbase)), + Expr( + :(=), + outer_reduct_init_typename(op), + Expr(:call, lv(:typeof), opextractbase) + ) ) end end @@ -819,10 +916,13 @@ function sizeofeltypes(v)::Int T = typeeltype(v[1]) sz = if ( - VectorizationBase.simd_integer_register_size() != VectorizationBase.register_size() + VectorizationBase.simd_integer_register_size() != + VectorizationBase.register_size() ) && T <: Integer # hack - (VectorizationBase.register_size() ÷ VectorizationBase.simd_integer_register_size()) * - sizeof(T) + ( + VectorizationBase.register_size() ÷ + VectorizationBase.simd_integer_register_size() + ) * sizeof(T) else sz = sizeof(T) end @@ -830,7 +930,8 @@ function sizeofeltypes(v)::Int Ttemp = typeeltype(v[i]) szᵢ = if ( - VectorizationBase.simd_integer_register_size() != VectorizationBase.register_size() + VectorizationBase.simd_integer_register_size() != + VectorizationBase.register_size() ) && T <: Integer # hack ( VectorizationBase.register_size() ÷ @@ -857,9 +958,12 @@ function avx_loopset!( AM::Vector{Any}, LPSYM::Vector{Any}, LB::Core.SimpleVector, - vargs::Core.SimpleVector, + vargs::Core.SimpleVector ) - pushpreamble!(ls, :((var"#loop#bounds#", var"#vargs#") = var"#lv#tuple#args#")) + pushpreamble!( + ls, + :((var"#loop#bounds#", var"#vargs#") = var"#lv#tuple#args#") + ) add_loops!(ls, LPSYM, LB) resize!(ls.loop_order, ls.loopsymbol_offsets[end]) arraysymbolinds = gen_array_syminds(AM) @@ -869,13 +973,30 @@ function avx_loopset!( resize!(ls.loopindexesbit, length(ls.loops)) fill!(ls.loopindexesbit, false) - mrefs, elementbytes = - create_mrefs!(ls, arf, arraysymbolinds, opsymbols, nopsv, expandedv, vargs[1]) + mrefs, elementbytes = create_mrefs!( + ls, + arf, + arraysymbolinds, + opsymbols, + nopsv, + expandedv, + vargs[1] + ) for mref ∈ mrefs push!(ls.includedactualarrays, vptr(mref)) end # extra args extraction - extractind = add_ops!(ls, instr, ops, mrefs, elementbytes, opsymbols, 1, nopsv, expandedv) + extractind = add_ops!( + ls, + instr, + ops, + mrefs, + elementbytes, + opsymbols, + 1, + nopsv, + expandedv + ) extractind = process_metadata!(ls, AM, extractind) extractind = add_array_symbols!(ls, arraysymbolinds, extractind) extractind = extract_external_functions!(ls, extractind, vargs) @@ -884,7 +1005,7 @@ function avx_loopset!( end function avx_body( ls::LoopSet, - UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt,Int,Bool}, + UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt,Int,Bool} ) inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt = UNROLL q = @@ -902,10 +1023,18 @@ function _turbo_loopset_debug( ::Val{ARF}, ::Val{AM}, ::Val{LPSYM}, - _vargs::Tuple{LB,V}, + _vargs::Tuple{LB,V} ) where {UNROLL,OPS,ARF,AM,LPSYM,LB,V} # @show OPS ARF AM LPSYM _vargs - _turbo_loopset(OPS, ARF, AM, LPSYM, _vargs[1].parameters, V.parameters, UNROLL) + _turbo_loopset( + OPS, + ARF, + AM, + LPSYM, + _vargs[1].parameters, + V.parameters, + UNROLL + ) end function tovector(@nospecialize(t)) v = Vector{Any}(undef, length(t)) @@ -926,7 +1055,7 @@ function _turbo_loopset( @nospecialize(LPSYMsv), LBsv::Core.SimpleVector, vargs::Core.SimpleVector, - UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt,Int,Bool}, + UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt,Int,Bool} ) nops = length(OPSsv) ÷ 3 instr = Instruction[Instruction(OPSsv[3i+1], OPSsv[3i+2]) for i ∈ 0:nops-1] @@ -940,27 +1069,37 @@ function _turbo_loopset( for i ∈ eachindex(arsv) arsv[i] = ARFsv[i] end - avx_loopset!(ls, instr, ops, arsv, tovector(AMsv), tovector(LPSYMsv), LBsv, vargs) + avx_loopset!( + ls, + instr, + ops, + arsv, + tovector(AMsv), + tovector(LPSYMsv), + LBsv, + vargs + ) end """ _turbo_!(unroll, ops, arf, am, lpsym, lb, vargs...) Execute an `@turbo` block. The block's code is represented via the arguments: -- `unroll` is `Val((u₁,u₂))` and specifies the loop unrolling factor(s). - These values may be supplied manually via the `unroll` keyword - of [`@turbo`](@ref). -- `ops` is `Tuple{mod1, sym1, op1, mod2, sym2, op2...}` encoding the operations of the loop. - `mod` and `sym` encode the module and symbol of the called function; `op` is an [`OperationStruct`](@ref) - encoding the details of the operation. -- `arf` is `Tuple{arf1, arf2...}`, where each `arfi` is an [`ArrayRefStruct`](@ref) encoding - an array reference. -- `am` contains miscellaneous data about the LoopSet (see `process_metadata!`) -- `lpsym` is `Tuple{:i,:j,...}`, a Tuple of the "loop symbols", i.e. the item variable `i` in `for i ∈ iter` -- `lb` is `Tuple{RngTypei,RngTypej,...}`, a Tuple encoding syntactically-knowable information about - the iterators corresponding to `lpsym`. For example, in `for i ∈ 1:n`, the `1:n` would be encoded with - `StaticLowerUnitRange(1)` because the lower bound of the iterator can be determined to be 1. -- `vargs...` holds the encoded pointers of all the arrays (see `VectorizationBase`'s various pointer types). + + - `unroll` is `Val((u₁,u₂))` and specifies the loop unrolling factor(s). + These values may be supplied manually via the `unroll` keyword + of [`@turbo`](@ref). + - `ops` is `Tuple{mod1, sym1, op1, mod2, sym2, op2...}` encoding the operations of the loop. + `mod` and `sym` encode the module and symbol of the called function; `op` is an [`OperationStruct`](@ref) + encoding the details of the operation. + - `arf` is `Tuple{arf1, arf2...}`, where each `arfi` is an [`ArrayRefStruct`](@ref) encoding + an array reference. + - `am` contains miscellaneous data about the LoopSet (see `process_metadata!`) + - `lpsym` is `Tuple{:i,:j,...}`, a Tuple of the "loop symbols", i.e. the item variable `i` in `for i ∈ iter` + - `lb` is `Tuple{RngTypei,RngTypej,...}`, a Tuple encoding syntactically-knowable information about + the iterators corresponding to `lpsym`. For example, in `for i ∈ 1:n`, the `1:n` would be encoded with + `StaticLowerUnitRange(1)` because the lower bound of the iterator can be determined to be 1. + - `vargs...` holds the encoded pointers of all the arrays (see `VectorizationBase`'s various pointer types). """ @generated function _turbo_!( ::Val{var"#UNROLL#"}, @@ -969,7 +1108,7 @@ Execute an `@turbo` block. The block's code is represented via the arguments: ::Val{var"#AM#"}, ::Val{var"#LPSYM#"}, ::Val{Tuple{var"#LB#",var"#V#"}}, - var"#flattened#var#arguments#"::Vararg{Any,var"#num#vargs#"}, + var"#flattened#var#arguments#"::Vararg{Any,var"#num#vargs#"} ) where { var"#UNROLL#", var"#OPS#", @@ -978,7 +1117,7 @@ Execute an `@turbo` block. The block's code is represented via the arguments: var"#LPSYM#", var"#LB#", var"#V#", - var"#num#vargs#", + var"#num#vargs#" } # 1 + 1 # Irrelevant line you can comment out/in to force recompilation... ls = _turbo_loopset( @@ -988,19 +1127,22 @@ Execute an `@turbo` block. The block's code is represented via the arguments: var"#LPSYM#", var"#LB#".parameters, var"#V#".parameters, - var"#UNROLL#", + var"#UNROLL#" ) pushfirst!( ls.preamble.args, :( - var"#lv#tuple#args#" = - reassemble_tuple(Tuple{var"#LB#",var"#V#"}, var"#flattened#var#arguments#") - ), + var"#lv#tuple#args#" = reassemble_tuple( + Tuple{var"#LB#",var"#V#"}, + var"#flattened#var#arguments#" + ) + ) ) post = hoist_constant_memory_accesses!(ls) # q = @show(avx_body(ls, var"#UNROLL#")); post === ls.preamble ? q : Expr(:block, q, post) q = if (var"#UNROLL#"[10] > 1) && length(var"#LPSYM#") == length(ls.loops) - inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt, wca, safe = var"#UNROLL#" + inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt, wca, safe = + var"#UNROLL#" # wrap in `var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#"` in `Expr` to homogenize types avx_threads_expr( ls, @@ -1009,7 +1151,7 @@ Execute an `@turbo` block. The block's code is represented via the arguments: :(Val{$(var"#OPS#")}()), :(Val{$(var"#ARF#")}()), :(Val{$(var"#AM#")}()), - :(Val{$(var"#LPSYM#")}()), + :(Val{$(var"#LPSYM#")}()) ) else # Main.BODY[] = avx_body(ls, var"#UNROLL#") @@ -1026,7 +1168,7 @@ end ::Val{var"#AM#"}, ::Val{var"#LPSYM#"}, ::Val{Tuple{var"#LB#",var"#V#"}}, - var"#flattened#var#arguments#"::Tuple{Vararg{Any,var"#num#vargs#"}}, + var"#flattened#var#arguments#"::Tuple{Vararg{Any,var"#num#vargs#"}} ) where { var"#UNROLL#", var"#OPS#", @@ -1035,7 +1177,7 @@ end var"#LPSYM#", var"#LB#", var"#V#", - var"#num#vargs#", + var"#num#vargs#" } 1 + 1 # Irrelevant line you can comment out/in to force recompilation... ls = _turbo_loopset( @@ -1045,19 +1187,22 @@ end var"#LPSYM#", var"#LB#".parameters, var"#V#".parameters, - var"#UNROLL#", + var"#UNROLL#" ) pushfirst!( ls.preamble.args, :( - var"#lv#tuple#args#" = - reassemble_tuple(Tuple{var"#LB#",var"#V#"}, var"#flattened#var#arguments#") - ), + var"#lv#tuple#args#" = reassemble_tuple( + Tuple{var"#LB#",var"#V#"}, + var"#flattened#var#arguments#" + ) + ) ) post = hoist_constant_memory_accesses!(ls) # q = @show(avx_body(ls, var"#UNROLL#")); post === ls.preamble ? q : Expr(:block, q, post) q = if (var"#UNROLL#"[10] > 1) && length(var"#LPSYM#") == length(ls.loops) - inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt, wca, safe = var"#UNROLL#" + inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt, wca, safe = + var"#UNROLL#" # wrap in `var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#"` in `Expr` to homogenize types avx_threads_expr( ls, @@ -1066,7 +1211,7 @@ end :(Val{$(var"#OPS#")}()), :(Val{$(var"#ARF#")}()), :(Val{$(var"#AM#")}()), - :(Val{$(var"#LPSYM#")}()), + :(Val{$(var"#LPSYM#")}()) ) else # Main.BODY[] = avx_body(ls, var"#UNROLL#") diff --git a/src/simdfunctionals/filter.jl b/src/simdfunctionals/filter.jl index 00c746602..39ab11ce7 100644 --- a/src/simdfunctionals/filter.jl +++ b/src/simdfunctionals/filter.jl @@ -1,5 +1,9 @@ -function vfilter!(f::F, x::Vector{T}, y::AbstractArray{T}) where {F,T<:NativeTypes} +function vfilter!( + f::F, + x::Vector{T}, + y::AbstractArray{T} +) where {F,T<:NativeTypes} W, Wshift = VectorizationBase.pick_vector_width_shift(T) N = length(y) Nrep = N >>> Wshift @@ -13,20 +17,31 @@ function vfilter!(f::F, x::Vector{T}, y::AbstractArray{T}) where {F,T<:NativeTyp ptr_x = pointer(x) ptr_y = pointer(y) for _ ∈ 1:Nrep - vy = VectorizationBase.__vload(ptr_y, zero_index, False(), register_size()) + vy = + VectorizationBase.__vload(ptr_y, zero_index, False(), register_size()) mask = f(vy) VectorizationBase.compressstore!( gep(ptr_x, VectorizationBase.lazymul(st, j)), vy, - mask, + mask ) ptr_y = gep(ptr_y, incr) j = vadd_nw(j, count_ones(mask)) end rem_mask = VectorizationBase.mask(T, Nrem) - vy = VectorizationBase.__vload(ptr_y, zero_index, rem_mask, False(), register_size()) + vy = VectorizationBase.__vload( + ptr_y, + zero_index, + rem_mask, + False(), + register_size() + ) mask = rem_mask & f(vy) - VectorizationBase.compressstore!(gep(ptr_x, VectorizationBase.lazymul(st, j)), vy, mask) + VectorizationBase.compressstore!( + gep(ptr_x, VectorizationBase.lazymul(st, j)), + vy, + mask + ) j = vadd_nw(j, count_ones(mask)) Base._deleteend!(x, N - j) # resize!(x, j) end diff --git a/src/simdfunctionals/map.jl b/src/simdfunctionals/map.jl index 7e63e4073..3f22f8e2c 100644 --- a/src/simdfunctionals/map.jl +++ b/src/simdfunctionals/map.jl @@ -6,7 +6,7 @@ function setup_vmap!( f::F, y::AbstractArray{T}, ::Val{true}, - args::Vararg{AbstractArray,A}, + args::Vararg{AbstractArray,A} ) where {F,T<:Base.HWReal,A} N = length(y) ptry = VectorizationBase.zstridedpointer(y) @@ -18,7 +18,8 @@ function setup_vmap!( @assert iszero(uintptry & (sizeof(T) - 1)) "The destination vector (`dest`) must be aligned to `sizeof(eltype(dest)) == $(sizeof(T))` bytes." alignment = uintptry & (register_size() - 1) if alignment > 0 - i = reinterpret(Int, W - (alignment >>> VectorizationBase.intlog2(sizeof(T)))) + i = + reinterpret(Int, W - (alignment >>> VectorizationBase.intlog2(sizeof(T)))) m = mask(T, i) if N < i m &= mask(T, N & (W - 1)) @@ -31,7 +32,7 @@ function setup_vmap!( False(), True(), False(), - register_size(), + register_size() ) gesp(ptry, (i,)), map1(gesp, ptrargs, (i,)), N - i else @@ -50,10 +51,17 @@ function map1_quote(K::Int, args::Int) end Expr(:block, Expr(:meta, :inline), t) end -@generated map1(f::F, x_1::Tuple{Vararg{Any,K}}, x_2) where {F,K} = map1_quote(K, 2) -@generated map1(f::F, x_1::Tuple{Vararg{Any,K}}, x_2, x_3) where {F,K} = map1_quote(K, 3) +@generated map1(f::F, x_1::Tuple{Vararg{Any,K}}, x_2) where {F,K} = + map1_quote(K, 2) +@generated map1(f::F, x_1::Tuple{Vararg{Any,K}}, x_2, x_3) where {F,K} = + map1_quote(K, 3) -@inline function setup_vmap!(f, y, ::Val{false}, args::Vararg{AbstractArray,A}) where {A} +@inline function setup_vmap!( + f, + y, + ::Val{false}, + args::Vararg{AbstractArray,A} +) where {A} N = length(y) ptry = VectorizationBase.zstridedpointer(y) ptrargs = map(VectorizationBase.zstridedpointer, args) @@ -64,7 +72,7 @@ function vmap_singlethread!( f::F, y::AbstractArray{T}, ::Val{NonTemporal}, - args::Vararg{AbstractArray,A}, + args::Vararg{AbstractArray,A} ) where {F,T<:NativeTypes,A,NonTemporal} ptry, ptrargs, N = setup_vmap!(f, y, Val{NonTemporal}(), args...) _vmap_singlethread!(f, ptry, Zero(), N, Val{NonTemporal}(), ptrargs) @@ -76,11 +84,11 @@ function _vmap_singlethread!( start, N, ::Val{NonTemporal}, - ptrargs::Tuple{Vararg{Any,A}}, + ptrargs::Tuple{Vararg{Any,A}} ) where {F,T,NonTemporal,A} i = convert(Int, start) V = VectorizationBase.pick_vector_width( - promote_type(T, reduce(promote_type, map(eltype, ptrargs))), + promote_type(T, reduce(promote_type, map(eltype, ptrargs))) ) W = unwrap(V) UNROLL = 4 @@ -116,22 +124,42 @@ function _vmap_singlethread!( m = mask(StaticInt(W), N & (W - 1)) vfinal = f(map1(vload, ptrargs, (MM{W}(i),), m)...) if NonTemporal - _vstore!(ptry, vfinal, (MM{W}(i),), m, True(), True(), False(), register_size()) + _vstore!( + ptry, + vfinal, + (MM{W}(i),), + m, + True(), + True(), + False(), + register_size() + ) else - _vstore!(ptry, vfinal, (MM{W}(i),), m, False(), True(), False(), register_size()) + _vstore!( + ptry, + vfinal, + (MM{W}(i),), + m, + False(), + True(), + False(), + register_size() + ) end end # end nothing end -abstract type AbstractVmapClosure{NonTemporal,F,D,N,A<:Tuple{Vararg{Any,N}}} <: Function end -struct VmapClosure{NonTemporal,F,D,N,A} <: AbstractVmapClosure{NonTemporal,F,D,N,A} +abstract type AbstractVmapClosure{NonTemporal,F,D,N,A<:Tuple{Vararg{Any,N}}} <: + Function end +struct VmapClosure{NonTemporal,F,D,N,A} <: + AbstractVmapClosure{NonTemporal,F,D,N,A} f::F function VmapClosure{NonTemporal}( f::F, ::D, - ::A, + ::A ) where {NonTemporal,F,D,N,A<:Tuple{Vararg{Any,N}}} new{NonTemporal,F,D,N,A}(f) end @@ -141,7 +169,9 @@ end # @generated function (::VmapKnownClosure{NonTemporal,F,D,N,A})(p::Ptr{UInt}) where {NonTemporal,F,D,N,A} # :(_vmap_thread_call!($(F.instance), p, $D, $A, Val{$NonTemporal}())) # end -function (m::VmapClosure{NonTemporal,F,D,N,A})(p::Ptr{UInt}) where {NonTemporal,F,D,N,A} +function (m::VmapClosure{NonTemporal,F,D,N,A})( + p::Ptr{UInt} +) where {NonTemporal,F,D,N,A} (offset, dest) = ThreadingUtilities.load(p, D, 2 * sizeof(UInt)) (offset, args) = ThreadingUtilities.load(p, A, offset) @@ -181,7 +211,7 @@ end ptry, ptrargs, start, - stop, + stop ) do p, cfunc, ptry, ptrargs, start, stop setup_thread_vmap!(p, cfunc, ptry, ptrargs, start, stop) end @@ -191,7 +221,7 @@ end f::F, ptry::D, ptrargs::A, - ::Val{NonTemporal}, + ::Val{NonTemporal} ) where {F,D<:StridedPointer,N,A<:Tuple{Vararg{Any,N}},NonTemporal} vmc = VmapClosure{NonTemporal}(f, ptry, ptrargs) @cfunction($vmc, Cvoid, (Ptr{UInt},)) @@ -201,7 +231,7 @@ function vmap_multithread!( f::F, y::AbstractArray{T}, ::Val{NonTemporal}, - args::Vararg{AbstractArray,A}, + args::Vararg{AbstractArray,A} ) where {F,T,A,NonTemporal} W, Wshift = VectorizationBase.pick_vector_width_shift(T) ptry, ptrargs, N = setup_vmap!(f, y, Val{NonTemporal}(), args...) @@ -254,16 +284,16 @@ end y::AbstractArray, ::Val{NonTemporal}, ::Val{Threaded}, - args::Vararg{AbstractArray,A}, + args::Vararg{AbstractArray,A} ) where {F,A,NonTemporal,Threaded} gc_preserve_vmap_quote(NonTemporal, Threaded, A) end - @inline _all_dense(t::Tuple{ArrayInterface.True}) = true @inline _all_dense(t::Tuple{ArrayInterface.True,ArrayInterface.True,Vararg}) = _all_dense(Base.tail(t)) -@inline _all_dense(t::Tuple{ArrayInterface.True,ArrayInterface.False,Vararg}) = false +@inline _all_dense(t::Tuple{ArrayInterface.True,ArrayInterface.False,Vararg}) = + false @inline _all_dense(t::Tuple{ArrayInterface.False,Vararg}) = false @inline all_dense() = true @inline all_dense(t::NTuple{N}) where {N} = true @@ -272,22 +302,26 @@ end @inline all_dense( A::AbstractArray, B::AbstractArray, - C::Vararg{AbstractArray,K}, + C::Vararg{AbstractArray,K} ) where {K} = all_dense(A) && all_dense(B, C...) """ vmap!(f, destination, a::AbstractArray) vmap!(f, destination, a::AbstractArray, b::AbstractArray, ...) + Vectorized-`map!`, applying `f` to batches of elements of `a` (or paired batches of `a`, `b`, ...) and storing the result in `destination`. The function `f` must accept `VectorizationBase.AbstractSIMD` inputs. Ideally, all this requires is making sure that `f` is defined to be agnostic with respect to input types, but if the function `f` contains branches or loops, more work will probably be needed. For example, a function + ```julia f(x) = x > 0 ? log(x) : inv(x) ``` + can be rewritten into + ```julia using IfElse f(x) = IfElse.ifelse(x > 0, log(x), inv(x)) @@ -298,7 +332,7 @@ function vmap!( y::AbstractArray, arg1::AbstractArray, arg2::AbstractArray, - args::Vararg{AbstractArray,A}, + args::Vararg{AbstractArray,A} ) where {F,A} if check_args(y, arg1, arg2, args...) && all_dense(y, arg1, arg2, args...) gc_preserve_vmap!(f, y, Val{false}(), Val{false}(), arg1, arg2, args...) @@ -314,12 +348,16 @@ function vmap!(f::F, y::AbstractArray, arg::AbstractArray) where {F} end end - """ vmapt!(::Function, dest, args...) + A threaded variant of [`vmap!`](@ref). """ -function vmapt!(f::F, y::AbstractArray, args::Vararg{AbstractArray,A}) where {F,A} +function vmapt!( + f::F, + y::AbstractArray, + args::Vararg{AbstractArray,A} +) where {F,A} if check_args(y, args...) && all_dense(y, args...) gc_preserve_vmap!(f, y, Val{false}(), Val{true}(), args...) else @@ -327,17 +365,23 @@ function vmapt!(f::F, y::AbstractArray, args::Vararg{AbstractArray,A}) where {F, end end - """ vmapnt!(::Function, dest, args...) + This is a vectorized map implementation using nontemporal store operations. This means that the write operations to the destination will not go to the CPU's cache. If you will not immediately be reading from these values, this can improve performance because the writes won't pollute your cache. This can especially be the case if your arguments are very long. + ```julia -julia> using LoopVectorization, BenchmarkTools -julia> x = rand(10^8); y = rand(10^8); z = similar(x); -julia> f(x,y) = exp(-0.5abs2(x - y)) -f (generic function with 1 method) +julia> f(x, y) = exp(-0.5abs2(x - y)) +using LoopVectorization, BenchmarkTools + julia> @benchmark map!(f, \$z, \$x, \$y) +x = rand(10^8); y = rand(10^8); z = similar(x); + +julia> @benchmark vmap!(f, \$z, \$x, \$y) +f (generic function with 1 method) + +julia> @benchmark vmapnt!(f, \$z, \$x, \$y) BenchmarkTools.Trial: memory estimate: 0 bytes allocs estimate: 0 @@ -349,33 +393,13 @@ BenchmarkTools.Trial: -------------- samples: 12 evals/sample: 1 -julia> @benchmark vmap!(f, \$z, \$x, \$y) -BenchmarkTools.Trial: - memory estimate: 0 bytes - allocs estimate: 0 - -------------- - minimum time: 178.147 ms (0.00% GC) - median time: 178.381 ms (0.00% GC) - mean time: 178.430 ms (0.00% GC) - maximum time: 179.054 ms (0.00% GC) - -------------- - samples: 29 - evals/sample: 1 -julia> @benchmark vmapnt!(f, \$z, \$x, \$y) -BenchmarkTools.Trial: - memory estimate: 0 bytes - allocs estimate: 0 - -------------- - minimum time: 144.183 ms (0.00% GC) - median time: 144.338 ms (0.00% GC) - mean time: 144.349 ms (0.00% GC) - maximum time: 144.641 ms (0.00% GC) - -------------- - samples: 35 - evals/sample: 1 ``` """ -function vmapnt!(f::F, y::AbstractArray, args::Vararg{AbstractArray,A}) where {F,A} +function vmapnt!( + f::F, + y::AbstractArray, + args::Vararg{AbstractArray,A} +) where {F,A} if check_args(y, args...) && all_dense(y, args...) gc_preserve_vmap!(f, y, Val{true}(), Val{false}(), args...) else @@ -385,9 +409,14 @@ end """ vmapntt!(::Function, dest, args...) + A threaded variant of [`vmapnt!`](@ref). """ -function vmapntt!(f::F, y::AbstractArray, args::Vararg{AbstractArray,A}) where {F,A} +function vmapntt!( + f::F, + y::AbstractArray, + args::Vararg{AbstractArray,A} +) where {F,A} if check_args(y, args...) && all_dense(y, args...) gc_preserve_vmap!(f, y, Val{true}(), Val{true}(), args...) else @@ -414,6 +443,7 @@ end """ vmap(f, a::AbstractArray) vmap(f, a::AbstractArray, b::AbstractArray, ...) + SIMD-vectorized `map`, applying `f` to each element of `a` (or paired elements of `a`, `b`, ...) and returning a new array. """ @@ -422,6 +452,7 @@ vmap(f::F, args::Vararg{Any,N}) where {F,N} = vmap_call(f, vmap!, args...) """ vmapt(f, a::AbstractArray) vmapt(f, a::AbstractArray, b::AbstractArray, ...) + A threaded variant of [`vmap`](@ref). """ vmapt(f::F, args::Vararg{Any,N}) where {F,N} = vmap_call(f, vmapt!, args...) @@ -429,6 +460,7 @@ vmapt(f::F, args::Vararg{Any,N}) where {F,N} = vmap_call(f, vmapt!, args...) """ vmapnt(f, a::AbstractArray) vmapnt(f, a::AbstractArray, b::AbstractArray, ...) + A "non-temporal" variant of [`vmap`](@ref). This can improve performance in cases where `destination` will not be needed soon. """ @@ -437,6 +469,7 @@ vmapnt(f::F, args::Vararg{Any,N}) where {F,N} = vmap_call(f, vmapnt!, args...) """ vmapntt(f, a::AbstractArray) vmapntt(f, a::AbstractArray, b::AbstractArray, ...) + A threaded variant of [`vmapnt`](@ref). """ vmapntt(f::F, args::Vararg{Any,N}) where {F,N} = vmap_call(f, vmapntt!, args...) diff --git a/src/simdfunctionals/mapreduce.jl b/src/simdfunctionals/mapreduce.jl index ebb625c04..af7d3c094 100644 --- a/src/simdfunctionals/mapreduce.jl +++ b/src/simdfunctionals/mapreduce.jl @@ -1,20 +1,31 @@ @inline vreduce(::typeof(+), v::VectorizationBase.AbstractSIMDVector) = vsum(v) @inline vreduce(::typeof(*), v::VectorizationBase.AbstractSIMDVector) = vprod(v) -@inline vreduce(::typeof(max), v::VectorizationBase.AbstractSIMDVector) = vmaximum(v) -@inline vreduce(::typeof(min), v::VectorizationBase.AbstractSIMDVector) = vminimum(v) -@inline vreduce(op, v::VectorizationBase.AbstractSIMDVector) = vec_vreduce(op, v) -@inline vec_reduce(op, v::VectorizationBase.AbstractSIMDVector) = vec_reduce(op, Vec(v)) +@inline vreduce(::typeof(max), v::VectorizationBase.AbstractSIMDVector) = + vmaximum(v) +@inline vreduce(::typeof(min), v::VectorizationBase.AbstractSIMDVector) = + vminimum(v) +@inline vreduce(op, v::VectorizationBase.AbstractSIMDVector) = + vec_vreduce(op, v) +@inline vec_reduce(op, v::VectorizationBase.AbstractSIMDVector) = + vec_reduce(op, Vec(v)) vec_vreduce(op, v::Vec{1}) = VectorizationBase.extractelement(v, 0) @inline function vec_vreduce(op, v::Vec{W}) where {W} - a = op(VectorizationBase.extractelement(v, 0), VectorizationBase.extractelement(v, 1)) + a = op( + VectorizationBase.extractelement(v, 0), + VectorizationBase.extractelement(v, 1) + ) for i ∈ 2:W-1 a = op(a, VectorizationBase.extractelement(v, i)) end a end -function mapreduce_simple(f::F, op::OP, args::Vararg{AbstractArray,A}) where {F,OP,A} +function mapreduce_simple( + f::F, + op::OP, + args::Vararg{AbstractArray,A} +) where {F,OP,A} ptrargs = ntuple(a -> pointer(args[a]), Val(A)) N = length(first(args)) iszero(N) && throw("Length of vector is 0!") @@ -29,16 +40,15 @@ function mapreduce_simple(f::F, op::OP, args::Vararg{AbstractArray,A}) where {F, ptrargs, VectorizationBase.lazymul.(st, i), False(), - register_size(), - )..., - ), + register_size() + )... + ) ) i += 1 end a_0 end - """ vmapreduce(f, op, A::DenseArray...) @@ -48,7 +58,7 @@ Vectorized version of `mapreduce`. Applies `f` to each element of the arrays `A` f::F, op::OP, arg1::AbstractArray{T}, - args::Vararg{AbstractArray{T},A}, + args::Vararg{AbstractArray{T},A} ) where {F,OP,T<:NativeTypes,A} if !(check_args(arg1, args...) && all_dense(arg1, args...)) return mapreduce(f, op, arg1, args...) @@ -68,7 +78,7 @@ end ::StaticInt{W}, N, ::Type{T}, - args::Vararg{AbstractArray{<:NativeTypes},A}, + args::Vararg{AbstractArray{<:NativeTypes},A} ) where {F,OP,A,W,T} ptrargs = VectorizationBase.zero_offsets.(stridedpointer.(args)) if N ≥ 4W @@ -111,7 +121,7 @@ At most one dimension may be supplied as kwarg. for (op, init) in zip((:+, :max, :min), (:zero, :typemin, :typemax)) @eval @inline function vreduce(::typeof($op), arg; dims = nothing) if !(check_args(arg) && all_dense(arg)) - return reduce($op, arg, dims = dims) + return reduce($op, arg; dims = dims) end dims === nothing && return _vreduce($op, arg) isone(ndims(arg)) && return [_vreduce($op, arg)] @@ -132,7 +142,14 @@ for (op, init) in zip((:+, :max, :min), (:zero, :typemin, :typemax)) end end - @eval @inline function _vreduce_dims!(out, ::typeof($op), Rpre, is, Rpost, arg) + @eval @inline function _vreduce_dims!( + out, + ::typeof($op), + Rpre, + is, + Rpost, + arg + ) s = $init(first(arg)) @turbo for Ipost in Rpost, Ipre in Rpre accum = s diff --git a/src/simdfunctionals/vmap_grad_forwarddiff.jl b/src/simdfunctionals/vmap_grad_forwarddiff.jl index 902cae420..39ef12c31 100644 --- a/src/simdfunctionals/vmap_grad_forwarddiff.jl +++ b/src/simdfunctionals/vmap_grad_forwarddiff.jl @@ -20,7 +20,7 @@ end ∂p::Tuple{Vararg{AbstractStridedPointer,A}}, p::AbstractStridedPointer, ∂v, - im::Vararg{Any,N}, + im::Vararg{Any,N} ) where {A,N} quote $(Expr(:meta, :inline)) diff --git a/src/simdfunctionals/vmap_grad_rrule.jl b/src/simdfunctionals/vmap_grad_rrule.jl index 84fa8ebd8..18f10ae19 100644 --- a/src/simdfunctionals/vmap_grad_rrule.jl +++ b/src/simdfunctionals/vmap_grad_rrule.jl @@ -35,7 +35,7 @@ function ∂vmap_singlethread!( f::F, ∂y::Tuple{Vararg{DenseArray{T},A}}, y::DenseArray{T}, - args::Vararg{DenseArray{<:Base.HWReal},A}, + args::Vararg{DenseArray{<:Base.HWReal},A} ) where {F,T<:Base.HWReal,A} N = length(y) ptry = VectorizationBase.zero_offsets(stridedpointer(y)) @@ -65,13 +65,12 @@ function ∂vmap_singlethread!( ptry, f(init_dual(vload.(ptrargs, ((MM{W}(i),),), m))...), (MM{W}(i),), - m, + m ) end nothing end - struct SIMDMapBack{K,T<:Tuple{Vararg{Any,K}}} jacs::T end @@ -94,14 +93,22 @@ end end end -function ChainRulesCore.rrule(::typeof(vmap), f::F, args::Vararg{Any,K}) where {F,K} +function ChainRulesCore.rrule( + ::typeof(vmap), + f::F, + args::Vararg{Any,K} +) where {F,K} out = similar(first(args)) jacs = map(similar, args) ∂vmap_singlethread!(f, jacs, out, args...) out, SIMDMapBack(jacs) end for f in (:vmapt, :vmapnt, :vmapntt) - @eval function ChainRulesCore.rrule(::typeof($f), f::F, args::Vararg{Any,K}) where {F,K} + @eval function ChainRulesCore.rrule( + ::typeof($f), + f::F, + args::Vararg{Any,K} + ) where {F,K} ChainRulesCore.rrule(typeof(vmap), f, args...) end end diff --git a/src/transforms.jl b/src/transforms.jl index 7f522cd66..4d3a6643a 100644 --- a/src/transforms.jl +++ b/src/transforms.jl @@ -33,9 +33,9 @@ function hoist_constant_vload!(ls::LoopSet, op::Operation) parents(op), loopdependencies(op), reduceddependencies(op), - name(op), + name(op) ), - elementbytes, + elementbytes ) end @@ -53,8 +53,6 @@ function return_empty_reductinit(op::Operation, var::Symbol) return op end - - function constant_symbol!(ls::LoopSet, op::Operation) # hack # relowers, but should make it work @@ -74,7 +72,10 @@ function constant_symbol!(ls::LoopSet, op::Operation) if intsz == 1 pushpreamble!(ls, Expr(:(=), symname, intval % Bool)) else - pushpreamble!(ls, Expr(:(=), symname, sizeequivalent_symint_expr(intval, signed))) + pushpreamble!( + ls, + Expr(:(=), symname, sizeequivalent_symint_expr(intval, signed)) + ) end return symname end @@ -82,7 +83,11 @@ function constant_symbol!(ls::LoopSet, op::Operation) (idcheck ≢ nothing) && ((idcheck == id) && continue) pushpreamble!( ls, - Expr(:(=), symname, Expr(:call, lv(:sizeequivalentfloat), ELTYPESYMBOL, floatval)), + Expr( + :(=), + symname, + Expr(:call, lv(:sizeequivalentfloat), ELTYPESYMBOL, floatval) + ) ) return symname end @@ -92,15 +97,24 @@ function constant_symbol!(ls::LoopSet, op::Operation) if typ == IntOrFloat pushpreamble!(ls, Expr(:(=), symname, Expr(:call, :zero, ELTYPESYMBOL))) elseif typ == HardInt - pushpreamble!(ls, Expr(:(=), symname, Expr(:call, lv(:zerointeger), ELTYPESYMBOL))) + pushpreamble!( + ls, + Expr(:(=), symname, Expr(:call, lv(:zerointeger), ELTYPESYMBOL)) + ) else#if typ == HardFloat - pushpreamble!(ls, Expr(:(=), symname, Expr(:call, lv(:zerofloat), ELTYPESYMBOL))) + pushpreamble!( + ls, + Expr(:(=), symname, Expr(:call, lv(:zerofloat), ELTYPESYMBOL)) + ) end return symname end for (id, f) ∈ ls.preamble_funcofeltypes (idcheck ≢ nothing) && ((idcheck == id) && continue) - pushpreamble!(ls, Expr(:(=), symname, Expr(:call, reduction_zero(f), ELTYPESYMBOL))) + pushpreamble!( + ls, + Expr(:(=), symname, Expr(:call, reduction_zero(f), ELTYPESYMBOL)) + ) return symname end throw("Constant operation symbol not found.") @@ -124,7 +138,7 @@ function hoist_constant_store!(q::Expr, ls::LoopSet, op::Operation) # @show last(ls.preamble.args) pushpreamble!( ls, - Expr(:(=), outer_reduct_init_typename(opr), Expr(:call, lv(:typeof), init)), + Expr(:(=), outer_reduct_init_typename(opr), Expr(:call, lv(:typeof), init)) ) qpre = Expr(:block) push!( @@ -133,8 +147,8 @@ function hoist_constant_store!(q::Expr, ls::LoopSet, op::Operation) :call, lv(:unsafe_store!), Expr(:call, lv(:pointer), op.ref.ptr), - outer_reduction_to_scalar_reduceq!(qpre, opr, init), - ), + outer_reduction_to_scalar_reduceq!(qpre, opr, init) + ) ) length(qpre.args) == 0 || pushpreamble!(ls, qpre) # creating `Expr` and pushing because `outer_reduction_to_scalar_reduceq!` uses `pushfirst!(q.args`, and we don't want it at the start of the preamble return nothing diff --git a/src/user_api_conveniences.jl b/src/user_api_conveniences.jl index d963faf1e..c4bcd59ed 100644 --- a/src/user_api_conveniences.jl +++ b/src/user_api_conveniences.jl @@ -11,7 +11,6 @@ const GEMMLOOPSET = loopset(:( end )); - # function matmul_params(rs::Int, rc::Int, cls::Int) # set_hw!(GEMMLOOPSET, rs, rc, cls, Int(cache_size(StaticInt(1))), Int(cache_size(StaticInt(2))), Int(cache_size(StaticInt(3)))) # order = choose_order(GEMMLOOPSET) @@ -24,23 +23,41 @@ function matmul_params( M = nothing, K = nothing, N = nothing, - W = 0, + W = 0 ) set_hw!(GEMMLOOPSET, rs, rc, cls) if N ≢ nothing nloop = GEMMLOOPSET.loops[1] - GEMMLOOPSET.loops[1] = - Loop(:n, MaybeKnown(1), MaybeKnown(N), MaybeKnown(1), nloop.rangesym, nloop.lensym) + GEMMLOOPSET.loops[1] = Loop( + :n, + MaybeKnown(1), + MaybeKnown(N), + MaybeKnown(1), + nloop.rangesym, + nloop.lensym + ) end if M ≢ nothing mloop = GEMMLOOPSET.loops[2] - GEMMLOOPSET.loops[2] = - Loop(:m, MaybeKnown(1), MaybeKnown(M), MaybeKnown(1), mloop.rangesym, mloop.lensym) + GEMMLOOPSET.loops[2] = Loop( + :m, + MaybeKnown(1), + MaybeKnown(M), + MaybeKnown(1), + mloop.rangesym, + mloop.lensym + ) end if K ≢ nothing kloop = GEMMLOOPSET.loops[3] - GEMMLOOPSET.loops[3] = - Loop(:k, MaybeKnown(1), MaybeKnown(K), MaybeKnown(1), kloop.rangesym, kloop.lensym) + GEMMLOOPSET.loops[3] = Loop( + :k, + MaybeKnown(1), + MaybeKnown(K), + MaybeKnown(1), + kloop.rangesym, + kloop.lensym + ) end GEMMLOOPSET.vector_width = W order = choose_order(GEMMLOOPSET) @@ -52,16 +69,17 @@ end @generated function matmul_params( ::StaticInt{RS}, ::StaticInt{RC}, - ::StaticInt{CLS}, + ::StaticInt{CLS} ) where {RS,RC,CLS} mᵣ, nᵣ = matmul_params(RS, RC, CLS) Expr( :tuple, Expr(:call, Expr(:curly, :StaticInt, mᵣ)), - Expr(:call, Expr(:curly, :StaticInt, nᵣ)), + Expr(:call, Expr(:curly, :StaticInt, nᵣ)) ) end -matmul_params() = matmul_params(register_size(), register_count(), cache_linesize()) +matmul_params() = + matmul_params(register_size(), register_count(), cache_linesize()) # function dotturbo(x,y) # s = zero(promote_type(eltype(x),eltype(y))) diff --git a/src/vectorizationbase_compat/contract_pass.jl b/src/vectorizationbase_compat/contract_pass.jl index 6381e8717..8539d80e0 100644 --- a/src/vectorizationbase_compat/contract_pass.jl +++ b/src/vectorizationbase_compat/contract_pass.jl @@ -75,7 +75,13 @@ function muladd_arguments!(argv, mod, f = first(argv)) end end -function recursive_muladd_search!(call, argv, mod, cnmul::Bool = false, csub::Bool = false) +function recursive_muladd_search!( + call, + argv, + mod, + cnmul::Bool = false, + csub::Bool = false +) if length(argv) < 3 muladd_arguments!(argv, mod) return length(call.args) == 4, cnmul, csub @@ -121,7 +127,10 @@ function recursive_muladd_search!(call, argv, mod, cnmul::Bool = false, csub::Bo if length(exargs) == 2 push!(call.args, exargs[3-i]) else - push!(call.args, append_args_skip!(Expr(:call, :add_fast), exargs, i, mod)) + push!( + call.args, + append_args_skip!(Expr(:call, :add_fast), exargs, i, mod) + ) end if issub csub = i == 1 @@ -139,12 +148,16 @@ function recursive_muladd_search!(call, argv, mod, cnmul::Bool = false, csub::Bo :call, :sub_fast, append_args_skip!(Expr(:call, :add_fast), exargs, i, mod), - call.args[4], + call.args[4] ) end else - call.args[4] = - append_args_skip!(Expr(:call, :add_fast, call.args[4]), exargs, i, mod) + call.args[4] = append_args_skip!( + Expr(:call, :add_fast, call.args[4]), + exargs, + i, + mod + ) end return true, cnmul, false end @@ -209,8 +222,7 @@ function capture_a_muladd(ex::Expr, mod) end true, call end -function capture_muladd(ex::Expr, mod) - while true +capture_muladd(ex::Expr, mod) = while true ex.head === :ref && return ex if Meta.isexpr(ex, :call, 2) if (ex.args[1] === :(-)) @@ -224,7 +236,6 @@ function capture_muladd(ex::Expr, mod) found, ex = capture_a_muladd(ex, mod) found || return ex end -end function append_update_args(f::Symbol, ex::Expr) call = Expr(:call, f) for i ∈ 2:length(ex.args) @@ -250,11 +261,34 @@ function contract!(expr::Expr, ex::Expr, i::Int, mod) else j = findfirst( Base.Fix2(===, ex.head), - (:(-=), :(/=), :(÷=), :(%=), :(^=), :(&=), :(|=), :(⊻=), :(>>>=), :(>>=), :(<<=)), + ( + :(-=), + :(/=), + :(÷=), + :(%=), + :(^=), + :(&=), + :(|=), + :(⊻=), + :(>>>=), + :(>>=), + :(<<=) + ) ) if j ≢ nothing - f = - (:sub_fast, :div_fast, :(÷), :(%), :(^), :(&), :(|), :(⊻), :(>>>), :(>>), :(<<))[j::Int] + f = ( + :sub_fast, + :div_fast, + :(÷), + :(%), + :(^), + :(&), + :(|), + :(⊻), + :(>>>), + :(>>), + :(<<) + )[j::Int] call = Expr(:call, f) append!(call.args, ex.args) expr.args[i] = ex = Expr(:(=), first(ex.args), call) diff --git a/src/vectorizationbase_compat/subsetview.jl b/src/vectorizationbase_compat/subsetview.jl index 41142f362..ced5febb5 100644 --- a/src/vectorizationbase_compat/subsetview.jl +++ b/src/vectorizationbase_compat/subsetview.jl @@ -11,7 +11,7 @@ end @generated function subsetview( ptr::AbstractStridedPointer{T,N,C,B,R,X,O}, ::StaticInt{I}, - i::Union{Integer,StaticInt}, + i::Union{Integer,StaticInt} ) where {T,N,C,B,R,X,O,I} I > N && return :ptr @assert B ≤ 0 "Batched dims not currently supported." @@ -32,31 +32,42 @@ end stridedpointer($gptr, si, StaticInt{$B}()) end end -@inline _subsetview(ptr::AbstractStridedPointer, ::StaticInt{I}, J::Tuple{}) where {I} = ptr @inline _subsetview( ptr::AbstractStridedPointer, ::StaticInt{I}, - J::Tuple{J1}, + J::Tuple{} +) where {I} = ptr +@inline _subsetview( + ptr::AbstractStridedPointer, + ::StaticInt{I}, + J::Tuple{J1} ) where {I,J1} = subsetview(ptr, StaticInt{I}(), first(J)) @inline _subsetview( ptr::AbstractStridedPointer, ::StaticInt{I}, - J::Tuple{J1,J2,Vararg}, -) where {I,J1,J2} = - _subsetview(subsetview(ptr, StaticInt{I}(), first(J)), StaticInt{I}(), Base.tail(J)) + J::Tuple{J1,J2,Vararg} +) where {I,J1,J2} = _subsetview( + subsetview(ptr, StaticInt{I}(), first(J)), + StaticInt{I}(), + Base.tail(J) +) @inline subsetview( ptr::AbstractStridedPointer, ::StaticInt{I}, - J::CartesianIndex, + J::CartesianIndex ) where {I} = _subsetview(ptr, StaticInt{I}(), Tuple(J)) -@inline _gesp(sp::VectorizationBase.FastRange, ::StaticInt{1}, i, ::StaticInt{1}) = - gesp(sp, (i,)) +@inline _gesp( + sp::VectorizationBase.FastRange, + ::StaticInt{1}, + i, + ::StaticInt{1} +) = gesp(sp, (i,)) @generated function _gesp( sp::AbstractStridedPointer{T,N}, ::StaticInt{I}, i::Union{Integer,StaticInt}, - ::StaticInt{D}, + ::StaticInt{D} ) where {I,N,T,D} t = Expr(:tuple) for j ∈ 1:I-1 diff --git a/test/manyarrayrefs.jl b/test/manyarrayrefs.jl index 5ea7ed6d8..b4ba68d0d 100644 --- a/test/manyarrayrefs.jl +++ b/test/manyarrayrefs.jl @@ -1,23 +1,23 @@ -@generated function sum_way_too_unrolled(A, ::Val{rows}, ::Val{cols}) where {rows, cols} - terms = :( 0 ) - - for i in 1:rows - for j in 1:cols - terms = :( $terms + A[$i, $j, k] ) - end +@generated function sum_way_too_unrolled(A, ::Val{rows}, ::Val{cols}) where {rows,cols} + terms = :(0) + + for i = 1:rows + for j = 1:cols + terms = :($terms + A[$i, $j, k]) end + end - quote - sum = 0.0 - @turbo for k in axes(A, 3) - sum += $terms - end - sum + quote + sum = 0.0 + @turbo for k in axes(A, 3) + sum += $terms end + sum + end end @testset "Many Array References" begin - A = rand(17, 16, 10) + A = rand(17, 16, 10) - @test isapprox(sum_way_too_unrolled(A, Val(17), Val(16)), sum(A)) + @test isapprox(sum_way_too_unrolled(A, Val(17), Val(16)), sum(A)) end diff --git a/utils/generate_costs.jl b/utils/generate_costs.jl index ebc14fdc6..14d32bcd6 100644 --- a/utils/generate_costs.jl +++ b/utils/generate_costs.jl @@ -12,7 +12,7 @@ using VectorizationBase: data :(Tuple{}), "i64", String[], - Symbol[], + Symbol[] ) end @@ -30,12 +30,13 @@ end $sideeffect_str, NTuple{$W,Core.VecElement{$T}}, Tuple{NTuple{$W,Core.VecElement{$T}}}, - VectorizationBase.data(x), - ), + VectorizationBase.data(x) + ) ) end end -@inline volatile(x::VecUnroll) = VecUnroll(VectorizationBase.fmap(volatile, data(x))) +@inline volatile(x::VecUnroll) = + VecUnroll(VectorizationBase.fmap(volatile, data(x))) @inline volatile(x::Tuple) = map(volatile, x) # @generated function volatile(x::Vec{W,T}, x::Vec{W,T}) where {W,T} # typ = VectorizationBase.LLVM_TYPES[T] @@ -89,7 +90,6 @@ end # end # end - # @generated function unrolltest!(f::F, y::AbstractVector{T}, x::AbstractVector{T}, ::Val{U}) where {F,U,T} # quote # cc = readcyclecounter() @@ -106,13 +106,22 @@ end let vx = Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...) vu2 = VectorizationBase.VecUnroll( - ntuple(_ -> Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...), Val(2)), + ntuple( + _ -> Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...), + Val(2) + ) ) vu4 = VectorizationBase.VecUnroll( - ntuple(_ -> Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...), Val(4)), + ntuple( + _ -> Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...), + Val(4) + ) ) vu8 = VectorizationBase.VecUnroll( - ntuple(_ -> Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...), Val(8)), + ntuple( + _ -> Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...), + Val(8) + ) ) for unaryf ∈ [log, log2, log10, log1p, exp, exp2, exp10, expm1, sin, cos] rt1 = unrolltest(f, vx) @@ -131,7 +140,14 @@ end let f, io = mktemp() W = Int(VectorizationBase.pick_vector_width(Float64)) - code_native(io, exp, (VecUnroll{1,W,Float64,Vec{W,Float64}},); debuginfo = :none) + code_native( + io, + exp, + (VecUnroll{1,W,Float64,Vec{W,Float64}},); + debuginfo = :none + ) close(io) - run(`llvm-mca -mcpu=$(Sys.CPU_NAME) -output-asm-variant=1 -bottleneck-analysis $f`) + run( + `llvm-mca -mcpu=$(Sys.CPU_NAME) -output-asm-variant=1 -bottleneck-analysis $f` + ) end diff --git a/utils/generate_precompiles.jl b/utils/generate_precompiles.jl index eefbc6168..a78a67527 100644 --- a/utils/generate_precompiles.jl +++ b/utils/generate_precompiles.jl @@ -1,8 +1,12 @@ using LoopVectorization, SnoopCompile LOOPVECTORIZATION_TEST = "all" -tinf = @snoopi_deep include(joinpath(pkgdir(LoopVectorization), "test", "testsetup.jl")) -tinf = @snoopi_deep include(joinpath(pkgdir(LoopVectorization), "test", "grouptests.jl")) +tinf = @snoopi_deep include( + joinpath(pkgdir(LoopVectorization), "test", "testsetup.jl") +) +tinf = @snoopi_deep include( + joinpath(pkgdir(LoopVectorization), "test", "grouptests.jl") +) ttot, pcs = SnoopCompile.parcel(tinf); @@ -31,17 +35,17 @@ blacklist = ( :tanh_fast, :check_args, :relu, - :init_dual, + :init_dual ) -filteredmethods = filter(m -> !Base.sym_in(m[2].def.name, blacklist), last(pcslv)); +filteredmethods = + filter(m -> !Base.sym_in(m[2].def.name, blacklist), last(pcslv)); length(filteredmethods); SnoopCompile.write( "/tmp/precompile_loopvec", - [LoopVectorization => (sum(first, filteredmethods), filteredmethods)], + [LoopVectorization => (sum(first, filteredmethods), filteredmethods)] ) - # pc = SnoopCompile.parcel(tinf; blacklist=["vmaterialize", "vmaterialize!", "vreduce", "Base.Broadcast.materialize", "_vreduce_dims!", "vmapreduce"]) # pcs = pc[:LoopVectorization] # open(joinpath(pkgdir, "src", "precompile.jl"), "w") do io