From 1af836173776f7378cd5b5479aa136cecc93ee35 Mon Sep 17 00:00:00 2001 From: Chris Elrod Date: Tue, 26 Mar 2024 15:55:36 -0400 Subject: [PATCH] Clean JET.@report_call --- Project.toml | 2 +- ext/ForwardDiffExt.jl | 2 - ext/SpecialFunctionsExt.jl | 2 - src/LoopVectorization.jl | 494 +++++++++++++++++-------------------- src/condense_loopset.jl | 18 +- test/testsetup.jl | 5 - 6 files changed, 243 insertions(+), 280 deletions(-) diff --git a/Project.toml b/Project.toml index 21fd26c38..2d028c05b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "LoopVectorization" uuid = "bdcacae8-1622-11e9-2a5c-532679323890" authors = ["Chris Elrod "] -version = "0.12.168" +version = "0.12.169" [deps] ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" diff --git a/ext/ForwardDiffExt.jl b/ext/ForwardDiffExt.jl index f9c20ec54..26227f694 100644 --- a/ext/ForwardDiffExt.jl +++ b/ext/ForwardDiffExt.jl @@ -1,5 +1,4 @@ module ForwardDiffExt -if VERSION < v"1.11-DEV" import ForwardDiff, ChainRulesCore using LoopVectorization, VectorizationBase, SLEEFPirates, ForwardDiff @@ -378,4 +377,3 @@ for f in (:vmapt, :vmapnt, :vmapntt) end end end -end diff --git a/ext/SpecialFunctionsExt.jl b/ext/SpecialFunctionsExt.jl index 2b1ed5fb5..bfd813ee4 100644 --- a/ext/SpecialFunctionsExt.jl +++ b/ext/SpecialFunctionsExt.jl @@ -1,8 +1,6 @@ module SpecialFunctionsExt -if VERSION < v"1.11-DEV" using SpecialFunctions using LoopVectorization: VectorizationBase using LoopVectorization: AbstractSIMD @inline SpecialFunctions.erf(x::AbstractSIMD) = VectorizationBase.verf(float(x)) end -end diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl index 34583f096..4af0f832c 100644 --- a/src/LoopVectorization.jl +++ b/src/LoopVectorization.jl @@ -20,288 +20,246 @@ export indices, vreduce, vcount -@static if VERSION >= v"1.11-DEV" - macro turbo(args...) - quote - @inbounds @fastmath begin - $(esc(last(args))) - end - end - end - const var"@tturbo" = var"@turbo" - const var"@avx" = var"@turbo" - const var"@avxt" = var"@turbo" - const vmap = map - const vmap! = map! - const vmapt = map - const vmapt! = map! - const vmapnt = map - const vmapnt! = map! - const vmapntt = map - const vmapntt! = map! - const vfilter = filter - const vfilter! = filter! - const vmapreduce = mapreduce - const vsum = sum - const vreduce = reduce - const vcount = count +if isdefined(Base, :Experimental) && + isdefined(Base.Experimental, Symbol("@max_methods")) + @eval Base.Experimental.@max_methods 1 +end +export LowDimArray, + static, stridedpointer, *ˡ, _turbo_!, tanh_fast, sigmoid_fast - indices(A::AbstractArray, i::Integer) = axes(A, i) - function _check_axes_match(ax::Tuple) - fax = first(ax) - foreach(Base.tail(ax)) do x - fax == x || throw(DimensionMismatch("Axes do not match.")) - end - fax - end - indices(a::Tuple, b::Tuple) = _check_axes_match(map(axes, a, b)) - indices(a::Tuple, b::Integer) = _check_axes_match(map(Base.Fix2(axes, b), a)) - function indices(a::AbstractArray, b::Tuple) - _check_axes_match(map(Base.Fix1(axes, a), b)) - end +using ArrayInterface: UpTri, LoTri +using Static: StaticInt, gt, static, Zero, One, reduce_tup +using VectorizationBase, + SLEEFPirates, UnPack, OffsetArrays, StaticArrayInterface +const ArrayInterface = StaticArrayInterface +using LayoutPointers: + AbstractStridedPointer, + StridedPointer, + StridedBitPointer, + grouped_strided_pointer, + stridedpointer_preserve, + GroupedStridedPointers +import LayoutPointers -else - if isdefined(Base, :Experimental) && - isdefined(Base.Experimental, Symbol("@max_methods")) - @eval Base.Experimental.@max_methods 1 - end - export LowDimArray, - static, stridedpointer, *ˡ, _turbo_!, tanh_fast, sigmoid_fast +using SIMDTypes: NativeTypes - using ArrayInterface: UpTri, LoTri - using Static: StaticInt, gt, static, Zero, One, reduce_tup - using VectorizationBase, - SLEEFPirates, UnPack, OffsetArrays, StaticArrayInterface - const ArrayInterface = StaticArrayInterface - using LayoutPointers: - AbstractStridedPointer, - StridedPointer, - StridedBitPointer, - grouped_strided_pointer, - stridedpointer_preserve, - GroupedStridedPointers - import LayoutPointers - - using SIMDTypes: NativeTypes - - using VectorizationBase: - mask, - MM, - AbstractMask, - data, - AbstractSIMD, - vzero, - offsetprecalc, - lazymul, - vadd_nw, - vadd_nsw, - vadd_nuw, - vsub_nw, - vsub_nsw, - vsub_nuw, - vmul_nw, - vmul_nsw, - vmul_nuw, - vfmaddsub, - vfmsubadd, - vpermilps177, - vmovsldup, - vmovshdup, - maybestaticfirst, - maybestaticlast, - gep, - gesp, - vfmadd, - vfmsub, - vfnmadd, - vfnmsub, - vfmadd_fast, - vfmsub_fast, - vfnmadd_fast, - vfnmsub_fast, - vfmadd231, - vfmsub231, - vfnmadd231, - vfnmsub231, - vfma_fast, - vmuladd_fast, - vdiv_fast, - vadd_fast, - vsub_fast, - vmul_fast, - relu, - stridedpointer, - _vload, - _vstore!, - reduced_add, - reduced_prod, - reduce_to_add, - reduce_to_prod, - reduced_max, - reduced_min, - reduce_to_max, - reduce_to_min, - reduced_all, - reduced_any, - reduce_to_all, - reduce_to_any, - vsum, - vprod, - vmaximum, - vminimum, - vany, - vall, - Unroll, - VecUnroll, - preserve_buffer, - zero_vecunroll, - vbroadcast_vecunroll, - _vzero, - _vbroadcast, - contract_add, - collapse_add, - contract_mul, - collapse_mul, - contract_max, - collapse_max, - contract_min, - collapse_min, - contract_and, - collapse_and, - contract_or, - collapse_or, - max_mask, - maybestaticsize, - zero_mask +using VectorizationBase: + mask, + MM, + AbstractMask, + data, + AbstractSIMD, + vzero, + offsetprecalc, + lazymul, + vadd_nw, + vadd_nsw, + vadd_nuw, + vsub_nw, + vsub_nsw, + vsub_nuw, + vmul_nw, + vmul_nsw, + vmul_nuw, + vfmaddsub, + vfmsubadd, + vpermilps177, + vmovsldup, + vmovshdup, + maybestaticfirst, + maybestaticlast, + gep, + gesp, + vfmadd, + vfmsub, + vfnmadd, + vfnmsub, + vfmadd_fast, + vfmsub_fast, + vfnmadd_fast, + vfnmsub_fast, + vfmadd231, + vfmsub231, + vfnmadd231, + vfnmsub231, + vfma_fast, + vmuladd_fast, + vdiv_fast, + vadd_fast, + vsub_fast, + vmul_fast, + relu, + stridedpointer, + _vload, + _vstore!, + reduced_add, + reduced_prod, + reduce_to_add, + reduce_to_prod, + reduced_max, + reduced_min, + reduce_to_max, + reduce_to_min, + reduced_all, + reduced_any, + reduce_to_all, + reduce_to_any, + vsum, + vprod, + vmaximum, + vminimum, + vany, + vall, + Unroll, + VecUnroll, + preserve_buffer, + zero_vecunroll, + vbroadcast_vecunroll, + _vzero, + _vbroadcast, + contract_add, + collapse_add, + contract_mul, + collapse_mul, + contract_max, + collapse_max, + contract_min, + collapse_min, + contract_and, + collapse_and, + contract_or, + collapse_or, + max_mask, + maybestaticsize, + zero_mask - using HostCPUFeatures: - pick_vector_width, - register_size, - register_count, - has_opmask_registers, - unwrap, - get_cpu_name - using CPUSummary: num_cores, cache_linesize, cache_size +using HostCPUFeatures: + pick_vector_width, + register_size, + register_count, + has_opmask_registers, + unwrap, + get_cpu_name +using CPUSummary: num_cores, cache_linesize, cache_size - using IfElse: ifelse +using IfElse: ifelse - using ThreadingUtilities, PolyesterWeave - using Base.Broadcast: Broadcasted, DefaultArrayStyle - using LinearAlgebra: Adjoint, Transpose, Diagonal - using Base.Meta: isexpr - using DocStringExtensions - import LinearAlgebra # for check_args +using ThreadingUtilities, PolyesterWeave +using Base.Broadcast: Broadcasted, DefaultArrayStyle +using LinearAlgebra: Adjoint, Transpose, Diagonal +using Base.Meta: isexpr +using DocStringExtensions +import LinearAlgebra # for check_args - using Base: unsafe_trunc +using Base: unsafe_trunc - using Base.FastMath: - add_fast, - sub_fast, - mul_fast, - div_fast, - inv_fast, - abs2_fast, - rem_fast, - max_fast, - min_fast, - pow_fast, - sqrt_fast - using SLEEFPirates: - log_fast, - log2_fast, - log10_fast, - pow, - sin_fast, - cos_fast, - sincos_fast, - tan_fast +using Base.FastMath: + add_fast, + sub_fast, + mul_fast, + div_fast, + inv_fast, + abs2_fast, + rem_fast, + max_fast, + min_fast, + pow_fast, + sqrt_fast +using SLEEFPirates: + log_fast, + log2_fast, + log10_fast, + pow, + sin_fast, + cos_fast, + sincos_fast, + tan_fast - using StaticArrayInterface: - OptionallyStaticUnitRange, - OptionallyStaticRange, - StaticBool, - True, - False, - indices, - static_strides, - offsets, - static_size, - static_axes, - StrideIndex - using CloseOpenIntervals: AbstractCloseOpen, CloseOpen#, SafeCloseOpen - # @static if VERSION ≥ v"1.6.0-rc1" #TODO: delete `else` when dropping 1.5 support - # using ArrayInterface: static_step - # else # Julia 1.5 did not define `step` on CartesianIndices - @inline static_step(x) = ArrayInterface.static_step(x) - @inline static_step(x::CartesianIndices) = - VectorizationBase.CartesianVIndex(map(static_step, x.indices)) - # end +using StaticArrayInterface: + OptionallyStaticUnitRange, + OptionallyStaticRange, + StaticBool, + True, + False, + indices, + static_strides, + offsets, + static_size, + static_axes, + StrideIndex +using CloseOpenIntervals: AbstractCloseOpen, CloseOpen#, SafeCloseOpen +# @static if VERSION ≥ v"1.6.0-rc1" #TODO: delete `else` when dropping 1.5 support +# using ArrayInterface: static_step +# else # Julia 1.5 did not define `step` on CartesianIndices +@inline static_step(x) = ArrayInterface.static_step(x) +@inline static_step(x::CartesianIndices) = + VectorizationBase.CartesianVIndex(map(static_step, x.indices)) +# end - const VECTORWIDTHSYMBOL, ELTYPESYMBOL, MASKSYMBOL = - Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##"), Symbol("##mask##") +const VECTORWIDTHSYMBOL, ELTYPESYMBOL, MASKSYMBOL = + Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##"), Symbol("##mask##") - include("vectorizationbase_compat/contract_pass.jl") - include("vectorizationbase_compat/subsetview.jl") - include("getconstindexes.jl") - include("predicates.jl") - include("simdfunctionals/map.jl") - include("simdfunctionals/filter.jl") - include("modeling/costs.jl") - include("modeling/operations.jl") - include("modeling/graphs.jl") - include("codegen/operation_evaluation_order.jl") - include("parse/memory_ops_common.jl") - include("parse/add_loads.jl") - include("parse/add_stores.jl") - include("parse/add_compute.jl") - include("parse/add_constants.jl") - include("parse/add_ifelse.jl") - include("modeling/determinestrategy.jl") - include("codegen/line_number_nodes.jl") - include("codegen/loopstartstopmanager.jl") - include("codegen/lower_compute.jl") - include("codegen/lower_constant.jl") - include("codegen/lower_memory_common.jl") - include("codegen/lower_load.jl") - include("codegen/lower_store.jl") - include("codegen/lowering.jl") - include("codegen/split_loops.jl") - include("codegen/lower_threads.jl") - include("condense_loopset.jl") - include("transforms.jl") - include("reconstruct_loopset.jl") - include("constructors.jl") - include("user_api_conveniences.jl") - include("simdfunctionals/mapreduce.jl") - include("simdfunctionals/count.jl") - include("broadcast.jl") +include("vectorizationbase_compat/contract_pass.jl") +include("vectorizationbase_compat/subsetview.jl") +include("getconstindexes.jl") +include("predicates.jl") +include("simdfunctionals/map.jl") +include("simdfunctionals/filter.jl") +include("modeling/costs.jl") +include("modeling/operations.jl") +include("modeling/graphs.jl") +include("codegen/operation_evaluation_order.jl") +include("parse/memory_ops_common.jl") +include("parse/add_loads.jl") +include("parse/add_stores.jl") +include("parse/add_compute.jl") +include("parse/add_constants.jl") +include("parse/add_ifelse.jl") +include("modeling/determinestrategy.jl") +include("codegen/line_number_nodes.jl") +include("codegen/loopstartstopmanager.jl") +include("codegen/lower_compute.jl") +include("codegen/lower_constant.jl") +include("codegen/lower_memory_common.jl") +include("codegen/lower_load.jl") +include("codegen/lower_store.jl") +include("codegen/lowering.jl") +include("codegen/split_loops.jl") +include("codegen/lower_threads.jl") +include("condense_loopset.jl") +include("transforms.jl") +include("reconstruct_loopset.jl") +include("constructors.jl") +include("user_api_conveniences.jl") +include("simdfunctionals/mapreduce.jl") +include("simdfunctionals/count.jl") +include("broadcast.jl") - """ - LoopVectorization provides macros and functions that combine SIMD vectorization and - loop-reordering so as to improve performance: +""" +LoopVectorization provides macros and functions that combine SIMD vectorization and +loop-reordering so as to improve performance: - - [`@turbo`](@ref): transform `for`-loops and broadcasting - - [`vmapreduce`](@ref): vectorized version of `mapreduce` - - [`vreduce`](@ref): vectorized version of `reduce` - - [`vsum`](@ref): vectorized version of `sum` - - [`vmap`](@ref) and `vmap!`: vectorized version of `map` and `map!` - - [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!` - - [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!` - - [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!` - """ - LoopVectorization + - [`@turbo`](@ref): transform `for`-loops and broadcasting + - [`vmapreduce`](@ref): vectorized version of `mapreduce` + - [`vreduce`](@ref): vectorized version of `reduce` + - [`vsum`](@ref): vectorized version of `sum` + - [`vmap`](@ref) and `vmap!`: vectorized version of `map` and `map!` + - [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!` + - [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!` + - [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!` +""" +LoopVectorization - include("precompile.jl") - # _precompile_() +include("precompile.jl") +# _precompile_() - # _vreduce(+, Float64[1.0]) - # matmul_params(64, 32, 64) +# _vreduce(+, Float64[1.0]) +# matmul_params(64, 32, 64) - # import ChainRulesCore, ForwardDiff - # include("vmap_grad.jl") - if !isdefined(Base, :get_extension) - include("../ext/ForwardDiffExt.jl") - include("../ext/SpecialFunctionsExt.jl") - end -end # if VERSION +# import ChainRulesCore, ForwardDiff +# include("vmap_grad.jl") +if !isdefined(Base, :get_extension) + include("../ext/ForwardDiffExt.jl") + include("../ext/SpecialFunctionsExt.jl") +end end # module diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl index fd0411e8a..45b7aa68b 100644 --- a/src/condense_loopset.jl +++ b/src/condense_loopset.jl @@ -369,9 +369,19 @@ function add_external_functions!(q::Expr, ls::LoopSet) end end +# _any(f, x::Tuple{}) = false +# _any(f, x::Tuple{T}) where {T} = f(@inbounds x[1])::Bool +# _any(f, x::Tuple{T0,T1,Vararg}) where {T0,T1} = f(@inbounds x[1])::Bool || _any(f,Base.tail(x))::Bool + +_any_empty(x::Tuple{}) = false +_any_empty(x::Tuple{T}) where {T} = isempty(@inbounds x[1])::Bool +_any_empty(x::Tuple{T0,T1,Vararg}) where {T0,T1} = + isempty(@inbounds x[1])::Bool || _any_empty(Base.tail(x))::Bool + function check_if_empty(ls::LoopSet, q::Expr) lb = loop_boundaries(ls, fill(false, length(ls.loops))) - Expr(:if, Expr(:call, :!, Expr(:call, :any, :isempty, lb)), q) + # Expr(:if, Expr(:call, :!, Expr(:call, _any, :isempty, lb)), q) + Expr(:if, Expr(:call, :!, Expr(:call, _any_empty, lb)), q) end val(x) = Expr(:call, Expr(:curly, :Val, x)) @@ -1164,5 +1174,9 @@ function setup_call( end pushprepreamble!(ls, Expr(:if, call_check, call, argfailure)) prepend_lnns!(ls.prepreamble, lnns) - return ls.prepreamble + return quote + let + $(ls.prepreamble) + end + end end diff --git a/test/testsetup.jl b/test/testsetup.jl index df27ccaab..b60c20220 100644 --- a/test/testsetup.jl +++ b/test/testsetup.jl @@ -2,12 +2,7 @@ using Test using Pkg using LoopVectorization -if VERSION >= v"1.11-DEV" -const var"@_avx" = LoopVectorization.var"@turbo" -else const var"@_avx" = LoopVectorization.var"@_turbo" -end - using LinearAlgebra function clenshaw(x, coeff)