LoopVectorization 0.12.166 -> 0.12.167: deprecate the package on Julia >= 1.11.

* Project.toml: bump the version.
* README.md: add a deprecation note for Julia v1.11 and above.
* ext/ForwardDiffExt.jl, ext/SpecialFunctionsExt.jl: wrap each extension body
  in `if VERSION < v"1.11-DEV" ... end`.
* src/LoopVectorization.jl: on Julia >= 1.11-DEV, define `@turbo` as
  `@inbounds @fastmath begin ... end` around its last argument, alias
  `@tturbo`/`@avx`/`@avxt` to it, alias the `v*` functionals to `map`, `map!`,
  `filter`, `filter!`, `mapreduce`, `sum`, `reduce` and `count`, and define
  `indices` methods on top of `axes`. On older versions the previous module
  body is kept, moved (re-indented) into the `else` branch.
* test/testsetup.jl: bind `@_avx` to `@turbo` on Julia >= 1.11-DEV and to
  `@_turbo` otherwise.

NOTE(review): leading indentation and blank context lines were reconstructed
from a whitespace-collapsed copy of this patch (hunk line counts were checked
against the `@@` headers) -- confirm against the working tree before applying.

diff --git a/Project.toml b/Project.toml
index e3f8e0da..bd278731 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod "]
-version = "0.12.166"
+version = "0.12.167"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
diff --git a/README.md b/README.md
index 569b8b9b..099e903e 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,10 @@
 
 [![LoopVectorization Downloads](https://shields.io/endpoint?url=https://pkgs.genieframework.com/api/v1/badge/LoopVectorization)](https://pkgs.genieframework.com?packages=LoopVectorization)
 
+# NOTE: Deprecated for Julia v1.11 and above!
+
+LoopVectorization only works for Julia 1.3 through 1.10. For 1.11 and newer, it simply uses `@inbounds @fastmath` instead, so it should still get roughly the same answer, but both runtime and compile time performance may change dramatically.
+
 ## Installation
 
 ```julia
diff --git a/ext/ForwardDiffExt.jl b/ext/ForwardDiffExt.jl
index 26227f69..f9c20ec5 100644
--- a/ext/ForwardDiffExt.jl
+++ b/ext/ForwardDiffExt.jl
@@ -1,4 +1,5 @@
 module ForwardDiffExt
+if VERSION < v"1.11-DEV"
 import ForwardDiff, ChainRulesCore
 using LoopVectorization, VectorizationBase, SLEEFPirates, ForwardDiff
 
@@ -377,3 +378,4 @@ for f in (:vmapt, :vmapnt, :vmapntt)
   end
 end
 end
+end
diff --git a/ext/SpecialFunctionsExt.jl b/ext/SpecialFunctionsExt.jl
index bfd813ee..2b1ed5fb 100644
--- a/ext/SpecialFunctionsExt.jl
+++ b/ext/SpecialFunctionsExt.jl
@@ -1,6 +1,8 @@
 module SpecialFunctionsExt
+if VERSION < v"1.11-DEV"
 using SpecialFunctions
 using LoopVectorization: VectorizationBase
 using LoopVectorization: AbstractSIMD
 @inline SpecialFunctions.erf(x::AbstractSIMD) = VectorizationBase.verf(float(x))
 end
+end
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
index 7151df75..bf5795f0 100644
--- a/src/LoopVectorization.jl
+++ b/src/LoopVectorization.jl
@@ -1,189 +1,10 @@
 module LoopVectorization
 
-if isdefined(Base, :Experimental) &&
-   isdefined(Base.Experimental, Symbol("@max_methods"))
-  @eval Base.Experimental.@max_methods 1
-end
-
-using ArrayInterface: UpTri, LoTri
-using Static: StaticInt, gt, static, Zero, One, reduce_tup
-using VectorizationBase,
-  SLEEFPirates, UnPack, OffsetArrays, StaticArrayInterface
-const ArrayInterface = StaticArrayInterface
-using LayoutPointers:
-  AbstractStridedPointer,
-  StridedPointer,
-  StridedBitPointer,
-  grouped_strided_pointer,
-  stridedpointer_preserve,
-  GroupedStridedPointers
-import LayoutPointers
-
-using SIMDTypes: NativeTypes
-
-using VectorizationBase:
-  mask,
-  MM,
-  AbstractMask,
-  data,
-  AbstractSIMD,
-  vzero,
-  offsetprecalc,
-  lazymul,
-  vadd_nw,
-  vadd_nsw,
-  vadd_nuw,
-  vsub_nw,
-  vsub_nsw,
-  vsub_nuw,
-  vmul_nw,
-  vmul_nsw,
-  vmul_nuw,
-  vfmaddsub,
-  vfmsubadd,
-  vpermilps177,
-  vmovsldup,
-  vmovshdup,
-  maybestaticfirst,
-  maybestaticlast,
-  gep,
-  gesp,
-  vfmadd,
-  vfmsub,
-  vfnmadd,
-  vfnmsub,
-  vfmadd_fast,
-  vfmsub_fast,
-  vfnmadd_fast,
-  vfnmsub_fast,
-  vfmadd231,
-  vfmsub231,
-  vfnmadd231,
-  vfnmsub231,
-  vfma_fast,
-  vmuladd_fast,
-  vdiv_fast,
-  vadd_fast,
-  vsub_fast,
-  vmul_fast,
-  relu,
-  stridedpointer,
-  _vload,
-  _vstore!,
-  reduced_add,
-  reduced_prod,
-  reduce_to_add,
-  reduce_to_prod,
-  reduced_max,
-  reduced_min,
-  reduce_to_max,
-  reduce_to_min,
-  reduced_all,
-  reduced_any,
-  reduce_to_all,
-  reduce_to_any,
-  vsum,
-  vprod,
-  vmaximum,
-  vminimum,
-  vany,
-  vall,
-  Unroll,
-  VecUnroll,
-  preserve_buffer,
-  zero_vecunroll,
-  vbroadcast_vecunroll,
-  _vzero,
-  _vbroadcast,
-  contract_add,
-  collapse_add,
-  contract_mul,
-  collapse_mul,
-  contract_max,
-  collapse_max,
-  contract_min,
-  collapse_min,
-  contract_and,
-  collapse_and,
-  contract_or,
-  collapse_or,
-  max_mask,
-  maybestaticsize,
-  zero_mask
-
-using HostCPUFeatures:
-  pick_vector_width,
-  register_size,
-  register_count,
-  has_opmask_registers,
-  unwrap,
-  get_cpu_name
-using CPUSummary: num_cores, cache_linesize, cache_size
-
-using IfElse: ifelse
-
-using ThreadingUtilities, PolyesterWeave
-using Base.Broadcast: Broadcasted, DefaultArrayStyle
-using LinearAlgebra: Adjoint, Transpose, Diagonal
-using Base.Meta: isexpr
-using DocStringExtensions
-import LinearAlgebra # for check_args
-
-using Base: unsafe_trunc
-
-using Base.FastMath:
-  add_fast,
-  sub_fast,
-  mul_fast,
-  div_fast,
-  inv_fast,
-  abs2_fast,
-  rem_fast,
-  max_fast,
-  min_fast,
-  pow_fast,
-  sqrt_fast
-using SLEEFPirates:
-  log_fast,
-  log2_fast,
-  log10_fast,
-  pow,
-  sin_fast,
-  cos_fast,
-  sincos_fast,
-  tan_fast
-
-using StaticArrayInterface:
-  OptionallyStaticUnitRange,
-  OptionallyStaticRange,
-  StaticBool,
-  True,
-  False,
-  indices,
-  static_strides,
-  offsets,
-  static_size,
-  static_axes,
-  StrideIndex
-using CloseOpenIntervals: AbstractCloseOpen, CloseOpen#, SafeCloseOpen
-# @static if VERSION ≥ v"1.6.0-rc1" #TODO: delete `else` when dropping 1.5 support
-#   using ArrayInterface: static_step
-# else # Julia 1.5 did not define `step` on CartesianIndices
-@inline static_step(x) = ArrayInterface.static_step(x)
-@inline static_step(x::CartesianIndices) =
-  VectorizationBase.CartesianVIndex(map(static_step, x.indices))
-# end
-
-export LowDimArray,
-  stridedpointer,
-  indices,
-  static,
+export indices,
   @avx,
   @avxt,
   @turbo,
   @tturbo,
-  *ˡ,
-  _turbo_!,
   vmap,
   vmap!,
   vmapt,
@@ -192,8 +13,6 @@ export LowDimArray,
   vmapnt!,
   vmapntt,
   vmapntt!,
-  tanh_fast,
-  sigmoid_fast,
   vfilter,
   vfilter!,
   vmapreduce,
@@ -201,71 +20,288 @@ export LowDimArray,
   vreduce,
   vcount
 
-const VECTORWIDTHSYMBOL, ELTYPESYMBOL, MASKSYMBOL =
-  Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##"), Symbol("##mask##")
+if VERSION >= v"1.11-DEV"
+  macro turbo(args...)
+    quote
+      @inbounds @fastmath begin
+        $(esc(last(args)))
+      end
+    end
+  end
+  const var"@tturbo" = var"@turbo"
+  const var"@avx" = var"@turbo"
+  const var"@avxt" = var"@turbo"
+  const vmap = map
+  const vmap! = map!
+  const vmapt = map
+  const vmapt! = map!
+  const vmapnt = map
+  const vmapnt! = map!
+  const vmapntt = map
+  const vmapntt! = map!
+  const vfilter = filter
+  const vfilter! = filter!
+  const vmapreduce = mapreduce
+  const vsum = sum
+  const vreduce = reduce
+  const vcount = count
+
+  indices(A::AbstractArray, i::Integer) = axes(A, i)
+  function _check_axes_match(ax::Tuple)
+    fax = first(ax)
+    foreach(Base.tail(ax)) do x
+      fax == x || throw(DimensionMismatch("Axes do not match."))
+    end
+    fax
+  end
+  indices(a::Tuple, b::Tuple) = _check_axes_match(map(axes, a, b))
+  indices(a::Tuple, b::Integer) = _check_axes_match(map(Base.Fix2(axes, b), a))
+  function indices(a::AbstractArray, b::Tuple)
+    _check_axes_match(map(Base.Fix1(axes, a), b))
+  end
+
+else
+  if isdefined(Base, :Experimental) &&
+     isdefined(Base.Experimental, Symbol("@max_methods"))
+    @eval Base.Experimental.@max_methods 1
+  end
+  export LowDimArray,
+    static, stridedpointer, *ˡ, _turbo_!, tanh_fast, sigmoid_fast
+
+  using ArrayInterface: UpTri, LoTri
+  using Static: StaticInt, gt, static, Zero, One, reduce_tup
+  using VectorizationBase,
+    SLEEFPirates, UnPack, OffsetArrays, StaticArrayInterface
+  const ArrayInterface = StaticArrayInterface
+  using LayoutPointers:
+    AbstractStridedPointer,
+    StridedPointer,
+    StridedBitPointer,
+    grouped_strided_pointer,
+    stridedpointer_preserve,
+    GroupedStridedPointers
+  import LayoutPointers
+
+  using SIMDTypes: NativeTypes
+
+  using VectorizationBase:
+    mask,
+    MM,
+    AbstractMask,
+    data,
+    AbstractSIMD,
+    vzero,
+    offsetprecalc,
+    lazymul,
+    vadd_nw,
+    vadd_nsw,
+    vadd_nuw,
+    vsub_nw,
+    vsub_nsw,
+    vsub_nuw,
+    vmul_nw,
+    vmul_nsw,
+    vmul_nuw,
+    vfmaddsub,
+    vfmsubadd,
+    vpermilps177,
+    vmovsldup,
+    vmovshdup,
+    maybestaticfirst,
+    maybestaticlast,
+    gep,
+    gesp,
+    vfmadd,
+    vfmsub,
+    vfnmadd,
+    vfnmsub,
+    vfmadd_fast,
+    vfmsub_fast,
+    vfnmadd_fast,
+    vfnmsub_fast,
+    vfmadd231,
+    vfmsub231,
+    vfnmadd231,
+    vfnmsub231,
+    vfma_fast,
+    vmuladd_fast,
+    vdiv_fast,
+    vadd_fast,
+    vsub_fast,
+    vmul_fast,
+    relu,
+    stridedpointer,
+    _vload,
+    _vstore!,
+    reduced_add,
+    reduced_prod,
+    reduce_to_add,
+    reduce_to_prod,
+    reduced_max,
+    reduced_min,
+    reduce_to_max,
+    reduce_to_min,
+    reduced_all,
+    reduced_any,
+    reduce_to_all,
+    reduce_to_any,
+    vsum,
+    vprod,
+    vmaximum,
+    vminimum,
+    vany,
+    vall,
+    Unroll,
+    VecUnroll,
+    preserve_buffer,
+    zero_vecunroll,
+    vbroadcast_vecunroll,
+    _vzero,
+    _vbroadcast,
+    contract_add,
+    collapse_add,
+    contract_mul,
+    collapse_mul,
+    contract_max,
+    collapse_max,
+    contract_min,
+    collapse_min,
+    contract_and,
+    collapse_and,
+    contract_or,
+    collapse_or,
+    max_mask,
+    maybestaticsize,
+    zero_mask
+
+  using HostCPUFeatures:
+    pick_vector_width,
+    register_size,
+    register_count,
+    has_opmask_registers,
+    unwrap,
+    get_cpu_name
+  using CPUSummary: num_cores, cache_linesize, cache_size
+
+  using IfElse: ifelse
+
+  using ThreadingUtilities, PolyesterWeave
+  using Base.Broadcast: Broadcasted, DefaultArrayStyle
+  using LinearAlgebra: Adjoint, Transpose, Diagonal
+  using Base.Meta: isexpr
+  using DocStringExtensions
+  import LinearAlgebra # for check_args
+
+  using Base: unsafe_trunc
+
+  using Base.FastMath:
+    add_fast,
+    sub_fast,
+    mul_fast,
+    div_fast,
+    inv_fast,
+    abs2_fast,
+    rem_fast,
+    max_fast,
+    min_fast,
+    pow_fast,
+    sqrt_fast
+  using SLEEFPirates:
+    log_fast,
+    log2_fast,
+    log10_fast,
+    pow,
+    sin_fast,
+    cos_fast,
+    sincos_fast,
+    tan_fast
+
+  using StaticArrayInterface:
+    OptionallyStaticUnitRange,
+    OptionallyStaticRange,
+    StaticBool,
+    True,
+    False,
+    indices,
+    static_strides,
+    offsets,
+    static_size,
+    static_axes,
+    StrideIndex
+  using CloseOpenIntervals: AbstractCloseOpen, CloseOpen#, SafeCloseOpen
+  # @static if VERSION ≥ v"1.6.0-rc1" #TODO: delete `else` when dropping 1.5 support
+  #   using ArrayInterface: static_step
+  # else # Julia 1.5 did not define `step` on CartesianIndices
+  @inline static_step(x) = ArrayInterface.static_step(x)
+  @inline static_step(x::CartesianIndices) =
+    VectorizationBase.CartesianVIndex(map(static_step, x.indices))
+  # end
 
-include("vectorizationbase_compat/contract_pass.jl")
-include("vectorizationbase_compat/subsetview.jl")
-include("getconstindexes.jl")
-include("predicates.jl")
-include("simdfunctionals/map.jl")
-include("simdfunctionals/filter.jl")
-include("modeling/costs.jl")
-include("modeling/operations.jl")
-include("modeling/graphs.jl")
-include("codegen/operation_evaluation_order.jl")
-include("parse/memory_ops_common.jl")
-include("parse/add_loads.jl")
-include("parse/add_stores.jl")
-include("parse/add_compute.jl")
-include("parse/add_constants.jl")
-include("parse/add_ifelse.jl")
-include("modeling/determinestrategy.jl")
-include("codegen/line_number_nodes.jl")
-include("codegen/loopstartstopmanager.jl")
-include("codegen/lower_compute.jl")
-include("codegen/lower_constant.jl")
-include("codegen/lower_memory_common.jl")
-include("codegen/lower_load.jl")
-include("codegen/lower_store.jl")
-include("codegen/lowering.jl")
-include("codegen/split_loops.jl")
-include("codegen/lower_threads.jl")
-include("condense_loopset.jl")
-include("transforms.jl")
-include("reconstruct_loopset.jl")
-include("constructors.jl")
-include("user_api_conveniences.jl")
-include("simdfunctionals/mapreduce.jl")
-include("simdfunctionals/count.jl")
-include("broadcast.jl")
+  const VECTORWIDTHSYMBOL, ELTYPESYMBOL, MASKSYMBOL =
+    Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##"), Symbol("##mask##")
 
-"""
-LoopVectorization provides macros and functions that combine SIMD vectorization and
-loop-reordering so as to improve performance:
+  include("vectorizationbase_compat/contract_pass.jl")
+  include("vectorizationbase_compat/subsetview.jl")
+  include("getconstindexes.jl")
+  include("predicates.jl")
+  include("simdfunctionals/map.jl")
+  include("simdfunctionals/filter.jl")
+  include("modeling/costs.jl")
+  include("modeling/operations.jl")
+  include("modeling/graphs.jl")
+  include("codegen/operation_evaluation_order.jl")
+  include("parse/memory_ops_common.jl")
+  include("parse/add_loads.jl")
+  include("parse/add_stores.jl")
+  include("parse/add_compute.jl")
+  include("parse/add_constants.jl")
+  include("parse/add_ifelse.jl")
+  include("modeling/determinestrategy.jl")
+  include("codegen/line_number_nodes.jl")
+  include("codegen/loopstartstopmanager.jl")
+  include("codegen/lower_compute.jl")
+  include("codegen/lower_constant.jl")
+  include("codegen/lower_memory_common.jl")
+  include("codegen/lower_load.jl")
+  include("codegen/lower_store.jl")
+  include("codegen/lowering.jl")
+  include("codegen/split_loops.jl")
+  include("codegen/lower_threads.jl")
+  include("condense_loopset.jl")
+  include("transforms.jl")
+  include("reconstruct_loopset.jl")
+  include("constructors.jl")
+  include("user_api_conveniences.jl")
+  include("simdfunctionals/mapreduce.jl")
+  include("simdfunctionals/count.jl")
+  include("broadcast.jl")
 
-  - [`@turbo`](@ref): transform `for`-loops and broadcasting
-  - [`vmapreduce`](@ref): vectorized version of `mapreduce`
-  - [`vreduce`](@ref): vectorized version of `reduce`
-  - [`vsum`](@ref): vectorized version of `sum`
-  - [`vmap`](@ref) and `vmap!`: vectorized version of `map` and `map!`
-  - [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!`
-  - [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!`
-  - [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!`
-"""
-LoopVectorization
+  """
+  LoopVectorization provides macros and functions that combine SIMD vectorization and
+  loop-reordering so as to improve performance:
 
-include("precompile.jl")
-# _precompile_()
+    - [`@turbo`](@ref): transform `for`-loops and broadcasting
+    - [`vmapreduce`](@ref): vectorized version of `mapreduce`
+    - [`vreduce`](@ref): vectorized version of `reduce`
+    - [`vsum`](@ref): vectorized version of `sum`
+    - [`vmap`](@ref) and `vmap!`: vectorized version of `map` and `map!`
+    - [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!`
+    - [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!`
+    - [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!`
+  """
+  LoopVectorization
 
-# _vreduce(+, Float64[1.0])
-# matmul_params(64, 32, 64)
+  include("precompile.jl")
+  # _precompile_()
 
-# import ChainRulesCore, ForwardDiff
-# include("vmap_grad.jl")
-if !isdefined(Base, :get_extension)
-  include("../ext/ForwardDiffExt.jl")
-  include("../ext/SpecialFunctionsExt.jl")
-end
+  # _vreduce(+, Float64[1.0])
+  # matmul_params(64, 32, 64)
 
+  # import ChainRulesCore, ForwardDiff
+  # include("vmap_grad.jl")
+  if !isdefined(Base, :get_extension)
+    include("../ext/ForwardDiffExt.jl")
+    include("../ext/SpecialFunctionsExt.jl")
+  end
+end # if VERSION
 end # module
diff --git a/test/testsetup.jl b/test/testsetup.jl
index b60c2022..df27ccaa 100644
--- a/test/testsetup.jl
+++ b/test/testsetup.jl
@@ -2,7 +2,12 @@ using Test
 using Pkg
 using LoopVectorization
 
+if VERSION >= v"1.11-DEV"
+const var"@_avx" = LoopVectorization.var"@turbo"
+else
 const var"@_avx" = LoopVectorization.var"@_turbo"
+end
+
 using LinearAlgebra
 
 function clenshaw(x, coeff)