diff --git a/Project.toml b/Project.toml index ea00ae606..766bb339a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "LoopVectorization" uuid = "bdcacae8-1622-11e9-2a5c-532679323890" authors = ["Chris Elrod "] -version = "0.12.124" +version = "0.12.125" [deps] ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" diff --git a/src/broadcast.jl b/src/broadcast.jl index 3e7ae1482..eb06474c3 100644 --- a/src/broadcast.jl +++ b/src/broadcast.jl @@ -548,8 +548,8 @@ end # we have an N dimensional loop. # need to construct the LoopSet ls = LoopSet(Mod) - inline, u₁, u₂, v, isbroadcast, _, rs, rc, cls, l1, l2, l3, threads, warncheckarg = UNROLL - set_hw!(ls, rs, rc, cls, l1, l2, l3) + inline, u₁, u₂, v, isbroadcast, _, rs, rc, cls, threads, warncheckarg = UNROLL + set_hw!(ls, rs, rc, cls) ls.isbroadcast = isbroadcast # maybe set `false` in a DiffEq-like `@..` macro loopsyms = [gensym!(ls, "n") for _ ∈ 1:N] add_broadcast_loops!(ls, loopsyms, :dest) @@ -584,8 +584,8 @@ end # we have an N dimensional loop. # need to construct the LoopSet ls = LoopSet(Mod) - inline, u₁, u₂, v, isbroadcast, _, rs, rc, cls, l1, l2, l3, threads, warncheckarg = UNROLL - set_hw!(ls, rs, rc, cls, l1, l2, l3) + inline, u₁, u₂, v, isbroadcast, _, rs, rc, cls, threads, warncheckarg = UNROLL + set_hw!(ls, rs, rc, cls) ls.isbroadcast = isbroadcast # maybe set `false` in a DiffEq-like `@..` macro loopsyms = [gensym!(ls, "n") for _ ∈ 1:N] pushprepreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′))) @@ -626,7 +626,7 @@ end ::Val{UNROLL}, ::Val{dontbc} ) where {T<:NativeTypes,N,T2<:Number,Mod,UNROLL,dontbc} - inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, threads = UNROLL + inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, threads = UNROLL quote $(Expr(:meta, :inline)) arg = T(first(bc.args)) @@ -646,7 +646,7 @@ end ::Val{UNROLL}, ::Val{dontbc} ) where {T<:NativeTypes,N,A<:AbstractArray{T,N},T2<:Number,Mod,UNROLL,dontbc} - inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, threads = UNROLL + inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, threads = UNROLL quote $(Expr(:meta, :inline)) arg = T(first(bc.args)) diff --git a/src/codegen/lower_threads.jl b/src/codegen/lower_threads.jl index 96950033f..84ab88201 100644 --- a/src/codegen/lower_threads.jl +++ b/src/codegen/lower_threads.jl @@ -420,7 +420,7 @@ function thread_one_loops_expr( valid_thread_loop::Vector{Bool}, ntmax::UInt, c::Float64, - UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt}, + UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt}, OPS::Expr, ARF::Expr, AM::Expr, @@ -615,7 +615,7 @@ function thread_two_loops_expr( valid_thread_loop::Vector{Bool}, ntmax::UInt, c::Float64, - UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt}, + UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt}, OPS::Expr, ARF::Expr, AM::Expr, @@ -877,7 +877,7 @@ function valid_thread_loops(ls::LoopSet) end function avx_threads_expr( ls::LoopSet, - UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt}, + UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt}, nt::UInt, OPS::Expr, ARF::Expr, diff --git a/src/codegen/split_loops.jl b/src/codegen/split_loops.jl index b8f39c76c..3440704a1 100644 --- a/src/codegen/split_loops.jl +++ b/src/codegen/split_loops.jl @@ -107,8 +107,7 @@ function split_loopset(ls::LoopSet, ids::Vector{Int}, issecond::Bool) # it shouldn't. # Current behavior is incorrect when VECWIDTH chosen does actually differ between # split loops and the loops are statically sized, because code gen will then assume it is correct... - l1, l2, l3 = cache_sze(ls) - set_hw!(ls_new, reg_size(ls), reg_count(ls), cache_lnsze(ls), l1, l2, l3) + set_hw!(ls_new, reg_size(ls), reg_count(ls), cache_lnsze(ls)) ls_new.vector_width = ls.vector_width fill_offset_memop_collection!(ls) # println("ls_new operations:") diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl index 43c93f691..d18a3ad7e 100644 --- a/src/condense_loopset.jl +++ b/src/condense_loopset.jl @@ -550,14 +550,6 @@ function add_grouped_strided_pointer!(extra_args::Expr, ls::LoopSet) preserve, shouldindbyind, roots end -# first_cache() = ifelse(gt(num_cache_levels(), StaticInt{2}()), StaticInt{2}(), StaticInt{1}()) -# function _first_cache_size(::StaticInt{FCS}) where {FCS} -# L1inclusive = StaticInt{FCS}() - VectorizationBase.cache_size(One()) -# ifelse(eq(first_cache(), StaticInt(2)) & VectorizationBase.cache_inclusive(StaticInt(2)), L1inclusive, StaticInt{FCS}()) -# end -# _first_cache_size(::Nothing) = StaticInt(262144) -# first_cache_size() = _first_cache_size(cache_size(first_cache())) - @generated function _turbo_config_val( ::Val{CNFARG}, ::StaticInt{W}, @@ -565,13 +557,10 @@ end ::StaticInt{AR}, ::StaticInt{NT}, ::StaticInt{CLS}, - ::StaticInt{L1}, - ::StaticInt{L2}, - ::StaticInt{L3}, -) where {CNFARG,W,RS,AR,CLS,L1,L2,L3,NT} +) where {CNFARG,W,RS,AR,CLS,NT} inline, u₁, u₂, v, BROADCAST, thread = CNFARG nt = min(thread % UInt, NT % UInt) - t = Expr(:tuple, inline, u₁, u₂, v, BROADCAST, W, RS, AR, CLS, L1, L2, L3, nt) + t = Expr(:tuple, inline, u₁, u₂, v, BROADCAST, W, RS, AR, CLS, nt) length(CNFARG) == 7 && push!(t.args, CNFARG[7]) Expr(:call, Expr(:curly, :Val, t)) end @@ -582,10 +571,7 @@ end register_size(), available_registers(), lv_max_num_threads(), - cache_linesize(), - cache_size(StaticInt(1)), - cache_size(StaticInt(2)), - cache_size(StaticInt(3)), + cache_linesize() ) end function find_samename_constparent(op::Operation, opname::Symbol) diff --git a/src/modeling/graphs.jl b/src/modeling/graphs.jl index da83982e7..92e5aea42 100644 --- a/src/modeling/graphs.jl +++ b/src/modeling/graphs.jl @@ -451,7 +451,6 @@ mutable struct LoopSet register_size::Int register_count::Int cache_linesize::Int - cache_size::Tuple{Int,Int,Int} ureduct::Int equalarraydims::Vector{Tuple{Vector{Symbol},Vector{Int}}} omop::OffsetLoadCollection @@ -499,11 +498,11 @@ function save_tilecost!(ls::LoopSet) end # ls.reg_pres[5,1] = ls.reg_pres[5,2] end -function set_hw!(ls::LoopSet, rs::Int, rc::Int, cls::Int, l1::Int, l2::Int, l3::Int) +function set_hw!(ls::LoopSet, rs::Int, rc::Int, cls::Int) ls.register_size = rs ls.register_count = rc ls.cache_linesize = cls - ls.cache_size = (l1, l2, l3) + # ls.cache_size = (l1, l2, l3) # ls.opmask_register[] = omr nothing end @@ -514,16 +513,12 @@ function set_hw!(ls::LoopSet) ls, Int(register_size()), Int(available_registers()), - Int(cache_linesize()), - Int(cache_size(StaticInt(1))), - Int(cache_size(StaticInt(2))), - Int(cache_size(StaticInt(3))), + Int(cache_linesize()) ) end reg_size(ls::LoopSet) = ls.register_size reg_count(ls::LoopSet) = ls.register_count cache_lnsze(ls::LoopSet) = ls.cache_linesize -cache_sze(ls::LoopSet) = ls.cache_size pushprepreamble!(ls::LoopSet, ex) = push!(ls.prepreamble.args, ex) function pushpreamble!(ls::LoopSet, op::Operation, v::Symbol) @@ -608,7 +603,6 @@ function LoopSet(mod::Symbol) ls.register_size = 0 ls.register_count = 0 ls.cache_linesize = 0 - ls.cache_size = (0, 0, 0) ls.ureduct = -1 ls.equalarraydims = Tuple{Vector{Symbol},Vector{Int}}[] ls.omop = OffsetLoadCollection() diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl index ee327cdbf..fab8f53ed 100644 --- a/src/reconstruct_loopset.jl +++ b/src/reconstruct_loopset.jl @@ -874,9 +874,9 @@ function avx_loopset!( end function avx_body( ls::LoopSet, - UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt}, + UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt}, ) - inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, nt = UNROLL + inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt = UNROLL q = (iszero(u₁) & iszero(v)) ? lower_and_split_loops(ls, inline % Int) : lower(ls, u₁ % Int, u₂ % Int, v % Int, inline % Int) @@ -916,14 +916,14 @@ function _turbo_loopset( @nospecialize(LPSYMsv), LBsv::Core.SimpleVector, vargs::Core.SimpleVector, - UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt}, + UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt}, ) nops = length(OPSsv) ÷ 3 instr = Instruction[Instruction(OPSsv[3i+1], OPSsv[3i+2]) for i ∈ 0:nops-1] ops = OperationStruct[OPSsv[3i] for i ∈ 1:nops] ls = LoopSet(:LoopVectorization) - inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, nt = UNROLL - set_hw!(ls, rs, rc, cls, l1, l2, l3) + inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt = UNROLL + set_hw!(ls, rs, rc, cls) ls.vector_width = W ls.isbroadcast = isbroadcast arsv = Vector{ArrayRefStruct}(undef, length(ARFsv)) @@ -990,11 +990,11 @@ Execute an `@turbo` block. The block's code is represented via the arguments: post = hoist_constant_memory_accesses!(ls) # q = @show(avx_body(ls, var"#UNROLL#")); post === ls.preamble ? q : Expr(:block, q, post) q = if (last(var"#UNROLL#") > 1) && length(var"#LPSYM#") == length(ls.loops) - inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, nt = var"#UNROLL#" + inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt = var"#UNROLL#" # wrap in `var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#"` in `Expr` to homogenize types avx_threads_expr( ls, - (inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, one(UInt)), + (inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, one(UInt)), nt, :(Val{$(var"#OPS#")}()), :(Val{$(var"#ARF#")}()), diff --git a/src/user_api_conveniences.jl b/src/user_api_conveniences.jl index 350bd4d6a..bc558b9e4 100644 --- a/src/user_api_conveniences.jl +++ b/src/user_api_conveniences.jl @@ -31,9 +31,6 @@ function matmul_params( rs, rc, cls, - Int(cache_size(StaticInt(1))), - Int(cache_size(StaticInt(2))), - Int(cache_size(StaticInt(3))), ) if N ≢ nothing nloop = GEMMLOOPSET.loops[1]