Skip to content

Commit

Permalink
rip out cache_size
Browse files Browse the repository at this point in the history
  • Loading branch information
chriselrod committed Aug 31, 2022
1 parent f33d08b commit def5ad1
Show file tree
Hide file tree
Showing 8 changed files with 24 additions and 48 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LoopVectorization"
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
authors = ["Chris Elrod <[email protected]>"]
version = "0.12.124"
version = "0.12.125"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand Down
12 changes: 6 additions & 6 deletions src/broadcast.jl
Original file line number Diff line number Diff line change
Expand Up @@ -548,8 +548,8 @@ end
# we have an N dimensional loop.
# need to construct the LoopSet
ls = LoopSet(Mod)
inline, u₁, u₂, v, isbroadcast, _, rs, rc, cls, l1, l2, l3, threads, warncheckarg = UNROLL
set_hw!(ls, rs, rc, cls, l1, l2, l3)
inline, u₁, u₂, v, isbroadcast, _, rs, rc, cls, threads, warncheckarg = UNROLL
set_hw!(ls, rs, rc, cls)
ls.isbroadcast = isbroadcast # maybe set `false` in a DiffEq-like `@..` macro
loopsyms = [gensym!(ls, "n") for _ ∈ 1:N]
add_broadcast_loops!(ls, loopsyms, :dest)
Expand Down Expand Up @@ -584,8 +584,8 @@ end
# we have an N dimensional loop.
# need to construct the LoopSet
ls = LoopSet(Mod)
inline, u₁, u₂, v, isbroadcast, _, rs, rc, cls, l1, l2, l3, threads, warncheckarg = UNROLL
set_hw!(ls, rs, rc, cls, l1, l2, l3)
inline, u₁, u₂, v, isbroadcast, _, rs, rc, cls, threads, warncheckarg = UNROLL
set_hw!(ls, rs, rc, cls)
ls.isbroadcast = isbroadcast # maybe set `false` in a DiffEq-like `@..` macro
loopsyms = [gensym!(ls, "n") for _ ∈ 1:N]
pushprepreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′)))
Expand Down Expand Up @@ -626,7 +626,7 @@ end
::Val{UNROLL},
::Val{dontbc}
) where {T<:NativeTypes,N,T2<:Number,Mod,UNROLL,dontbc}
inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, threads = UNROLL
inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, threads = UNROLL
quote
$(Expr(:meta, :inline))
arg = T(first(bc.args))
Expand All @@ -646,7 +646,7 @@ end
::Val{UNROLL},
::Val{dontbc}
) where {T<:NativeTypes,N,A<:AbstractArray{T,N},T2<:Number,Mod,UNROLL,dontbc}
inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, threads = UNROLL
inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, threads = UNROLL
quote
$(Expr(:meta, :inline))
arg = T(first(bc.args))
Expand Down
6 changes: 3 additions & 3 deletions src/codegen/lower_threads.jl
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ function thread_one_loops_expr(
valid_thread_loop::Vector{Bool},
ntmax::UInt,
c::Float64,
UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt},
UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt},
OPS::Expr,
ARF::Expr,
AM::Expr,
Expand Down Expand Up @@ -615,7 +615,7 @@ function thread_two_loops_expr(
valid_thread_loop::Vector{Bool},
ntmax::UInt,
c::Float64,
UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt},
UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt},
OPS::Expr,
ARF::Expr,
AM::Expr,
Expand Down Expand Up @@ -877,7 +877,7 @@ function valid_thread_loops(ls::LoopSet)
end
function avx_threads_expr(
ls::LoopSet,
UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt},
UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt},
nt::UInt,
OPS::Expr,
ARF::Expr,
Expand Down
3 changes: 1 addition & 2 deletions src/codegen/split_loops.jl
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,7 @@ function split_loopset(ls::LoopSet, ids::Vector{Int}, issecond::Bool)
# it shouldn't.
# Current behavior is incorrect when VECWIDTH chosen does actually differ between
# split loops and the loops are statically sized, because code gen will then assume it is correct...
l1, l2, l3 = cache_sze(ls)
set_hw!(ls_new, reg_size(ls), reg_count(ls), cache_lnsze(ls), l1, l2, l3)
set_hw!(ls_new, reg_size(ls), reg_count(ls), cache_lnsze(ls))
ls_new.vector_width = ls.vector_width
fill_offset_memop_collection!(ls)
# println("ls_new operations:")
Expand Down
20 changes: 3 additions & 17 deletions src/condense_loopset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -550,28 +550,17 @@ function add_grouped_strided_pointer!(extra_args::Expr, ls::LoopSet)
preserve, shouldindbyind, roots
end

# first_cache() = ifelse(gt(num_cache_levels(), StaticInt{2}()), StaticInt{2}(), StaticInt{1}())
# function _first_cache_size(::StaticInt{FCS}) where {FCS}
# L1inclusive = StaticInt{FCS}() - VectorizationBase.cache_size(One())
# ifelse(eq(first_cache(), StaticInt(2)) & VectorizationBase.cache_inclusive(StaticInt(2)), L1inclusive, StaticInt{FCS}())
# end
# _first_cache_size(::Nothing) = StaticInt(262144)
# first_cache_size() = _first_cache_size(cache_size(first_cache()))

@generated function _turbo_config_val(
::Val{CNFARG},
::StaticInt{W},
::StaticInt{RS},
::StaticInt{AR},
::StaticInt{NT},
::StaticInt{CLS},
::StaticInt{L1},
::StaticInt{L2},
::StaticInt{L3},
) where {CNFARG,W,RS,AR,CLS,L1,L2,L3,NT}
) where {CNFARG,W,RS,AR,CLS,NT}
inline, u₁, u₂, v, BROADCAST, thread = CNFARG
nt = min(thread % UInt, NT % UInt)
t = Expr(:tuple, inline, u₁, u₂, v, BROADCAST, W, RS, AR, CLS, L1, L2, L3, nt)
t = Expr(:tuple, inline, u₁, u₂, v, BROADCAST, W, RS, AR, CLS, nt)
length(CNFARG) == 7 && push!(t.args, CNFARG[7])
Expr(:call, Expr(:curly, :Val, t))
end
Expand All @@ -582,10 +571,7 @@ end
register_size(),
available_registers(),
lv_max_num_threads(),
cache_linesize(),
cache_size(StaticInt(1)),
cache_size(StaticInt(2)),
cache_size(StaticInt(3)),
cache_linesize()
)
end
function find_samename_constparent(op::Operation, opname::Symbol)
Expand Down
12 changes: 3 additions & 9 deletions src/modeling/graphs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,6 @@ mutable struct LoopSet
register_size::Int
register_count::Int
cache_linesize::Int
cache_size::Tuple{Int,Int,Int}
ureduct::Int
equalarraydims::Vector{Tuple{Vector{Symbol},Vector{Int}}}
omop::OffsetLoadCollection
Expand Down Expand Up @@ -499,11 +498,11 @@ function save_tilecost!(ls::LoopSet)
end
# ls.reg_pres[5,1] = ls.reg_pres[5,2]
end
function set_hw!(ls::LoopSet, rs::Int, rc::Int, cls::Int, l1::Int, l2::Int, l3::Int)
function set_hw!(ls::LoopSet, rs::Int, rc::Int, cls::Int)
ls.register_size = rs
ls.register_count = rc
ls.cache_linesize = cls
ls.cache_size = (l1, l2, l3)
# ls.cache_size = (l1, l2, l3)
# ls.opmask_register[] = omr
nothing
end
Expand All @@ -514,16 +513,12 @@ function set_hw!(ls::LoopSet)
ls,
Int(register_size()),
Int(available_registers()),
Int(cache_linesize()),
Int(cache_size(StaticInt(1))),
Int(cache_size(StaticInt(2))),
Int(cache_size(StaticInt(3))),
Int(cache_linesize())
)
end
reg_size(ls::LoopSet) = ls.register_size
reg_count(ls::LoopSet) = ls.register_count
cache_lnsze(ls::LoopSet) = ls.cache_linesize
cache_sze(ls::LoopSet) = ls.cache_size

pushprepreamble!(ls::LoopSet, ex) = push!(ls.prepreamble.args, ex)
function pushpreamble!(ls::LoopSet, op::Operation, v::Symbol)
Expand Down Expand Up @@ -608,7 +603,6 @@ function LoopSet(mod::Symbol)
ls.register_size = 0
ls.register_count = 0
ls.cache_linesize = 0
ls.cache_size = (0, 0, 0)
ls.ureduct = -1
ls.equalarraydims = Tuple{Vector{Symbol},Vector{Int}}[]
ls.omop = OffsetLoadCollection()
Expand Down
14 changes: 7 additions & 7 deletions src/reconstruct_loopset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -874,9 +874,9 @@ function avx_loopset!(
end
function avx_body(
ls::LoopSet,
UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt},
UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt},
)
inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, nt = UNROLL
inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt = UNROLL
q =
(iszero(u₁) & iszero(v)) ? lower_and_split_loops(ls, inline % Int) :
lower(ls, u₁ % Int, u₂ % Int, v % Int, inline % Int)
Expand Down Expand Up @@ -916,14 +916,14 @@ function _turbo_loopset(
@nospecialize(LPSYMsv),
LBsv::Core.SimpleVector,
vargs::Core.SimpleVector,
UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt},
UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt},
)
nops = length(OPSsv) ÷ 3
instr = Instruction[Instruction(OPSsv[3i+1], OPSsv[3i+2]) for i ∈ 0:nops-1]
ops = OperationStruct[OPSsv[3i] for i ∈ 1:nops]
ls = LoopSet(:LoopVectorization)
inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, nt = UNROLL
set_hw!(ls, rs, rc, cls, l1, l2, l3)
inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt = UNROLL
set_hw!(ls, rs, rc, cls)
ls.vector_width = W
ls.isbroadcast = isbroadcast
arsv = Vector{ArrayRefStruct}(undef, length(ARFsv))
Expand Down Expand Up @@ -990,11 +990,11 @@ Execute an `@turbo` block. The block's code is represented via the arguments:
post = hoist_constant_memory_accesses!(ls)
# q = @show(avx_body(ls, var"#UNROLL#")); post === ls.preamble ? q : Expr(:block, q, post)
q = if (last(var"#UNROLL#") > 1) && length(var"#LPSYM#") == length(ls.loops)
inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, nt = var"#UNROLL#"
inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt = var"#UNROLL#"
# wrap in `var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#"` in `Expr` to homogenize types
avx_threads_expr(
ls,
(inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, one(UInt)),
(inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, one(UInt)),
nt,
:(Val{$(var"#OPS#")}()),
:(Val{$(var"#ARF#")}()),
Expand Down
3 changes: 0 additions & 3 deletions src/user_api_conveniences.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,6 @@ function matmul_params(
rs,
rc,
cls,
Int(cache_size(StaticInt(1))),
Int(cache_size(StaticInt(2))),
Int(cache_size(StaticInt(3))),
)
if N ≢ nothing
nloop = GEMMLOOPSET.loops[1]
Expand Down

2 comments on commit def5ad1

@chriselrod
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/67438

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.12.125 -m "<description of version>" def5ad12f760f0c97817082141a77c642159f4f4
git push origin v0.12.125

Please sign in to comment.