Purge num_threads (#454)

JuliaSIMD · chriselrod · Jan 4, 2023 · commit ac31711 · 1 parent 35f8310

Showing 8 changed files with 117 additions and 105 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <[email protected]>"]
-version = "0.12.143"
+version = "0.12.144"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -34,7 +34,7 @@ ArrayInterface = "6"
 ArrayInterfaceCore = "0.1.5"
 ArrayInterfaceOffsetArrays = "0.1.2"
 ArrayInterfaceStaticArrays = "0.1.2"
-CPUSummary = "0.1.3 - 0.1.8, 0.1.11"
+CPUSummary = "0.1.3 - 0.1.8, 0.1.11, 0.2.1"
 ChainRulesCore = "1"
 CloseOpenIntervals = "0.1.10"
 DocStringExtensions = "0.8, 0.9"
@@ -43,7 +43,7 @@ HostCPUFeatures = "0.1.10"
 IfElse = "0.1"
 LayoutPointers = "0.1.11"
 OffsetArrays = "1.4.1"
-PolyesterWeave = "0.1.10"
+PolyesterWeave = "0.1.10, 0.2"
 SIMDDualNumbers = "0.1"
 SIMDTypes = "0.1"
 SLEEFPirates = "0.6.23"

diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
@@ -105,8 +105,13 @@ using VectorizationBase:
   maybestaticsize#,zero_mask
 
 using HostCPUFeatures:
-  pick_vector_width, register_size, register_count, has_opmask_registers, unwrap, get_cpu_name
-using CPUSummary: num_threads, num_cores, cache_linesize, cache_size
+  pick_vector_width,
+  register_size,
+  register_count,
+  has_opmask_registers,
+  unwrap,
+  get_cpu_name
+using CPUSummary: num_cores, cache_linesize, cache_size
 
 
 using IfElse: ifelse

diff --git a/src/broadcast.jl b/src/broadcast.jl
@@ -534,7 +534,7 @@ end
   ::Val{UNROLL},
   ::Val{dontbc},
 ) where {T<:NativeTypes,N,BC<:Union{Broadcasted,Product},Mod,UNROLL,dontbc}
-  2 + 1
+  # 2 + 1
   # we have an N dimensional loop.
   # need to construct the LoopSet
   ls = LoopSet(Mod)

diff --git a/src/codegen/lower_threads.jl b/src/codegen/lower_threads.jl
@@ -64,14 +64,6 @@ struct StaticType{T} end
   )
 end
 
-# function approx_cbrt(x)
-#     s = significand(x)
-#     e = exponent(x)
-
-#     # 40 + 0.00020833333333333335*(x-64000)  -2.1701388888888896e-9*(x-64000)^2*0.5 + 5.6514033564814844e-14 * (x-64000)^3/6
-# end
-lv_max_num_threads() = ifelse(gt(num_threads(), num_cores()), num_cores(), num_threads())
-
 @generated function calc_factors(::StaticInt{nc}) where {nc}
   t = Expr(:tuple)
   for i ∈ nc:-1:1
@@ -148,10 +140,10 @@ end
 
 # if a threaded loop is vectorized, call
 @inline function choose_num_blocks(M, ::StaticInt{U}, nt) where {U}
-  _choose_num_blocks(M % UInt, StaticInt{U}(), nt, lv_max_num_threads())
+  _choose_num_blocks(M % UInt, StaticInt{U}(), nt, num_cores())
 end
 # otherwise, call
-@inline choose_num_blocks(nt, ::StaticInt{NC} = lv_max_num_threads()) where {NC} =
+@inline choose_num_blocks(nt, ::StaticInt{NC} = num_cores()) where {NC} =
   @inbounds choose_num_block_table(StaticInt{NC}())[nt]
 
 scale_cost(c) = @fastmath c * (Sys.ARCH === :x86_64 ? 0.0225 : 0.005625)
@@ -168,12 +160,15 @@ end
   NT::UInt,
   x::Base.BitInteger,
 ) where {T<:Union{Float32,Float64}}
-  min(
-    Base.fptoui(
-      UInt,
-      Base.ceil_llvm(Base.mul_float_fast(C, Base.sqrt_llvm_fast(Base.uitofp(T, x)))),
+  max(
+    min(
+      Base.fptoui(
+        UInt,
+        Base.ceil_llvm(Base.mul_float_fast(C, Base.sqrt_llvm_fast(Base.uitofp(T, x)))),
+      ),
+      NT,
     ),
-    NT,
+    one(UInt),
   )
 end
 function push_loop_length_expr!(q::Expr, ls::LoopSet)
@@ -431,9 +426,12 @@ function thread_one_loops_expr(
   if all(isstaticloop, ls.loops)
     _num_threads = _choose_num_threads(c, ntmax, Int64(looplen))::UInt
     _num_threads > 1 || return avx_body(ls, UNROLL)
-    choose_nthread = Expr(:(=), Symbol("#nthreads#"), _num_threads)
+    ntcallexpr = Expr(:call, %, Expr(:call, Threads.nthreads), UInt)
+    choose_nthread =
+      Expr(:(=), Symbol("#nthreads#"), Expr(:call, min, ntcallexpr, _num_threads))
   else
-    choose_nthread = :(_choose_num_threads($(Float32(c)), $ntmax))
+    choose_nthread =
+      :(_choose_num_threads($(Float32(c)), min(Threads.nthreads() % UInt, $ntmax)))
     push_loop_length_expr!(choose_nthread, ls)
     choose_nthread = Expr(:(=), Symbol("#nthreads#"), choose_nthread)
   end
@@ -623,9 +621,12 @@ function thread_two_loops_expr(
   if all(isstaticloop, ls.loops)
     _num_threads = _choose_num_threads(c, ntmax, Int64(looplen))::UInt
     _num_threads > 1 || return avx_body(ls, UNROLL)
-    choose_nthread = Expr(:(=), Symbol("#nthreads#"), _num_threads)
+    ntcallexpr = Expr(:call, %, Expr(:call, Threads.nthreads), UInt)
+    choose_nthread =
+      Expr(:(=), Symbol("#nthreads#"), Expr(:call, min, ntcallexpr, _num_threads))
   else
-    choose_nthread = :(_choose_num_threads($(Float32(c)), $ntmax))
+    choose_nthread =
+      :(_choose_num_threads($(Float32(c)), min(Threads.nthreads() % UInt, $ntmax)))
     push_loop_length_expr!(choose_nthread, ls)
     choose_nthread = Expr(:(=), Symbol("#nthreads#"), choose_nthread)
   end

diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl
@@ -374,8 +374,7 @@ val(x) = Expr(:call, Expr(:curly, :Val, x))
   ri = argmin(R)
   quote
     $(Expr(:meta, :inline))
-    p, li =
-      VectorizationBase.tdot(x, (vsub_nsw(getfield(i, 1), one($I)),), strides(x))
+    p, li = VectorizationBase.tdot(x, (vsub_nsw(getfield(i, 1), one($I)),), strides(x))
     ptr = gep(p, li)
     si = ArrayInterface.StrideIndex{1,$(R[ri],),$(C === 1 ? 1 : 0)}(
       (getfield(strides(x), $ri),),
@@ -572,7 +571,7 @@ end
     StaticInt{W}(),
     register_size(),
     available_registers(),
-    lv_max_num_threads(),
+    num_cores(), #FIXME
     cache_linesize(),
   )
 end
@@ -814,11 +813,12 @@ function generate_call_types(
   add_external_functions!(extra_args, ls) # extract_external_functions!
   add_outerreduct_types!(extra_args, ls) # extract_outerreduct_types!
   argcestimate = length(extra_args.args) - 1
-  for ref = ls.refs_aliasing_syms
+  for ref in ls.refs_aliasing_syms
     argcestimate += length(ref.loopedindex)
   end
   manyarg = !debug && (argcestimate > 16)
-  func = debug ? lv(:_turbo_loopset_debug) : (manyarg ? lv(:_turbo_manyarg!) : lv(:_turbo_!))
+  func =
+    debug ? lv(:_turbo_loopset_debug) : (manyarg ? lv(:_turbo_manyarg!) : lv(:_turbo_!))
   q = Expr(
     :call,
     func,
@@ -835,18 +835,12 @@ function generate_call_types(
     vargsym = gensym(:vargsym)
     push!(
       q.args,
-      Expr(:call, GlobalRef(Base, :Val), Expr(:call, GlobalRef(Base, :typeof), vargsym))
+      Expr(:call, GlobalRef(Base, :Val), Expr(:call, GlobalRef(Base, :typeof), vargsym)),
     )
     if manyarg
-      push!(
-        q.args,
-        Expr(:call, lv(:flatten_to_tuple), vargsym),
-      )
+      push!(q.args, Expr(:call, lv(:flatten_to_tuple), vargsym))
     else
-      push!(
-        q.args,
-        Expr(:(...), Expr(:call, lv(:flatten_to_tuple), vargsym)),
-      )
+      push!(q.args, Expr(:(...), Expr(:call, lv(:flatten_to_tuple), vargsym)))
     end
     Expr(:block, Expr(:(=), vargsym, Expr(:tuple, lbarg, extra_args)))
   end
@@ -943,6 +937,10 @@ for f ∈ (convert, reinterpret, trunc, unsafe_trunc, round, ceil, floor)
   @eval can_turbo(::typeof($f), ::Val{2}) = true
 end
 
+# @inline function _can_turbo(f::F, t::Vararg{Any,K}) where {F,K}
+#   Base.promote_op(f, t...) !== Union{}
+# end
+
 """
     check_turbo_safe(ls::LoopSet)
 

diff --git a/test/.JuliaFormatter.toml b/test/.JuliaFormatter.toml
@@ -0,0 +1 @@
+indent = 2
diff --git a/test/precompile/LVUser/.JuliaFormatter.toml b/test/precompile/LVUser/.JuliaFormatter.toml
@@ -0,0 +1 @@
+indent = 2