From 1af836173776f7378cd5b5479aa136cecc93ee35 Mon Sep 17 00:00:00 2001
From: Chris Elrod <elrodc@gmail.com>
Date: Tue, 26 Mar 2024 15:55:36 -0400
Subject: [PATCH] Clean JET.@report_call

---
 Project.toml               |   2 +-
 ext/ForwardDiffExt.jl      |   2 -
 ext/SpecialFunctionsExt.jl |   2 -
 src/LoopVectorization.jl   | 494 +++++++++++++++++--------------------
 src/condense_loopset.jl    |  18 +-
 test/testsetup.jl          |   5 -
 6 files changed, 243 insertions(+), 280 deletions(-)

diff --git a/Project.toml b/Project.toml
index 21fd26c38..2d028c05b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.12.168"
+version = "0.12.169"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
diff --git a/ext/ForwardDiffExt.jl b/ext/ForwardDiffExt.jl
index f9c20ec54..26227f694 100644
--- a/ext/ForwardDiffExt.jl
+++ b/ext/ForwardDiffExt.jl
@@ -1,5 +1,4 @@
 module ForwardDiffExt
-if VERSION < v"1.11-DEV"
 import ForwardDiff, ChainRulesCore
 using LoopVectorization, VectorizationBase, SLEEFPirates, ForwardDiff
 
@@ -378,4 +377,3 @@ for f in (:vmapt, :vmapnt, :vmapntt)
   end
 end
 end
-end
diff --git a/ext/SpecialFunctionsExt.jl b/ext/SpecialFunctionsExt.jl
index 2b1ed5fb5..bfd813ee4 100644
--- a/ext/SpecialFunctionsExt.jl
+++ b/ext/SpecialFunctionsExt.jl
@@ -1,8 +1,6 @@
 module SpecialFunctionsExt
-if VERSION < v"1.11-DEV"
 using SpecialFunctions
 using LoopVectorization: VectorizationBase
 using LoopVectorization: AbstractSIMD
 @inline SpecialFunctions.erf(x::AbstractSIMD) = VectorizationBase.verf(float(x))
 end
-end
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
index 34583f096..4af0f832c 100644
--- a/src/LoopVectorization.jl
+++ b/src/LoopVectorization.jl
@@ -20,288 +20,246 @@ export indices,
   vreduce,
   vcount
 
-@static if VERSION >= v"1.11-DEV"
-  macro turbo(args...)
-    quote
-      @inbounds @fastmath begin
-        $(esc(last(args)))
-      end
-    end
-  end
-  const var"@tturbo" = var"@turbo"
-  const var"@avx" = var"@turbo"
-  const var"@avxt" = var"@turbo"
-  const vmap = map
-  const vmap! = map!
-  const vmapt = map
-  const vmapt! = map!
-  const vmapnt = map
-  const vmapnt! = map!
-  const vmapntt = map
-  const vmapntt! = map!
-  const vfilter = filter
-  const vfilter! = filter!
-  const vmapreduce = mapreduce
-  const vsum = sum
-  const vreduce = reduce
-  const vcount = count
+if isdefined(Base, :Experimental) &&
+   isdefined(Base.Experimental, Symbol("@max_methods"))
+  @eval Base.Experimental.@max_methods 1 # applies `@max_methods 1` to this module; `@eval` delays expanding the macro until the guard has passed
+end
+export LowDimArray,
+  static, stridedpointer, *ˡ, _turbo_!, tanh_fast, sigmoid_fast
 
-  indices(A::AbstractArray, i::Integer) = axes(A, i)
-  function _check_axes_match(ax::Tuple)
-    fax = first(ax)
-    foreach(Base.tail(ax)) do x
-      fax == x || throw(DimensionMismatch("Axes do not match."))
-    end
-    fax
-  end
-  indices(a::Tuple, b::Tuple) = _check_axes_match(map(axes, a, b))
-  indices(a::Tuple, b::Integer) = _check_axes_match(map(Base.Fix2(axes, b), a))
-  function indices(a::AbstractArray, b::Tuple)
-    _check_axes_match(map(Base.Fix1(axes, a), b))
-  end
+using ArrayInterface: UpTri, LoTri
+using Static: StaticInt, gt, static, Zero, One, reduce_tup
+using VectorizationBase,
+  SLEEFPirates, UnPack, OffsetArrays, StaticArrayInterface
+const ArrayInterface = StaticArrayInterface
+using LayoutPointers:
+  AbstractStridedPointer,
+  StridedPointer,
+  StridedBitPointer,
+  grouped_strided_pointer,
+  stridedpointer_preserve,
+  GroupedStridedPointers
+import LayoutPointers
 
-else
-  if isdefined(Base, :Experimental) &&
-     isdefined(Base.Experimental, Symbol("@max_methods"))
-    @eval Base.Experimental.@max_methods 1
-  end
-  export LowDimArray,
-    static, stridedpointer, *ˡ, _turbo_!, tanh_fast, sigmoid_fast
+using SIMDTypes: NativeTypes
 
-  using ArrayInterface: UpTri, LoTri
-  using Static: StaticInt, gt, static, Zero, One, reduce_tup
-  using VectorizationBase,
-    SLEEFPirates, UnPack, OffsetArrays, StaticArrayInterface
-  const ArrayInterface = StaticArrayInterface
-  using LayoutPointers:
-    AbstractStridedPointer,
-    StridedPointer,
-    StridedBitPointer,
-    grouped_strided_pointer,
-    stridedpointer_preserve,
-    GroupedStridedPointers
-  import LayoutPointers
-
-  using SIMDTypes: NativeTypes
-
-  using VectorizationBase:
-    mask,
-    MM,
-    AbstractMask,
-    data,
-    AbstractSIMD,
-    vzero,
-    offsetprecalc,
-    lazymul,
-    vadd_nw,
-    vadd_nsw,
-    vadd_nuw,
-    vsub_nw,
-    vsub_nsw,
-    vsub_nuw,
-    vmul_nw,
-    vmul_nsw,
-    vmul_nuw,
-    vfmaddsub,
-    vfmsubadd,
-    vpermilps177,
-    vmovsldup,
-    vmovshdup,
-    maybestaticfirst,
-    maybestaticlast,
-    gep,
-    gesp,
-    vfmadd,
-    vfmsub,
-    vfnmadd,
-    vfnmsub,
-    vfmadd_fast,
-    vfmsub_fast,
-    vfnmadd_fast,
-    vfnmsub_fast,
-    vfmadd231,
-    vfmsub231,
-    vfnmadd231,
-    vfnmsub231,
-    vfma_fast,
-    vmuladd_fast,
-    vdiv_fast,
-    vadd_fast,
-    vsub_fast,
-    vmul_fast,
-    relu,
-    stridedpointer,
-    _vload,
-    _vstore!,
-    reduced_add,
-    reduced_prod,
-    reduce_to_add,
-    reduce_to_prod,
-    reduced_max,
-    reduced_min,
-    reduce_to_max,
-    reduce_to_min,
-    reduced_all,
-    reduced_any,
-    reduce_to_all,
-    reduce_to_any,
-    vsum,
-    vprod,
-    vmaximum,
-    vminimum,
-    vany,
-    vall,
-    Unroll,
-    VecUnroll,
-    preserve_buffer,
-    zero_vecunroll,
-    vbroadcast_vecunroll,
-    _vzero,
-    _vbroadcast,
-    contract_add,
-    collapse_add,
-    contract_mul,
-    collapse_mul,
-    contract_max,
-    collapse_max,
-    contract_min,
-    collapse_min,
-    contract_and,
-    collapse_and,
-    contract_or,
-    collapse_or,
-    max_mask,
-    maybestaticsize,
-    zero_mask
+using VectorizationBase:
+  mask,
+  MM,
+  AbstractMask,
+  data,
+  AbstractSIMD,
+  vzero,
+  offsetprecalc,
+  lazymul,
+  vadd_nw,
+  vadd_nsw,
+  vadd_nuw,
+  vsub_nw,
+  vsub_nsw,
+  vsub_nuw,
+  vmul_nw,
+  vmul_nsw,
+  vmul_nuw,
+  vfmaddsub,
+  vfmsubadd,
+  vpermilps177,
+  vmovsldup,
+  vmovshdup,
+  maybestaticfirst,
+  maybestaticlast,
+  gep,
+  gesp,
+  vfmadd,
+  vfmsub,
+  vfnmadd,
+  vfnmsub,
+  vfmadd_fast,
+  vfmsub_fast,
+  vfnmadd_fast,
+  vfnmsub_fast,
+  vfmadd231,
+  vfmsub231,
+  vfnmadd231,
+  vfnmsub231,
+  vfma_fast,
+  vmuladd_fast,
+  vdiv_fast,
+  vadd_fast,
+  vsub_fast,
+  vmul_fast,
+  relu,
+  stridedpointer,
+  _vload,
+  _vstore!,
+  reduced_add,
+  reduced_prod,
+  reduce_to_add,
+  reduce_to_prod,
+  reduced_max,
+  reduced_min,
+  reduce_to_max,
+  reduce_to_min,
+  reduced_all,
+  reduced_any,
+  reduce_to_all,
+  reduce_to_any,
+  vsum,
+  vprod,
+  vmaximum,
+  vminimum,
+  vany,
+  vall,
+  Unroll,
+  VecUnroll,
+  preserve_buffer,
+  zero_vecunroll,
+  vbroadcast_vecunroll,
+  _vzero,
+  _vbroadcast,
+  contract_add,
+  collapse_add,
+  contract_mul,
+  collapse_mul,
+  contract_max,
+  collapse_max,
+  contract_min,
+  collapse_min,
+  contract_and,
+  collapse_and,
+  contract_or,
+  collapse_or,
+  max_mask,
+  maybestaticsize,
+  zero_mask
 
-  using HostCPUFeatures:
-    pick_vector_width,
-    register_size,
-    register_count,
-    has_opmask_registers,
-    unwrap,
-    get_cpu_name
-  using CPUSummary: num_cores, cache_linesize, cache_size
+using HostCPUFeatures:
+  pick_vector_width,
+  register_size,
+  register_count,
+  has_opmask_registers,
+  unwrap,
+  get_cpu_name
+using CPUSummary: num_cores, cache_linesize, cache_size
 
-  using IfElse: ifelse
+using IfElse: ifelse
 
-  using ThreadingUtilities, PolyesterWeave
-  using Base.Broadcast: Broadcasted, DefaultArrayStyle
-  using LinearAlgebra: Adjoint, Transpose, Diagonal
-  using Base.Meta: isexpr
-  using DocStringExtensions
-  import LinearAlgebra # for check_args
+using ThreadingUtilities, PolyesterWeave
+using Base.Broadcast: Broadcasted, DefaultArrayStyle
+using LinearAlgebra: Adjoint, Transpose, Diagonal
+using Base.Meta: isexpr
+using DocStringExtensions
+import LinearAlgebra # for check_args
 
-  using Base: unsafe_trunc
+using Base: unsafe_trunc
 
-  using Base.FastMath:
-    add_fast,
-    sub_fast,
-    mul_fast,
-    div_fast,
-    inv_fast,
-    abs2_fast,
-    rem_fast,
-    max_fast,
-    min_fast,
-    pow_fast,
-    sqrt_fast
-  using SLEEFPirates:
-    log_fast,
-    log2_fast,
-    log10_fast,
-    pow,
-    sin_fast,
-    cos_fast,
-    sincos_fast,
-    tan_fast
+using Base.FastMath:
+  add_fast,
+  sub_fast,
+  mul_fast,
+  div_fast,
+  inv_fast,
+  abs2_fast,
+  rem_fast,
+  max_fast,
+  min_fast,
+  pow_fast,
+  sqrt_fast
+using SLEEFPirates:
+  log_fast,
+  log2_fast,
+  log10_fast,
+  pow,
+  sin_fast,
+  cos_fast,
+  sincos_fast,
+  tan_fast
 
-  using StaticArrayInterface:
-    OptionallyStaticUnitRange,
-    OptionallyStaticRange,
-    StaticBool,
-    True,
-    False,
-    indices,
-    static_strides,
-    offsets,
-    static_size,
-    static_axes,
-    StrideIndex
-  using CloseOpenIntervals: AbstractCloseOpen, CloseOpen#, SafeCloseOpen
-  # @static if VERSION ≥ v"1.6.0-rc1" #TODO: delete `else` when dropping 1.5 support
-  # using ArrayInterface: static_step
-  # else # Julia 1.5 did not define `step` on CartesianIndices
-  @inline static_step(x) = ArrayInterface.static_step(x)
-  @inline static_step(x::CartesianIndices) =
-    VectorizationBase.CartesianVIndex(map(static_step, x.indices))
-  # end
+using StaticArrayInterface:
+  OptionallyStaticUnitRange,
+  OptionallyStaticRange,
+  StaticBool,
+  True,
+  False,
+  indices,
+  static_strides,
+  offsets,
+  static_size,
+  static_axes,
+  StrideIndex
+using CloseOpenIntervals: AbstractCloseOpen, CloseOpen#, SafeCloseOpen
+# @static if VERSION ≥ v"1.6.0-rc1" #TODO: delete `else` when dropping 1.5 support
+# using ArrayInterface: static_step
+# else # Julia 1.5 did not define `step` on CartesianIndices
+@inline static_step(x) = ArrayInterface.static_step(x) # generic fallback: forward to ArrayInterface (aliased to StaticArrayInterface in this module)
+@inline static_step(x::CartesianIndices) = # CartesianIndices: take the step of each index range and wrap the tuple in a CartesianVIndex
+  VectorizationBase.CartesianVIndex(map(static_step, x.indices))
+# end
 
-  const VECTORWIDTHSYMBOL, ELTYPESYMBOL, MASKSYMBOL =
-    Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##"), Symbol("##mask##")
+const VECTORWIDTHSYMBOL, ELTYPESYMBOL, MASKSYMBOL =
+  Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##"), Symbol("##mask##")
 
-  include("vectorizationbase_compat/contract_pass.jl")
-  include("vectorizationbase_compat/subsetview.jl")
-  include("getconstindexes.jl")
-  include("predicates.jl")
-  include("simdfunctionals/map.jl")
-  include("simdfunctionals/filter.jl")
-  include("modeling/costs.jl")
-  include("modeling/operations.jl")
-  include("modeling/graphs.jl")
-  include("codegen/operation_evaluation_order.jl")
-  include("parse/memory_ops_common.jl")
-  include("parse/add_loads.jl")
-  include("parse/add_stores.jl")
-  include("parse/add_compute.jl")
-  include("parse/add_constants.jl")
-  include("parse/add_ifelse.jl")
-  include("modeling/determinestrategy.jl")
-  include("codegen/line_number_nodes.jl")
-  include("codegen/loopstartstopmanager.jl")
-  include("codegen/lower_compute.jl")
-  include("codegen/lower_constant.jl")
-  include("codegen/lower_memory_common.jl")
-  include("codegen/lower_load.jl")
-  include("codegen/lower_store.jl")
-  include("codegen/lowering.jl")
-  include("codegen/split_loops.jl")
-  include("codegen/lower_threads.jl")
-  include("condense_loopset.jl")
-  include("transforms.jl")
-  include("reconstruct_loopset.jl")
-  include("constructors.jl")
-  include("user_api_conveniences.jl")
-  include("simdfunctionals/mapreduce.jl")
-  include("simdfunctionals/count.jl")
-  include("broadcast.jl")
+include("vectorizationbase_compat/contract_pass.jl")
+include("vectorizationbase_compat/subsetview.jl")
+include("getconstindexes.jl")
+include("predicates.jl")
+include("simdfunctionals/map.jl")
+include("simdfunctionals/filter.jl")
+include("modeling/costs.jl")
+include("modeling/operations.jl")
+include("modeling/graphs.jl")
+include("codegen/operation_evaluation_order.jl")
+include("parse/memory_ops_common.jl")
+include("parse/add_loads.jl")
+include("parse/add_stores.jl")
+include("parse/add_compute.jl")
+include("parse/add_constants.jl")
+include("parse/add_ifelse.jl")
+include("modeling/determinestrategy.jl")
+include("codegen/line_number_nodes.jl")
+include("codegen/loopstartstopmanager.jl")
+include("codegen/lower_compute.jl")
+include("codegen/lower_constant.jl")
+include("codegen/lower_memory_common.jl")
+include("codegen/lower_load.jl")
+include("codegen/lower_store.jl")
+include("codegen/lowering.jl")
+include("codegen/split_loops.jl")
+include("codegen/lower_threads.jl")
+include("condense_loopset.jl")
+include("transforms.jl")
+include("reconstruct_loopset.jl")
+include("constructors.jl")
+include("user_api_conveniences.jl")
+include("simdfunctionals/mapreduce.jl")
+include("simdfunctionals/count.jl")
+include("broadcast.jl")
 
-  """
-  LoopVectorization provides macros and functions that combine SIMD vectorization and
-  loop-reordering so as to improve performance:
+"""
+LoopVectorization provides macros and functions that combine SIMD vectorization and
+loop-reordering so as to improve performance:
 
-    - [`@turbo`](@ref): transform `for`-loops and broadcasting
-    - [`vmapreduce`](@ref): vectorized version of `mapreduce`
-    - [`vreduce`](@ref): vectorized version of `reduce`
-    - [`vsum`](@ref): vectorized version of `sum`
-    - [`vmap`](@ref) and `vmap!`: vectorized version of `map` and `map!`
-    - [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!`
-    - [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!`
-    - [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!`
-  """
-  LoopVectorization
+  - [`@turbo`](@ref): transform `for`-loops and broadcasting
+  - [`vmapreduce`](@ref): vectorized version of `mapreduce`
+  - [`vreduce`](@ref): vectorized version of `reduce`
+  - [`vsum`](@ref): vectorized version of `sum`
+  - [`vmap`](@ref) and `vmap!`: vectorized version of `map` and `map!`
+  - [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!`
+  - [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!`
+  - [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!`
+"""
+LoopVectorization
 
-  include("precompile.jl")
-  # _precompile_()
+include("precompile.jl")
+# _precompile_()
 
-  # _vreduce(+, Float64[1.0])
-  # matmul_params(64, 32, 64)
+# _vreduce(+, Float64[1.0])
+# matmul_params(64, 32, 64)
 
-  # import ChainRulesCore, ForwardDiff
-  # include("vmap_grad.jl")
-  if !isdefined(Base, :get_extension)
-    include("../ext/ForwardDiffExt.jl")
-    include("../ext/SpecialFunctionsExt.jl")
-  end
-end # if VERSION
+# import ChainRulesCore, ForwardDiff
+# include("vmap_grad.jl")
+if !isdefined(Base, :get_extension) # `Base.get_extension` is not defined on this Julia, so include the ext/ files as ordinary source
+  include("../ext/ForwardDiffExt.jl")
+  include("../ext/SpecialFunctionsExt.jl")
+end
 end # module
diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl
index fd0411e8a..45b7aa68b 100644
--- a/src/condense_loopset.jl
+++ b/src/condense_loopset.jl
@@ -369,9 +369,19 @@ function add_external_functions!(q::Expr, ls::LoopSet)
   end
 end
 
+# _any(f, x::Tuple{}) = false
+# _any(f, x::Tuple{T}) where {T} = f(@inbounds x[1])::Bool
+# _any(f, x::Tuple{T0,T1,Vararg}) where {T0,T1} = f(@inbounds x[1])::Bool || _any(f,Base.tail(x))::Bool
+# ^ generic variant, left disabled. `_any_empty(x)` below returns true iff some element of the tuple `x` is empty, recursing on the tuple's tail.
+_any_empty(x::Tuple{}) = false # empty tuple: there is no element that could be empty
+_any_empty(x::Tuple{T}) where {T} = isempty(@inbounds x[1])::Bool # exactly one element
+_any_empty(x::Tuple{T0,T1,Vararg}) where {T0,T1} = # two or more elements: test the head, then recurse on the tail
+  isempty(@inbounds x[1])::Bool || _any_empty(Base.tail(x))::Bool
+
 function check_if_empty(ls::LoopSet, q::Expr)
   lb = loop_boundaries(ls, fill(false, length(ls.loops)))
-  Expr(:if, Expr(:call, :!, Expr(:call, :any, :isempty, lb)), q)
+  # Builds `if !(_any_empty(lb)) q end`; `_any_empty` is placed in the call as the function object itself, not as a Symbol.
+  Expr(:if, Expr(:call, :!, Expr(:call, _any_empty, lb)), q)
 end
 
 val(x) = Expr(:call, Expr(:curly, :Val, x))
@@ -1164,5 +1174,9 @@ function setup_call(
   end
   pushprepreamble!(ls, Expr(:if, call_check, call, argfailure))
   prepend_lnns!(ls.prepreamble, lnns)
-  return ls.prepreamble
+  return quote
+    let
+      $(ls.prepreamble)
+    end
+  end
 end
diff --git a/test/testsetup.jl b/test/testsetup.jl
index df27ccaab..b60c20220 100644
--- a/test/testsetup.jl
+++ b/test/testsetup.jl
@@ -2,12 +2,7 @@ using Test
 using Pkg
 using LoopVectorization
 
-if VERSION >= v"1.11-DEV"
-const var"@_avx" = LoopVectorization.var"@turbo"
-else
 const var"@_avx" = LoopVectorization.var"@_turbo"
-end
-
 
 using LinearAlgebra
 function clenshaw(x, coeff)