From da1161b9806d630d989ecb8c2162cf3522b69ee9 Mon Sep 17 00:00:00 2001
From: Chris Elrod <elrodc@gmail.com>
Date: Tue, 10 Jan 2023 15:04:41 -0500
Subject: [PATCH] format

---
 .JuliaFormatter.toml                          |   8 +-
 benchmark/benchmarkflops.jl                   |   7 +-
 benchmark/benchmarks.jl                       |   3 +-
 benchmark/driver.jl                           |  38 +-
 benchmark/loadsharedlibs.jl                   | 303 +++++++----
 benchmark/looptests.jl                        |  55 +-
 benchmark/plotbenchmarks.jl                   |  42 +-
 docs/make.jl                                  |  10 +-
 src/LoopVectorization.jl                      |  31 +-
 src/broadcast.jl                              | 264 +++++++---
 src/codegen/line_number_nodes.jl              |  15 +-
 src/codegen/loopstartstopmanager.jl           | 240 ++++++---
 src/codegen/lower_compute.jl                  | 168 ++++--
 src/codegen/lower_constant.jl                 |  68 ++-
 src/codegen/lower_load.jl                     | 161 ++++--
 src/codegen/lower_memory_common.jl            | 217 ++++++--
 src/codegen/lower_store.jl                    |  86 +++-
 src/codegen/lower_threads.jl                  | 339 ++++++++----
 src/codegen/lowering.jl                       | 393 ++++++++++----
 src/codegen/operation_evaluation_order.jl     |  53 +-
 src/codegen/split_loops.jl                    |  76 ++-
 src/condense_loopset.jl                       | 223 +++++---
 src/constructors.jl                           | 131 ++++-
 src/getconstindexes.jl                        |   2 +-
 src/modeling/costs.jl                         |  96 ++--
 src/modeling/determinestrategy.jl             | 223 +++++---
 src/modeling/graphs.jl                        | 485 ++++++++++++------
 src/modeling/operations.jl                    | 104 ++--
 src/parse/add_compute.jl                      | 164 ++++--
 src/parse/add_constants.jl                    |  44 +-
 src/parse/add_ifelse.jl                       |  89 +++-
 src/parse/add_loads.jl                        |  36 +-
 src/parse/add_stores.jl                       |  36 +-
 src/parse/memory_ops_common.jl                | 263 +++++++---
 src/predicates.jl                             |   3 +-
 src/reconstruct_loopset.jl                    | 371 ++++++++++----
 src/simdfunctionals/filter.jl                 |  25 +-
 src/simdfunctionals/map.jl                    | 145 ++++--
 src/simdfunctionals/mapreduce.jl              |  45 +-
 src/simdfunctionals/vmap_grad_forwarddiff.jl  |   2 +-
 src/simdfunctionals/vmap_grad_rrule.jl        |  17 +-
 src/transforms.jl                             |  38 +-
 src/user_api_conveniences.jl                  |  40 +-
 src/vectorizationbase_compat/contract_pass.jl |  56 +-
 src/vectorizationbase_compat/subsetview.jl    |  31 +-
 test/manyarrayrefs.jl                         |  30 +-
 utils/generate_costs.jl                       |  36 +-
 utils/generate_precompiles.jl                 |  16 +-
 48 files changed, 3698 insertions(+), 1630 deletions(-)

diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml
index 5c2cf2f06..b74ff2c91 100644
--- a/.JuliaFormatter.toml
+++ b/.JuliaFormatter.toml
@@ -1 +1,7 @@
-indent = 2
\ No newline at end of file
+indent = 2
+margin = 80
+remove_extra_newlines = true
+long_to_short_function_def = true
+format_docstrings = true
+trailing_comma = false
+separate_kwargs_with_semicolon = true
diff --git a/benchmark/benchmarkflops.jl b/benchmark/benchmarkflops.jl
index cbaf4fde2..8831931b3 100644
--- a/benchmark/benchmarkflops.jl
+++ b/benchmark/benchmarkflops.jl
@@ -28,8 +28,8 @@ function Base.vcat(br1::BenchmarkResult, br2::BenchmarkResult)
     br1.tests,
     SizedResults(
       hcat(br1.sizedresults.results, br2.sizedresults.results),
-      vcat(br1.sizedresults.sizes, br2.sizedresults.sizes),
-    ),
+      vcat(br1.sizedresults.sizes, br2.sizedresults.sizes)
+    )
   )
 end
 
@@ -119,7 +119,6 @@ function At_mul_Bt_bench!(br, s, i)
   matmul_bench!(br, C, A, B, i)
 end
 
-
 function dot_bench!(br, s, i)
   a = rand(s)
   b = rand(s)
@@ -440,7 +439,6 @@ function logdettriangle_bench!(br, s, i)
   br[5+2INTEL_BENCH, i] = n_gflop / @belapsed logdet($U)
 end
 
-
 function filter2d_bench_run!(br, s, i, K)
   A = rand(s + 2, s + 2)
   B = OffsetArray(similar(A, (s, s)), 1, 1)
@@ -463,7 +461,6 @@ function filter2d_bench_run!(br, s, i, K)
   end
 end
 
-
 function filter2dunrolled_bench_run!(br, s, i, K)
   A = rand(s + 2, s + 2)
   B = OffsetArray(similar(A, (s, s)), 1, 1)
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
index 4cd95d96f..715703739 100644
--- a/benchmark/benchmarks.jl
+++ b/benchmark/benchmarks.jl
@@ -14,7 +14,8 @@ for n ∈ 1:64
   B = rand(n, n)
   C = Matrix{Float64}(undef, n, n)
   SUITE["linalg"]["matmul"]["AmulB", n] = @benchmarkable gemmavx!($C, $A, $B)
-  SUITE["linalg"]["matmul"]["A′mulB", n] = @benchmarkable jAtmulBavx!($C, $A′, $B)
+  SUITE["linalg"]["matmul"]["A′mulB", n] =
+    @benchmarkable jAtmulBavx!($C, $A′, $B)
   x = rand(n)
   y = rand(n)
   SUITE["linalg"]["dot"]["dot", n] = @benchmarkable jdotavx($x, $y)
diff --git a/benchmark/driver.jl b/benchmark/driver.jl
index ad605982d..cbcaa8286 100644
--- a/benchmark/driver.jl
+++ b/benchmark/driver.jl
@@ -7,21 +7,26 @@ const LOOPVECBENCHDIR = joinpath(pkgdir(LoopVectorization), "benchmark")
 include(joinpath(LOOPVECBENCHDIR, "benchmarkflops.jl"))
 include(joinpath(LOOPVECBENCHDIR, "plotbenchmarks.jl"))
 
-
 nprocs_to_add() = ((Sys.CPU_THREADS)::Int >> 1)
 # nprocs_to_add() = ((Sys.CPU_THREADS)::Int >> 1) - 1
-start_worker(wid) = remotecall(include, wid, joinpath(LOOPVECBENCHDIR, "setup_worker.jl"))
+start_worker(wid) =
+  remotecall(include, wid, joinpath(LOOPVECBENCHDIR, "setup_worker.jl"))
 function start_workers(nprocs = nprocs_to_add())
-  addprocs(nprocs, exeflags = "--project=$(Base.active_project())")
+  addprocs(nprocs; exeflags = "--project=$(Base.active_project())")
   foreach(wait, map(start_worker, workers()))
 end
 stop_workers() = rmprocs(workers())
 
-
 function blastests()
   tests = ["LoopVectorization", "Julia", "Clang", "GFortran"]
   INTEL_BENCH && push!(tests, "icc", "ifort")
-  push!(tests, "g++ & Eigen-3", "clang++ & Eigen-3", "GFortran-builtin", "OpenBLAS")
+  push!(
+    tests,
+    "g++ & Eigen-3",
+    "clang++ & Eigen-3",
+    "GFortran-builtin",
+    "OpenBLAS"
+  )
   INTEL_BENCH && push!(tests, "ifort-builtin")
   MKL_BENCH && push!(tests, "MKL")
   tests
@@ -166,7 +171,10 @@ function benchmark_random_access(sizes)
   INTEL_BENCH && push!(tests, "icc", "ifort")
   start_workers()
   sm = SharedMatrix(Matrix{Float64}(undef, length(tests), length(sizes)))
-  @showprogress pmap(is -> randomaccess_bench!(sm, is[2], is[1]), enumerate(sizes))
+  @showprogress pmap(
+    is -> randomaccess_bench!(sm, is[2], is[1]),
+    enumerate(sizes)
+  )
   br = BenchmarkResult(Matrix(sm), tests, sizes)
   stop_workers()
   br
@@ -178,7 +186,10 @@ function benchmark_logdettriangle(sizes)
   push!(tests, "LinearAlgebra")
   start_workers()
   sm = SharedMatrix(Matrix{Float64}(undef, length(tests), length(sizes)))
-  @showprogress pmap(is -> logdettriangle_bench!(sm, is[2], is[1]), enumerate(sizes))
+  @showprogress pmap(
+    is -> logdettriangle_bench!(sm, is[2], is[1]),
+    enumerate(sizes)
+  )
   br = BenchmarkResult(Matrix(sm), tests, sizes)
   stop_workers()
   br
@@ -188,7 +199,10 @@ function benchmark_filter2d(sizes, K)
   INTEL_BENCH && push!(tests, "icc", "ifort")
   start_workers()
   sm = SharedMatrix(Matrix{Float64}(undef, length(tests), length(sizes)))
-  @showprogress pmap(is -> filter2d_bench_run!(sm, is[2], is[1], K), enumerate(sizes))
+  @showprogress pmap(
+    is -> filter2d_bench_run!(sm, is[2], is[1], K),
+    enumerate(sizes)
+  )
   br = BenchmarkResult(Matrix(sm), tests, sizes)
   stop_workers()
   br
@@ -209,15 +223,13 @@ function benchmark_filter2dunrolled(sizes)
   K = SizedOffsetMatrix{Float64,-1,1,-1,1}(rand(3, 3))
   @showprogress pmap(
     is -> filter2dunrolled_bench_run!(sm, is[2], is[1], K),
-    enumerate(sizes),
+    enumerate(sizes)
   )
   br = BenchmarkResult(Matrix(sm), tests, sizes)
   stop_workers()
   br
 end
 
-
-
 # sizes = 23:23
 sizes = 256:-1:2
 longsizes = 1024:-1:2
@@ -287,7 +299,8 @@ const v = 2
 # using Cairo, Fontconfig
 const PICTURES = joinpath(pkgdir(LoopVectorization), "docs", "src", "assets")
 # saveplot(f, br) = draw(PNG(joinpath(PICTURES, f * "$v.png"), 12inch, 8inch), plot(br))
-saveplot(f, br) = draw(SVG(joinpath(PICTURES, f * "$v.svg"), 12inch, 8inch), plot(br))
+saveplot(f, br) =
+  draw(SVG(joinpath(PICTURES, f * "$v.svg"), 12inch, 8inch), plot(br))
 
 # If only rerunning a few, remove them from load.
 # @load "benchmarkresults.jld2" logdettriangle_bench filter2d_dynamic_bench filter2d_3x3_bench filter2d_unrolled_bench dot_bench selfdot_bench dot3_bench sse_bench aplusBc_bench AplusAt_bench vexp_bench randomaccess_bench AmulB_bench AmulBt_bench AtmulB_bench AtmulBt_bench Amulvb_bench Atmulvb_bench
@@ -305,7 +318,6 @@ saveplot("bench_AtmulBt_v", AtmulBt_bench);
 saveplot("bench_Amulvb_v", Amulvb_bench);
 saveplot("bench_Atmulvb_v", Atmulvb_bench);
 
-
 saveplot("bench_logdettriangle_v", logdettriangle_bench);
 saveplot("bench_filter2d_dynamic_v", filter2d_dynamic_bench);
 saveplot("bench_filter2d_3x3_v", filter2d_3x3_bench);
diff --git a/benchmark/loadsharedlibs.jl b/benchmark/loadsharedlibs.jl
index e54bd29eb..65656b7df 100644
--- a/benchmark/loadsharedlibs.jl
+++ b/benchmark/loadsharedlibs.jl
@@ -12,17 +12,16 @@ const LIBIFTEST = joinpath(LOOPVECBENCHDIR, "libiftests.so")
 const LIBEIGENTEST = joinpath(LOOPVECBENCHDIR, "libetest.so")
 const LIBIEIGENTEST = joinpath(LOOPVECBENCHDIR, "libietest.so")
 
-
 # requires Clang with polly to build
 cfile = joinpath(LOOPVECBENCHDIR, "looptests.c")
 if !isfile(LIBCTEST) || mtime(cfile) > mtime(LIBCTEST)
   if (Sys.ARCH === :aarch64) && Sys.isapple() # assume no `-march=native` support
     run(
-      `clang -Ofast -mprefer-vector-width=$(8REGISTER_SIZE) -lm -shared -fPIC $cfile -o $LIBCTEST`,
+      `clang -Ofast -mprefer-vector-width=$(8REGISTER_SIZE) -lm -shared -fPIC $cfile -o $LIBCTEST`
     )
   else
     run(
-      `clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -shared -fPIC $cfile -o $LIBCTEST`,
+      `clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -shared -fPIC $cfile -o $LIBCTEST`
     )
   end
 
@@ -33,11 +32,11 @@ if !isfile(LIBFTEST) || mtime(ffile) > mtime(LIBFTEST)
   # --param max-unroll-times defaults to ≥8, which is generally excessive
   if (Sys.ARCH === :x86_64)
     run(
-      `gfortran -Ofast -march=native -funroll-loops -mprefer-vector-width=$(8REGISTER_SIZE) -fvariable-expansion-in-unroller --param max-variable-expansions-in-unroller=4 -shared -fPIC $ffile -o $LIBFTEST`,
+      `gfortran -Ofast -march=native -funroll-loops -mprefer-vector-width=$(8REGISTER_SIZE) -fvariable-expansion-in-unroller --param max-variable-expansions-in-unroller=4 -shared -fPIC $ffile -o $LIBFTEST`
     )
   else
     run(
-      `gfortran -Ofast -march=native -funroll-loops -fvariable-expansion-in-unroller --param max-variable-expansions-in-unroller=4 -shared -fPIC $ffile -o $LIBFTEST`,
+      `gfortran -Ofast -march=native -funroll-loops -fvariable-expansion-in-unroller --param max-variable-expansions-in-unroller=4 -shared -fPIC $ffile -o $LIBFTEST`
     )
   end
   # run(`gfortran -Ofast -march=native -funroll-loops -floop-nest-optimize -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $ffile -o $LIBFTEST`)
@@ -46,12 +45,12 @@ end
 const INTEL_BENCH = try
   if !isfile(LIBIFTEST) || mtime(ffile) > mtime(LIBIFTEST)
     run(
-      `ifort -fast -qopt-zmm-usage=high -qoverride-limits -shared -fPIC $ffile -o $LIBIFTEST`,
+      `ifort -fast -qopt-zmm-usage=high -qoverride-limits -shared -fPIC $ffile -o $LIBIFTEST`
     )
   end
   if !isfile(LIBICTEST) || mtime(cfile) > mtime(LIBICTEST)
     run(
-      `icc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -shared -fPIC $cfile -o $LIBICTEST`,
+      `icc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -shared -fPIC $cfile -o $LIBICTEST`
     )
   end
   true
@@ -65,15 +64,15 @@ if !isfile(LIBEIGENTEST) || mtime(eigenfile) > mtime(LIBEIGENTEST)
   # Clang seems to have trouble finding includes
   if Bool(LoopVectorization.VectorizationBase.has_feature(Val(:x86_64_avx512f)))
     run(
-      `g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`,
+      `g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`
     )
   elseif (Sys.ARCH === :aarch64) && Sys.isapple() # assume homebrew
     run(
-      `g++-10 -O3 -march=native -I/opt/homebrew/Cellar/eigen/3.3.9/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`,
+      `g++-10 -O3 -march=native -I/opt/homebrew/Cellar/eigen/3.3.9/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`
     )
   else
     run(
-      `g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`,
+      `g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`
     )
   end
 end
@@ -81,15 +80,15 @@ if !isfile(LIBIEIGENTEST) || mtime(eigenfile) > mtime(LIBIEIGENTEST)
   # run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/c++/9 -I/usr/include/c++/9/x86_64-generic-linux -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
   if Bool(LoopVectorization.VectorizationBase.has_feature(Val(:x86_64_avx512f)))
     run(
-      `clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`,
+      `clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`
     )
   elseif (Sys.ARCH === :aarch64) && Sys.isapple() # assume homebrew and no `-march=native`
     run(
-      `clang++ -Ofast -I/opt/homebrew/Cellar/eigen/3.3.9/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`,
+      `clang++ -Ofast -I/opt/homebrew/Cellar/eigen/3.3.9/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`
     )
   else
     run(
-      `clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`,
+      `clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`
     )
   end
   # run(`icpc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
@@ -104,7 +103,6 @@ end
 #     # run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
 # end
 
-
 randa(::Type{T}, dim...) where {T} = rand(T, dim...)
 randa(::Type{T}, dim...) where {T<:Signed} = rand(T(-100):T(200), dim...)
 
@@ -127,7 +125,8 @@ const libOpenBLAS = Libdl.dlopen(OpenBLAS_jll.libopenblas)
 const DGEMM_OpenBLAS = Libdl.dlsym(libOpenBLAS, :dgemm_64_)
 const SGEMM_OpenBLAS = Libdl.dlsym(libOpenBLAS, :sgemm_64_)
 const DGEMV_OpenBLAS = Libdl.dlsym(libOpenBLAS, :dgemv_64_)
-const OPENBLAS_SET_NUM_THREADS = Libdl.dlsym(libOpenBLAS, :openblas_set_num_threads64_)
+const OPENBLAS_SET_NUM_THREADS =
+  Libdl.dlsym(libOpenBLAS, :openblas_set_num_threads64_)
 
 istransposed(x) = 'N'
 istransposed(x::Adjoint{<:Real}) = 'T'
@@ -137,7 +136,11 @@ for (lib, f) ∈ [(:GEMM_MKL, :gemmmkl!), (:GEMM_OpenBLAS, :gemmopenblas!)]
   for (T, prefix) ∈ [(Float32, :S), (Float64, :D)]
     fm = Symbol(prefix, lib)
     @eval begin
-      function $f(C::AbstractMatrix{$T}, A::AbstractMatrix{$T}, B::AbstractMatrix{$T})
+      function $f(
+        C::AbstractMatrix{$T},
+        A::AbstractMatrix{$T},
+        B::AbstractMatrix{$T}
+      )
         transA = istransposed(A)
         transB = istransposed(B)
         M, N = size(C)
@@ -165,7 +168,7 @@ for (lib, f) ∈ [(:GEMM_MKL, :gemmmkl!), (:GEMM_OpenBLAS, :gemmopenblas!)]
             Ref{Int64},
             Ref{$T},
             Ref{$T},
-            Ref{Int64},
+            Ref{Int64}
           ),
           transA,
           transB,
@@ -179,17 +182,19 @@ for (lib, f) ∈ [(:GEMM_MKL, :gemmmkl!), (:GEMM_OpenBLAS, :gemmopenblas!)]
           ldB,
           β,
           C,
-          ldC,
+          ldC
         )
       end
     end
   end
 end
 if MKL_BENCH
-  mkl_set_num_threads(N::Integer) = ccall(MKL_SET_NUM_THREADS, Cvoid, (Int32,), N % Int32)
+  mkl_set_num_threads(N::Integer) =
+    ccall(MKL_SET_NUM_THREADS, Cvoid, (Int32,), N % Int32)
   mkl_set_num_threads(1)
 end
-openblas_set_num_threads(N::Integer) = ccall(OPENBLAS_SET_NUM_THREADS, Cvoid, (Int64,), N)
+openblas_set_num_threads(N::Integer) =
+  ccall(OPENBLAS_SET_NUM_THREADS, Cvoid, (Int64,), N)
 openblas_set_num_threads(1)
 
 function dgemvmkl!(
@@ -197,7 +202,7 @@ function dgemvmkl!(
   A::AbstractMatrix{Float64},
   x::AbstractVector{Float64},
   α = 1.0,
-  β = 0.0,
+  β = 0.0
 )
   transA = istransposed(A)
   pA = parent(A)
@@ -219,7 +224,7 @@ function dgemvmkl!(
       Ref{Int64},
       Ref{Float64},
       Ref{Float64},
-      Ref{Int64},
+      Ref{Int64}
     ),
     transA,
     M,
@@ -231,13 +236,13 @@ function dgemvmkl!(
     incx,
     β,
     y,
-    incy,
+    incy
   )
 end
 function dgemvopenblas!(
   y::AbstractVector{Float64},
   A::AbstractMatrix{Float64},
-  x::AbstractVector{Float64},
+  x::AbstractVector{Float64}
 )
   transA = istransposed(A)
   pA = parent(A)
@@ -261,7 +266,7 @@ function dgemvopenblas!(
       Ref{Int64},
       Ref{Float64},
       Ref{Float64},
-      Ref{Int64},
+      Ref{Int64}
     ),
     transA,
     M,
@@ -273,11 +278,10 @@ function dgemvopenblas!(
     incx,
     β,
     y,
-    incy,
+    incy
   )
 end
 
-
 for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST))
   @eval function $(Symbol(prefix, :egemm!))(C, A, B)
     M, N = size(C)
@@ -291,7 +295,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST))
       B,
       M,
       K,
-      N,
+      N
     )
   end
   let (p, s) = (:e, Eshared)
@@ -307,7 +311,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST))
         B,
         M,
         K,
-        N,
+        N
       )
     end
     @eval function $(Symbol(prefix, p, :gemm!))(C, A, B::Adjoint)
@@ -322,7 +326,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST))
         parent(B),
         M,
         K,
-        N,
+        N
       )
     end
     @eval function $(Symbol(prefix, p, :gemm!))(C, A::Adjoint, B::Adjoint)
@@ -337,7 +341,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST))
         parent(B),
         M,
         K,
-        N,
+        N
       )
     end
     @eval function $(Symbol(prefix, p, :dot))(a, b)
@@ -354,7 +358,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST))
         A,
         x,
         M,
-        K,
+        K
       )
     end
     @eval function $(Symbol(prefix, p, :selfdot))(a)
@@ -371,7 +375,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST))
         A,
         y,
         M,
-        N,
+        N
       )
     end
     @eval function $(Symbol(prefix, p, :gemv!))(y, A::Adjoint, x)
@@ -384,7 +388,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST))
         parent(A),
         x,
         M,
-        K,
+        K
       )
     end
     @eval function $(Symbol(prefix, p, :aplusBc!))(D, a, B, c)
@@ -398,7 +402,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST))
         B,
         c,
         M,
-        K,
+        K
       )
     end
     @eval function $(Symbol(prefix, p, :OLSlp))(y, X, β)
@@ -411,7 +415,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST))
         X,
         β,
         N,
-        P,
+        P
       )
     end
     @eval function $(Symbol(prefix, p, :AplusAt!))(B, A)
@@ -419,7 +423,7 @@ for (prefix, Eshared) ∈ ((Symbol(""), LIBEIGENTEST), (:i, LIBIEIGENTEST))
       ccall((:AplusAt, $s), Cvoid, (Ptr{Float64}, Ptr{Float64}, Clong), B, A, N)
     end
     @eval function $(Symbol(prefix, p, :logdettriangle))(
-      T::Union{LowerTriangular,UpperTriangular},
+      T::Union{LowerTriangular,UpperTriangular}
     )
       N = size(T, 1)
       Tp = parent(T)
@@ -447,7 +451,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
         B,
         M,
         K,
-        N,
+        N
       )
     end
     @eval function $(Symbol(prefix, :f, gemm, :!))(C, A, B)
@@ -456,31 +460,47 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       ccall(
         ($(QuoteNode(gemm)), $Fshared),
         Cvoid,
-        (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
+        (
+          Ptr{Float64},
+          Ptr{Float64},
+          Ptr{Float64},
+          Ref{Clong},
+          Ref{Clong},
+          Ref{Clong}
+        ),
         C,
         A,
         B,
         Ref(M),
         Ref(K),
-        Ref(N),
+        Ref(N)
       )
     end
   end
-  @eval $(Symbol(prefix, :cgemm!))(C, A, B) = $(Symbol(prefix, :cgemm_nkm!))(C, A, B)
-  @eval $(Symbol(prefix, :fgemm!))(C, A, B) = $(Symbol(prefix, :fgemm_nkm!))(C, A, B)
+  @eval $(Symbol(prefix, :cgemm!))(C, A, B) =
+    $(Symbol(prefix, :cgemm_nkm!))(C, A, B)
+  @eval $(Symbol(prefix, :fgemm!))(C, A, B) =
+    $(Symbol(prefix, :fgemm_nkm!))(C, A, B)
   @eval function $(Symbol(prefix, :fgemm_builtin!))(C, A, B)
     M, N = size(C)
     K = size(B, 1)
     ccall(
       (:gemmbuiltin, $Fshared),
       Cvoid,
-      (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
+      (
+        Ptr{Float64},
+        Ptr{Float64},
+        Ptr{Float64},
+        Ref{Clong},
+        Ref{Clong},
+        Ref{Clong}
+      ),
       C,
       A,
       B,
       Ref(M),
       Ref(K),
-      Ref(N),
+      Ref(N)
     )
   end
   let (p, s) = (:c, Cshared)
@@ -496,7 +516,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
         B,
         M,
         K,
-        N,
+        N
       )
     end
   end
@@ -506,13 +526,20 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
     ccall(
       (:AtmulB, $Fshared),
       Cvoid,
-      (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
+      (
+        Ptr{Float64},
+        Ptr{Float64},
+        Ptr{Float64},
+        Ref{Clong},
+        Ref{Clong},
+        Ref{Clong}
+      ),
       C,
       parent(A),
       B,
       Ref(M),
       Ref(K),
-      Ref(N),
+      Ref(N)
     )
   end
   @eval function $(Symbol(prefix, :fgemm_builtin!))(C, A::Adjoint, B)
@@ -521,13 +548,20 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
     ccall(
       (:AtmulBbuiltin, $Fshared),
       Cvoid,
-      (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
+      (
+        Ptr{Float64},
+        Ptr{Float64},
+        Ptr{Float64},
+        Ref{Clong},
+        Ref{Clong},
+        Ref{Clong}
+      ),
       C,
       parent(A),
       B,
       Ref(M),
       Ref(K),
-      Ref(N),
+      Ref(N)
     )
   end
   let (p, s) = (:c, Cshared)# (:e,Eshared)]
@@ -543,7 +577,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
         parent(B),
         M,
         K,
-        N,
+        N
       )
     end
   end
@@ -553,13 +587,20 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
     ccall(
       (:AmulBt, $Fshared),
       Cvoid,
-      (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
+      (
+        Ptr{Float64},
+        Ptr{Float64},
+        Ptr{Float64},
+        Ref{Clong},
+        Ref{Clong},
+        Ref{Clong}
+      ),
       C,
       A,
       parent(B),
       Ref(M),
       Ref(K),
-      Ref(N),
+      Ref(N)
     )
   end
   @eval function $(Symbol(prefix, :fgemm_builtin!))(C, A, B::Adjoint)
@@ -568,13 +609,20 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
     ccall(
       (:AmulBtbuiltin, $Fshared),
       Cvoid,
-      (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
+      (
+        Ptr{Float64},
+        Ptr{Float64},
+        Ptr{Float64},
+        Ref{Clong},
+        Ref{Clong},
+        Ref{Clong}
+      ),
       C,
       A,
       parent(B),
       Ref(M),
       Ref(K),
-      Ref(N),
+      Ref(N)
     )
   end
   @eval function $(Symbol(prefix, :fgemm!))(C, A::Adjoint, B::Adjoint)
@@ -583,13 +631,20 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
     ccall(
       (:AtmulBt, $Fshared),
       Cvoid,
-      (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
+      (
+        Ptr{Float64},
+        Ptr{Float64},
+        Ptr{Float64},
+        Ref{Clong},
+        Ref{Clong},
+        Ref{Clong}
+      ),
       C,
       parent(A),
       parent(B),
       Ref(M),
       Ref(K),
-      Ref(N),
+      Ref(N)
     )
   end
   @eval function $(Symbol(prefix, :fgemm_builtin!))(C, A::Adjoint, B::Adjoint)
@@ -598,18 +653,32 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
     ccall(
       (:AtmulBtbuiltin, $Fshared),
       Cvoid,
-      (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
+      (
+        Ptr{Float64},
+        Ptr{Float64},
+        Ptr{Float64},
+        Ref{Clong},
+        Ref{Clong},
+        Ref{Clong}
+      ),
       C,
       parent(A),
       parent(B),
       Ref(M),
       Ref(K),
-      Ref(N),
+      Ref(N)
     )
   end
   @eval function $(Symbol(prefix, :fdot))(a, b)
     N = length(a)
-    ccall((:dot, $Fshared), Float64, (Ptr{Float64}, Ptr{Float64}, Ref{Clong}), a, b, Ref(N))
+    ccall(
+      (:dot, $Fshared),
+      Float64,
+      (Ptr{Float64}, Ptr{Float64}, Ref{Clong}),
+      a,
+      b,
+      Ref(N)
+    )
   end
 
   @eval function $(Symbol(prefix, :fselfdot))(a)
@@ -626,7 +695,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       A,
       y,
       Ref(M),
-      Ref(N),
+      Ref(N)
     )
   end
   @eval function $(Symbol(prefix, :fgemv!))(y, A, x)
@@ -639,7 +708,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       A,
       x,
       Ref(M),
-      Ref(K),
+      Ref(K)
     )
   end
   @eval function $(Symbol(prefix, :fgemv_builtin!))(y, A, x)
@@ -652,7 +721,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       A,
       x,
       Ref(M),
-      Ref(K),
+      Ref(K)
     )
   end
   @eval function $(Symbol(prefix, :fgemv!))(y, A::Adjoint, x)
@@ -665,7 +734,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       parent(A),
       x,
       Ref(M),
-      Ref(K),
+      Ref(K)
     )
   end
   @eval function $(Symbol(prefix, :fgemv_builtin!))(y, A::Adjoint, x)
@@ -678,7 +747,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       parent(A),
       x,
       Ref(M),
-      Ref(K),
+      Ref(K)
     )
   end
   let (p, s) = (:c, Cshared)# (:e,Eshared)]
@@ -694,7 +763,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
         parent(B),
         M,
         K,
-        N,
+        N
       )
     end
     @eval function $(Symbol(prefix, p, :dot))(a, b)
@@ -711,7 +780,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
         A,
         x,
         M,
-        K,
+        K
       )
     end
     @eval function $(Symbol(prefix, p, :selfdot))(a)
@@ -728,7 +797,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
         A,
         y,
         M,
-        N,
+        N
       )
     end
     @eval function $(Symbol(prefix, p, :gemv!))(y, A::Adjoint, x)
@@ -741,7 +810,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
         parent(A),
         x,
         M,
-        K,
+        K
       )
     end
     @eval function $(Symbol(prefix, p, :aplusBc!))(D, a, B, c)
@@ -755,7 +824,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
         B,
         c,
         M,
-        K,
+        K
       )
     end
     @eval function $(Symbol(prefix, p, :OLSlp))(y, X, β)
@@ -768,7 +837,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
         X,
         β,
         N,
-        P,
+        P
       )
     end
     @eval function $(Symbol(prefix, p, :AplusAt!))(B, A)
@@ -776,7 +845,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       ccall((:AplusAt, $s), Cvoid, (Ptr{Float64}, Ptr{Float64}, Clong), B, A, N)
     end
     @eval function $(Symbol(prefix, p, :logdettriangle))(
-      T::Union{LowerTriangular,UpperTriangular},
+      T::Union{LowerTriangular,UpperTriangular}
     )
       N = size(T, 1)
       Tp = parent(T)
@@ -788,13 +857,20 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
     ccall(
       (:aplusBc, $Fshared),
       Cvoid,
-      (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}),
+      (
+        Ptr{Float64},
+        Ptr{Float64},
+        Ptr{Float64},
+        Ptr{Float64},
+        Ref{Clong},
+        Ref{Clong}
+      ),
       D,
       a,
       B,
       c,
       Ref(M),
-      Ref(K),
+      Ref(K)
     )
   end
   @eval function $(Symbol(prefix, :fOLSlp))(y, X, β)
@@ -807,21 +883,42 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       X,
       β,
       Ref(N),
-      Ref(P),
+      Ref(P)
     )
   end
   @eval function $(Symbol(prefix, :cvexp!))(b, a)
     N = length(b)
-    ccall((:vexp, $Cshared), Cvoid, (Ptr{Float64}, Ptr{Float64}, Clong), b, a, N)
+    ccall(
+      (:vexp, $Cshared),
+      Cvoid,
+      (Ptr{Float64}, Ptr{Float64}, Clong),
+      b,
+      a,
+      N
+    )
   end
   @eval function $(Symbol(prefix, :fvexp!))(b, a)
     N = length(b)
-    ccall((:vexp, $Fshared), Cvoid, (Ptr{Float64}, Ptr{Float64}, Ref{Clong}), b, a, Ref(N))
+    ccall(
+      (:vexp, $Fshared),
+      Cvoid,
+      (Ptr{Float64}, Ptr{Float64}, Ref{Clong}),
+      b,
+      a,
+      Ref(N)
+    )
   end
   @eval function $(Symbol(prefix, :fvexpsum))(a)
     N = length(a)
     s = Ref{Float64}()
-    ccall((:svexp, $Fshared), Cvoid, (Ref{Float64}, Ptr{Float64}, Ref{Clong}), s, a, Ref(N))
+    ccall(
+      (:svexp, $Fshared),
+      Cvoid,
+      (Ref{Float64}, Ptr{Float64}, Ref{Clong}),
+      s,
+      a,
+      Ref(N)
+    )
     s[]
   end
   @eval function $(Symbol(prefix, :fAplusAt!))(B, A)
@@ -832,7 +929,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       (Ptr{Float64}, Ptr{Float64}, Ref{Clong}),
       B,
       A,
-      Ref(N),
+      Ref(N)
     )
   end
   @eval function $(Symbol(prefix, :fAplusAt_builtin!))(B, A)
@@ -843,7 +940,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       (Ptr{Float64}, Ptr{Float64}, Ref{Clong}),
       B,
       A,
-      Ref(N),
+      Ref(N)
     )
   end
 
@@ -857,7 +954,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       basis,
       coefs,
       A,
-      C,
+      C
     )
   end
   @eval function $(Symbol(prefix, :frandomaccess))(P, basis, coefs)
@@ -870,20 +967,26 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       basis,
       coefs,
       Ref(A),
-      Ref(C),
+      Ref(C)
     )
   end
   @eval function $(Symbol(prefix, :flogdettriangle))(
-    T::Union{LowerTriangular,UpperTriangular},
+    T::Union{LowerTriangular,UpperTriangular}
   )
     N = size(T, 1)
     Tp = parent(T)
-    ccall((:logdettriangle, $Fshared), Float64, (Ptr{Float64}, Ref{Clong}), Tp, Ref(N))
+    ccall(
+      (:logdettriangle, $Fshared),
+      Float64,
+      (Ptr{Float64}, Ref{Clong}),
+      Tp,
+      Ref(N)
+    )
   end
   @eval function $(Symbol(prefix, :cfilter2d!))(
     B::OffsetArray,
     A::AbstractArray,
-    K::OffsetArray,
+    K::OffsetArray
   )
     Ma, Na = size(A)
     offset = first(B.offsets)
@@ -896,32 +999,39 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       parent(K),
       Ma,
       Na,
-      offset,
+      offset
     )
   end
   @eval function $(Symbol(prefix, :ffilter2d!))(
     B::OffsetArray,
     A::AbstractArray,
-    K::OffsetArray,
+    K::OffsetArray
   )
     Ma, Na = size(A)
     offset = first(B.offsets)
     ccall(
       (:filter2d, $Fshared),
       Cvoid,
-      (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
+      (
+        Ptr{Float64},
+        Ptr{Float64},
+        Ptr{Float64},
+        Ref{Clong},
+        Ref{Clong},
+        Ref{Clong}
+      ),
       parent(B),
       A,
       parent(K),
       Ref(Ma),
       Ref(Na),
-      Ref(offset),
+      Ref(offset)
     )
   end
   @eval function $(Symbol(prefix, :cfilter2d!))(
     B::OffsetArray,
     A::AbstractArray,
-    K::SizedOffsetMatrix{Float64,-1,1,-1,1},
+    K::SizedOffsetMatrix{Float64,-1,1,-1,1}
   )
     Ma, Na = size(A)
     ccall(
@@ -932,13 +1042,13 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       A,
       K,
       Ma,
-      Na,
+      Na
     )
   end
   @eval function $(Symbol(prefix, :ffilter2d!))(
     B::OffsetArray,
     A::AbstractArray,
-    K::SizedOffsetMatrix{Float64,-1,1,-1,1},
+    K::SizedOffsetMatrix{Float64,-1,1,-1,1}
   )
     Ma, Na = size(A)
     ccall(
@@ -949,13 +1059,13 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       A,
       K,
       Ref(Ma),
-      Ref(Na),
+      Ref(Na)
     )
   end
   @eval function $(Symbol(prefix, :cfilter2dunrolled!))(
     B::OffsetArray,
     A::AbstractArray,
-    K::SizedOffsetMatrix{Float64,-1,1,-1,1},
+    K::SizedOffsetMatrix{Float64,-1,1,-1,1}
   )
     Ma, Na = size(A)
     ccall(
@@ -966,13 +1076,13 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       A,
       K,
       Ma,
-      Na,
+      Na
     )
   end
   @eval function $(Symbol(prefix, :ffilter2dunrolled!))(
     B::OffsetArray,
     A::AbstractArray,
-    K::SizedOffsetMatrix{Float64,-1,1,-1,1},
+    K::SizedOffsetMatrix{Float64,-1,1,-1,1}
   )
     Ma, Na = size(A)
     ccall(
@@ -983,8 +1093,7 @@ for (prefix, Cshared, Fshared) ∈ funcs_to_define
       A,
       K,
       Ref(Ma),
-      Ref(Na),
+      Ref(Na)
     )
   end
-
 end
diff --git a/benchmark/looptests.jl b/benchmark/looptests.jl
index 3c8d4223b..a5f417afb 100644
--- a/benchmark/looptests.jl
+++ b/benchmark/looptests.jl
@@ -1,7 +1,6 @@
 using LoopVectorization, LinearAlgebra, OffsetArrays, ArrayInterface
 BLAS.set_num_threads(1)
 
-
 using LoopVectorization: Static
 # TODO: remove this once this PR merges: https://github.com/JuliaArrays/OffsetArrays.jl/pull/170
 @inline Base.unsafe_convert(::Type{Ptr{T}}, A::OffsetArray{T}) where {T} =
@@ -15,20 +14,24 @@ Base.size(::SizedOffsetMatrix{<:Any,LR,UR,LC,UC}) where {LR,UR,LC,UC} =
 Base.axes(::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} =
   (StaticInt{LR}():StaticInt{UR}(), StaticInt{LC}():StaticInt{UC}())
 Base.parent(A::SizedOffsetMatrix) = A.data
-Base.unsafe_convert(::Type{Ptr{T}}, A::SizedOffsetMatrix{T}) where {T} = pointer(A.data)
+Base.unsafe_convert(::Type{Ptr{T}}, A::SizedOffsetMatrix{T}) where {T} =
+  pointer(A.data)
 ArrayInterface.contiguous_axis(::Type{<:SizedOffsetMatrix}) = StaticInt(1)
 ArrayInterface.contiguous_batch_size(::Type{<:SizedOffsetMatrix}) = StaticInt(0)
-ArrayInterface.stride_rank(::Type{<:SizedOffsetMatrix}) = (StaticInt(1), StaticInt(2))
-function ArrayInterface.strides(A::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC}
+ArrayInterface.stride_rank(::Type{<:SizedOffsetMatrix}) =
+  (StaticInt(1), StaticInt(2))
+function ArrayInterface.strides(
+  A::SizedOffsetMatrix{T,LR,UR,LC,UC}
+) where {T,LR,UR,LC,UC}
   (StaticInt{1}(), (StaticInt{UR}() - StaticInt{LR}() + StaticInt{1}()))
 end
-ArrayInterface.offsets(A::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} =
-  (StaticInt{LR}(), StaticInt{LC}())
+ArrayInterface.offsets(
+  A::SizedOffsetMatrix{T,LR,UR,LC,UC}
+) where {T,LR,UR,LC,UC} = (StaticInt{LR}(), StaticInt{LC}())
 ArrayInterface.parent_type(::Type{<:SizedOffsetMatrix{T}}) where {T} = Matrix{T}
 Base.getindex(A::SizedOffsetMatrix, i, j) =
   LoopVectorization.vload(LoopVectorization.stridedpointer(A), (i, j))
 
-
 function jgemm!(𝐂, 𝐀, 𝐁)
   𝐂 .= 0
   M, N = size(𝐂)
@@ -72,19 +75,17 @@ function jgemm!(𝐂, 𝐀ᵀ::Adjoint, 𝐁ᵀ::Adjoint)
     end
   end
 end
-function gemmavx!(𝐂, 𝐀, 𝐁)
-  @turbo for m ∈ indices((𝐀, 𝐂), 1), n ∈ indices((𝐁, 𝐂), 2)
+gemmavx!(𝐂, 𝐀, 𝐁) = @turbo for m ∈ indices((𝐀, 𝐂), 1), n ∈ indices((𝐁, 𝐂), 2)
     𝐂ₘₙ = zero(eltype(𝐂))
     for k ∈ indices((𝐀, 𝐁), (2, 1))
       𝐂ₘₙ += 𝐀[m, k] * 𝐁[k, n]
     end
     𝐂[m, n] = 𝐂ₘₙ
   end
-end
 function gemmavx!(
   Cc::AbstractMatrix{Complex{T}},
   Ac::AbstractMatrix{Complex{T}},
-  Bc::AbstractMatrix{Complex{T}},
+  Bc::AbstractMatrix{Complex{T}}
 ) where {T}
   A = reinterpret(reshape, T, Ac)
   B = reinterpret(reshape, T, Bc)
@@ -100,19 +101,17 @@ function gemmavx!(
     C[2, m, n] = Cim
   end
 end
-function gemmavxt!(𝐂, 𝐀, 𝐁)
-  @tturbo for m ∈ indices((𝐀, 𝐂), 1), n ∈ indices((𝐁, 𝐂), 2)
+gemmavxt!(𝐂, 𝐀, 𝐁) = @tturbo for m ∈ indices((𝐀, 𝐂), 1), n ∈ indices((𝐁, 𝐂), 2)
     𝐂ₘₙ = zero(eltype(𝐂))
     for k ∈ indices((𝐀, 𝐁), (2, 1))
       𝐂ₘₙ += 𝐀[m, k] * 𝐁[k, n]
     end
     𝐂[m, n] = 𝐂ₘₙ
   end
-end
 function gemmavxt!(
   Cc::AbstractMatrix{Complex{T}},
   Ac::AbstractMatrix{Complex{T}},
-  Bc::AbstractMatrix{Complex{T}},
+  Bc::AbstractMatrix{Complex{T}}
 ) where {T}
   A = reinterpret(reshape, T, Ac)
   B = reinterpret(reshape, T, Bc)
@@ -204,16 +203,12 @@ function jdot3avx(x, A, y)
   end
   s
 end
-function jvexp!(b, a)
-  @inbounds for i ∈ eachindex(a)
+jvexp!(b, a) = @inbounds for i ∈ eachindex(a)
     b[i] = exp(a[i])
   end
-end
-function jvexpavx!(b, a)
-  @turbo for i ∈ eachindex(a)
+jvexpavx!(b, a) = @turbo for i ∈ eachindex(a)
     b[i] = exp(a[i])
   end
-end
 function jsvexp(a)
   s = zero(eltype(a))
   @inbounds for i ∈ eachindex(a)
@@ -246,15 +241,13 @@ function jgemv!(𝐲, 𝐀ᵀ::Adjoint, 𝐱)
     𝐲[i] = 𝐲ᵢ
   end
 end
-function jgemvavx!(𝐲, 𝐀, 𝐱)
-  @turbo for i ∈ eachindex(𝐲)
+jgemvavx!(𝐲, 𝐀, 𝐱) = @turbo for i ∈ eachindex(𝐲)
     𝐲ᵢ = zero(eltype(𝐲))
     for j ∈ eachindex(𝐱)
       𝐲ᵢ += 𝐀[i, j] * 𝐱[j]
     end
     𝐲[i] = 𝐲ᵢ
   end
-end
 function jvar!(𝐬², 𝐀, x̄)
   @. s² = zero(eltype(𝐬²))
   @inbounds @fastmath for i ∈ 1:size(𝐀, 2)
@@ -264,8 +257,7 @@ function jvar!(𝐬², 𝐀, x̄)
     end
   end
 end
-function jvaravx!(𝐬², 𝐀, x̄)
-  @turbo for j ∈ eachindex(𝐬²)
+jvaravx!(𝐬², 𝐀, x̄) = @turbo for j ∈ eachindex(𝐬²)
     𝐬²ⱼ = zero(eltype(𝐬²))
     x̄ⱼ = x̄[j]
     for i ∈ 1:size(𝐀, 2)
@@ -274,7 +266,6 @@ function jvaravx!(𝐬², 𝐀, x̄)
     end
     𝐬²[j] = 𝐬²ⱼ
   end
-end
 japlucBc!(D, a, B, c) = @. D = a + B * c';
 japlucBcavx!(D, a, B, c) = @turbo @. D = a + B * c';
 
@@ -342,9 +333,6 @@ function jlogdettriangleavx(B::Union{LowerTriangular,UpperTriangular})
   ld
 end
 
-
-
-
 function filter2d!(out::AbstractMatrix, A::AbstractMatrix, kern)
   @inbounds @fastmath for J in CartesianIndices(out)
     tmp = zero(eltype(out))
@@ -369,7 +357,7 @@ end
 function filter2dunrolled!(
   out::AbstractMatrix,
   A::AbstractMatrix,
-  kern::SizedOffsetMatrix{T,-1,1,-1,1},
+  kern::SizedOffsetMatrix{T,-1,1,-1,1}
 ) where {T}
   rng1, rng2 = axes(out)
   Base.Cartesian.@nexprs 3 jk ->
@@ -380,7 +368,7 @@ function filter2dunrolled!(
       Base.Cartesian.@nexprs 3 jk -> Base.Cartesian.@nexprs 3 ik ->
         tmp_{ik + (jk - 1) * 3} = Base.FastMath.add_fast(
           Base.FastMath.mul_fast(A[i+(ik-2), j+(jk-2)], kern_ik_jk),
-          tmp_{ik + (jk - 1) * 3 - 1},
+          tmp_{ik + (jk - 1) * 3 - 1}
         )
       out[i, j] = tmp_9
     end
@@ -390,7 +378,7 @@ end
 function filter2dunrolledavx!(
   out::AbstractMatrix,
   A::AbstractMatrix,
-  kern::SizedOffsetMatrix{T,-1,1,-1,1},
+  kern::SizedOffsetMatrix{T,-1,1,-1,1}
 ) where {T}
   rng1, rng2 = axes(out)
   Base.Cartesian.@nexprs 3 jk ->
@@ -405,7 +393,6 @@ function filter2dunrolledavx!(
   out
 end
 
-
 # function smooth_line!(sl,nrm1,j,i1,rl,ih2,denom)
 #     @fastmath @inbounds @simd ivdep for i=i1:2:nrm1
 #         sl[i,j]=denom*(rl[i,j]+ih2*(sl[i,j-1]+sl[i-1,j]+sl[i+1,j]+sl[i,j+1]))
diff --git a/benchmark/plotbenchmarks.jl b/benchmark/plotbenchmarks.jl
index f505b89ed..e9f984505 100644
--- a/benchmark/plotbenchmarks.jl
+++ b/benchmark/plotbenchmarks.jl
@@ -2,17 +2,22 @@ using PrettyTables
 
 function Base.show(io::IO, br::BenchmarkResult)
   hb = Highlighter(
-    (br, i, j) -> (j > 1 && maximum(@view(br.results[:, i])) == br.results[j-1, i]),
-    foreground = :green,
+    (br, i, j) ->
+      (j > 1 && maximum(@view(br.results[:, i])) == br.results[j-1, i]);
+    foreground = :green
+  )
+  pretty_table(
+    io,
+    br.sizedresults,
+    br.tests;
+    crop = :none,
+    highlighters = (hb,)
   )
-  pretty_table(io, br.sizedresults, br.tests, crop = :none, highlighters = (hb,))
 end
 
-
 if (Sys.ARCH === :aarch64) && Sys.isapple()
   nothing
 else
-
   using Colors, ColorSchemes, Gadfly
   const COLORS = [RGB(0.0, 0.0, 0.0), RGB(1.0, 0.0, 0.0)]
   # const COLORS = [RGB(0.0,0.0,0.0),RGB(0.0,1.0,0.0)]
@@ -24,11 +29,9 @@ else
   # const COLOR_MAP = Dict{String,RGB{Float64}}()
   # const COLOR_MAP = Dict{String,RGB{Colors.N0f8}}()
   const COLOR_MAP64 = Dict{String,RGB{Float64}}()
-  function getcolor(s::String)
-    get!(COLOR_MAP64, s) do
+  getcolor(s::String) = get!(COLOR_MAP64, s) do
       COLORS[length(COLOR_MAP64)+1]
     end
-  end
   replace_and(str) = replace(str, '&' => "with")
 
   function Gadfly.plot(br::BenchmarkResult)
@@ -46,7 +49,7 @@ else
     maxtick = 10round(Int, 0.1maxres)
     yt = if iszero(maxtick)
       maxtick = 10round(0.1maxres)
-      range(0, maxres, length = 20)
+      range(0, maxres; length = 20)
     elseif maxtick < 10
       0:1:maxtick
     elseif maxtick < 20
@@ -60,24 +63,31 @@ else
       Gadfly.Guide.manual_color_key("Methods", tests, colors),
       Guide.xlabel("Size"),
       Guide.ylabel("GFLOPS"),
-      Guide.xticks(ticks = collect(xt)),
-      Guide.yticks(ticks = collect(yt)),
+      Guide.xticks(; ticks = collect(xt)),
+      Guide.yticks(; ticks = collect(yt))
     )
     for i ∈ eachindex(tests)
-      push!(p, layer(x = sizes, y = res[i, :], Geom.line, Theme(default_color = colors[i])))
+      push!(
+        p,
+        layer(
+          Geom.line,
+          Theme(; default_color = colors[i]);
+          x = sizes,
+          y = res[i, :]
+        )
+      )
     end
     addlabel && push!(
       p,
       layer(
+        Geom.label(; position = :centered);
         x = fill(maxxtick - 10, length(tests)),
         y = res[:, maxxind],
-        label = tests,
-        Geom.label(position = :centered),
-      ),
+        label = tests
+      )
     )
     p
   end
-
 end
 # using VegaLite, IndexedTables
 # function plot(br::BenchmarkResult)
diff --git a/docs/make.jl b/docs/make.jl
index 9168c0022..b537d66f5 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -2,7 +2,7 @@ using Documenter, LoopVectorization
 
 makedocs(;
   modules = [LoopVectorization],
-  format = Documenter.HTML(prettyurls = get(ENV, "CI", nothing) == "true"),
+  format = Documenter.HTML(; prettyurls = get(ENV, "CI", nothing) == "true"),
   pages = [
     "Home" => "index.md",
     "Getting Started" => "getting_started.md",
@@ -15,7 +15,7 @@ makedocs(;
       "examples/datetime_arrays.md",
       "examples/special_functions.md",
       "examples/sum_of_squared_error.md",
-      "examples/filtering.md",
+      "examples/filtering.md"
     ],
     "Vectorized Convenience Functions" => "vectorized_convenience_functions.md",
     "Future Work" => "future_work.md",
@@ -26,12 +26,12 @@ makedocs(;
       "devdocs/constructing_loopsets.md",
       "devdocs/evaluating_loops.md",
       "devdocs/lowering.md",
-      "devdocs/reference.md",
-    ],
+      "devdocs/reference.md"
+    ]
   ],
   # repo="https://github.com/JuliaSIMD/LoopVectorization.jl/blob/{commit}{path}#L{line}",
   sitename = "LoopVectorization.jl",
-  authors = "Chris Elrod",
+  authors = "Chris Elrod"
   # assets=[],
 )
 
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
index 570c0a576..cc103bd7f 100644
--- a/src/LoopVectorization.jl
+++ b/src/LoopVectorization.jl
@@ -3,7 +3,11 @@ module LoopVectorization
 using ArrayInterfaceCore: UpTri, LoTri
 using Static: StaticInt, gt, static, Zero, One, reduce_tup
 using VectorizationBase,
-  SLEEFPirates, UnPack, OffsetArrays, ArrayInterfaceOffsetArrays, ArrayInterfaceStaticArrays
+  SLEEFPirates,
+  UnPack,
+  OffsetArrays,
+  ArrayInterfaceOffsetArrays,
+  ArrayInterfaceStaticArrays
 using LayoutPointers:
   AbstractStridedPointer,
   StridedPointer,
@@ -113,7 +117,6 @@ using HostCPUFeatures:
   get_cpu_name
 using CPUSummary: num_cores, cache_linesize, cache_size
 
-
 using IfElse: ifelse
 
 using ThreadingUtilities, PolyesterWeave
@@ -138,7 +141,14 @@ using Base.FastMath:
   pow_fast,
   sqrt_fast
 using SLEEFPirates:
-  log_fast, log2_fast, log10_fast, pow, sin_fast, cos_fast, sincos_fast, tan_fast
+  log_fast,
+  log2_fast,
+  log10_fast,
+  pow,
+  sin_fast,
+  cos_fast,
+  sincos_fast,
+  tan_fast
 
 using ArrayInterface
 using ArrayInterface:
@@ -190,7 +200,6 @@ export LowDimArray,
 const VECTORWIDTHSYMBOL, ELTYPESYMBOL, MASKSYMBOL =
   Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##"), Symbol("##mask##")
 
-
 include("vectorizationbase_compat/contract_pass.jl")
 include("vectorizationbase_compat/subsetview.jl")
 include("getconstindexes.jl")
@@ -230,13 +239,13 @@ include("broadcast.jl")
 LoopVectorization provides macros and functions that combine SIMD vectorization and
 loop-reordering so as to improve performance:
 
-- [`@turbo`](@ref): transform `for`-loops and broadcasting
-- [`vmapreduce`](@ref): vectorized version of `mapreduce`
-- [`vreduce`](@ref): vectorized version of `reduce`
-- [`vmap`](@ref) and `vmap!`: vectorized version of `map` and `map!`
-- [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!`
-- [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!`
-- [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!`
+  - [`@turbo`](@ref): transform `for`-loops and broadcasting
+  - [`vmapreduce`](@ref): vectorized version of `mapreduce`
+  - [`vreduce`](@ref): vectorized version of `reduce`
+  - [`vmap`](@ref) and `vmap!`: vectorized version of `map` and `map!`
+  - [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!`
+  - [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!`
+  - [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!`
 """
 LoopVectorization
 
diff --git a/src/broadcast.jl b/src/broadcast.jl
index e94ce9e62..28d110112 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -19,13 +19,13 @@ struct LowDimArray{D,T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N}
   end
 end
 function LowDimArray{D0}(
-  data::LowDimArray{D1,T,N,A},
+  data::LowDimArray{D1,T,N,A}
 ) where {D0,T,N,D1,A<:AbstractArray{T,N}}
   LowDimArray{map(|, D0, D1),T,N,A}(parent(data))
 end
 Base.@propagate_inbounds Base.getindex(
   A::LowDimArray,
-  i::Vararg{Union{StaticInt,Integer,CartesianIndex},K},
+  i::Vararg{Union{StaticInt,Integer,CartesianIndex},K}
 ) where {K} = getindex(A.data, i...)
 @inline Base.size(A::LowDimArray) = Base.size(A.data)
 @inline Base.size(A::LowDimArray, i) = Base.size(A.data, i)
@@ -36,10 +36,14 @@ Base.@propagate_inbounds Base.getindex(
   l = _pick_lowdim_known(Base.tail(b), Base.tail(x))
   (f, l...)
 end
-@inline function ArrayInterface.known_size(::Type{LowDimArray{D,T,N,A}}) where {D,T,N,A}
+@inline function ArrayInterface.known_size(
+  ::Type{LowDimArray{D,T,N,A}}
+) where {D,T,N,A}
   _pick_lowdim_known(D, ArrayInterface.known_size(A))
 end
-@inline ArrayInterface.parent_type(::Type{LowDimArray{D,T,N,A}}) where {T,D,N,A} = A
+@inline ArrayInterface.parent_type(
+  ::Type{LowDimArray{D,T,N,A}}
+) where {T,D,N,A} = A
 @inline Base.strides(A::LowDimArray) = map(Int, strides(A))
 @inline ArrayInterface.device(::LowDimArray) = ArrayInterface.CPUPointer()
 @generated function ArrayInterface.size(A::LowDimArray{D,T,N}) where {D,T,N}
@@ -54,14 +58,20 @@ end
   Expr(:block, Expr(:meta, :inline), :(s = ArrayInterface.size(parent(A))), t)
 end
 Base.parent(A::LowDimArray) = getfield(A, :data)
-Base.unsafe_convert(::Type{Ptr{T}}, A::LowDimArray{D,T}) where {D,T} = pointer(parent(A))
-ArrayInterface.contiguous_axis(A::LowDimArray) = ArrayInterface.contiguous_axis(parent(A))
+Base.unsafe_convert(::Type{Ptr{T}}, A::LowDimArray{D,T}) where {D,T} =
+  pointer(parent(A))
+ArrayInterface.contiguous_axis(A::LowDimArray) =
+  ArrayInterface.contiguous_axis(parent(A))
 ArrayInterface.contiguous_batch_size(A::LowDimArray) =
   ArrayInterface.contiguous_batch_size(parent(A))
-ArrayInterface.stride_rank(A::LowDimArray) = ArrayInterface.stride_rank(parent(A))
+ArrayInterface.stride_rank(A::LowDimArray) =
+  ArrayInterface.stride_rank(parent(A))
 ArrayInterface.offsets(A::LowDimArray) = ArrayInterface.offsets(parent(A))
 
-@generated function _lowdimfilter(::Val{D}, tup::Tuple{Vararg{Any,N}}) where {D,N}
+@generated function _lowdimfilter(
+  ::Val{D},
+  tup::Tuple{Vararg{Any,N}}
+) where {D,N}
   t = Expr(:tuple)
   for n ∈ 1:N
     if n > length(D) || D[n]
@@ -75,14 +85,20 @@ struct ForBroadcast{T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N}
   data::A
 end
 @inline Base.parent(fb::ForBroadcast) = getfield(fb, :data)
-@inline ArrayInterface.parent_type(::Type{ForBroadcast{T,N,A}}) where {T,N,A} = A
-Base.@propagate_inbounds Base.getindex(A::ForBroadcast, i::Vararg{Any,K}) where {K} =
-  parent(A)[i...]
+@inline ArrayInterface.parent_type(::Type{ForBroadcast{T,N,A}}) where {T,N,A} =
+  A
+Base.@propagate_inbounds Base.getindex(
+  A::ForBroadcast,
+  i::Vararg{Any,K}
+) where {K} = parent(A)[i...]
 const LowDimArrayForBroadcast{D,T,N,A} = ForBroadcast{T,N,LowDimArray{D,T,N,A}}
 @inline function VectorizationBase.contiguous_axis(
-  fb::LowDimArrayForBroadcast{D,T,N,A},
+  fb::LowDimArrayForBroadcast{D,T,N,A}
 ) where {D,T,N,A}
-  _contiguous_axis(Val{D}(), VectorizationBase.contiguous_axis(parent(parent(fb))))
+  _contiguous_axis(
+    Val{D}(),
+    VectorizationBase.contiguous_axis(parent(parent(fb)))
+  )
 end
 @inline forbroadcast(A::AbstractArray) = ForBroadcast(A)
 @inline forbroadcast(A::AbstractRange) = A
@@ -95,11 +111,10 @@ end
     size(A),
     strides(B),
     VectorizationBase.val_stride_rank(B),
-    VectorizationBase.val_dense_dims(B),
+    VectorizationBase.val_dense_dims(B)
   )
 end
 
-
 # @inline function VectorizationBase.contiguous_batch_size(fb::LowDimArrayForBroadcast{D,T,N,A}) where {D,T,N,A}
 #     _contiguous_axis(Val{D}(), VectorizationBase.contiguous_batch_size(parent(parent(fb))))
 # end
@@ -116,30 +131,44 @@ end
   Expr(:block, Expr(:meta, :inline), staticexpr(Cnew))
 end
 function ArrayInterface.contiguous_axis(
-  ::Type{LowDimArrayForBroadcast{D,T,N,A}},
+  ::Type{LowDimArrayForBroadcast{D,T,N,A}}
 ) where {D,T,N,A}
   ArrayInterface.contiguous_axis(A)
 end
 @inline function ArrayInterface.stride_rank(
-  ::Type{LowDimArrayForBroadcast{D,T,N,A}},
+  ::Type{LowDimArrayForBroadcast{D,T,N,A}}
 ) where {D,T,N,A}
   _lowdimfilter(Val(D), ArrayInterface.stride_rank(A))
 end
 @inline function ArrayInterface.dense_dims(
-  ::Type{LowDimArrayForBroadcast{D,T,N,A}},
+  ::Type{LowDimArrayForBroadcast{D,T,N,A}}
 ) where {D,T,N,A}
   _lowdimfilter(Val(D), ArrayInterface.dense_dims(A))
 end
-@inline function ArrayInterface.strides(fb::LowDimArrayForBroadcast{D}) where {D}
+@inline function ArrayInterface.strides(
+  fb::LowDimArrayForBroadcast{D}
+) where {D}
   _lowdimfilter(Val(D), strides(parent(fb)))
 end
-@inline function ArrayInterface.offsets(fb::LowDimArrayForBroadcast{D}) where {D}
+@inline function ArrayInterface.offsets(
+  fb::LowDimArrayForBroadcast{D}
+) where {D}
   _lowdimfilter(Val(D), ArrayInterface.offsets(parent(parent(fb))))
 end
-@inline function ArrayInterface.StrideIndex(a::A) where {A<:LowDimArrayForBroadcast}
-  _stride_index(ArrayInterface.stride_rank(A), ArrayInterface.contiguous_axis(A), a)
+@inline function ArrayInterface.StrideIndex(
+  a::A
+) where {A<:LowDimArrayForBroadcast}
+  _stride_index(
+    ArrayInterface.stride_rank(A),
+    ArrayInterface.contiguous_axis(A),
+    a
+  )
 end
-@inline function _stride_index(r::Tuple{Vararg{StaticInt,N}}, ::StaticInt{C}, A) where {N,C}
+@inline function _stride_index(
+  r::Tuple{Vararg{StaticInt,N}},
+  ::StaticInt{C},
+  A
+) where {N,C}
   StrideIndex{N,ArrayInterface.known(r),C}(A)
 end
 
@@ -147,7 +176,7 @@ for f ∈ [ # groupedstridedpointer support
   :(ArrayInterface.contiguous_axis),
   :(ArrayInterface.contiguous_batch_size),
   :(ArrayInterface.device),
-  :(ArrayInterface.stride_rank),
+  :(ArrayInterface.stride_rank)
 ]
   @eval @inline $f(::Type{ForBroadcast{T,N,A}}) where {T,N,A} = $f(A)
 end
@@ -159,7 +188,7 @@ for f ∈ [ # groupedstridedpointer support
   :(ArrayInterface.stride_rank),
   :(VectorizationBase.val_dense_dims),
   :(ArrayInterface.offsets),
-  :(Base.size),#, :(ArrayInterface.strides)
+  :(Base.size)#, :(ArrayInterface.strides)
 ]
   @eval @inline $f(fb::ForBroadcast) = $f(getfield(fb, :data))
 end
@@ -172,7 +201,12 @@ function is_column_major(x)
 end
 is_row_major(x) = is_column_major(reverse(x))
 # @inline _bytestrides(s,paren) = VectorizationBase.bytestrides(paren)
-function _strides_expr(@nospecialize(s), @nospecialize(x), R::Vector{Int}, D::Vector{Bool})
+function _strides_expr(
+  @nospecialize(s),
+  @nospecialize(x),
+  R::Vector{Int},
+  D::Vector{Bool}
+)
   N = length(R)
   q = Expr(:block, Expr(:meta, :inline))
   strd_tup = Expr(:tuple)
@@ -214,7 +248,7 @@ function _strides_expr(@nospecialize(s), @nospecialize(x), R::Vector{Int}, D::Ve
       else
         push!(
           strd_tup.args,
-          :($ifel(isone($getfield(s, $n)), zero($xₙ_type), $getfield(x, $n))),
+          :($ifel(isone($getfield(s, $n)), zero($xₙ_type), $getfield(x, $n)))
         )
       end
     end
@@ -242,7 +276,7 @@ end
   s::Tuple{Vararg{Union{StaticInt,Integer},N}},
   x::Tuple{Vararg{Union{StaticInt,Integer},N}},
   ::Val{R},
-  ::Val{D},
+  ::Val{D}
 ) where {N,R,D}
   Rv = Vector{Int}(undef, N)
   Dv = Vector{Bool}(undef, N)
@@ -272,16 +306,20 @@ end
 @inline Base.ndims(::Type{Product{A,B}}) where {A,B} = numdims(B)
 # This numdims nonsense is a hack to avoid type piracy in defining:
 @inline numdims(
-  ::Type{B},
-) where {N,S<:Base.Broadcast.AbstractArrayStyle{N},B<:Base.Broadcast.Broadcasted{S}} = N
+  ::Type{B}
+) where {
+  N,
+  S<:Base.Broadcast.AbstractArrayStyle{N},
+  B<:Base.Broadcast.Broadcasted{S}
+} = N
 
 Base.Broadcast._broadcast_getindex_eltype(
-  ::Product{A,B},
+  ::Product{A,B}
 ) where {T,A<:AbstractVecOrMat{T},B<:AbstractVecOrMat{T}} = T
 function Base.Broadcast._broadcast_getindex_eltype(p::Product)
   promote_type(
     Base.Broadcast._broadcast_getindex_eltype(p.a),
-    Base.Broadcast._broadcast_getindex_eltype(p.b),
+    Base.Broadcast._broadcast_getindex_eltype(p.b)
   )
 end
 
@@ -316,7 +354,7 @@ function add_broadcast!(
   loopsyms::Vector{Symbol},
   @nospecialize(prod::Type{<:Product}),
   dontbc,
-  elementbytes::Int,
+  elementbytes::Int
 )
   A, B = prod.parameters
   Krange = gensym!(ls, "K")
@@ -325,7 +363,10 @@ function add_broadcast!(
   mB = gensym!(ls, "Bₖₙ")
   pushprepreamble!(ls, Expr(:(=), mA, Expr(:(.), bcname, QuoteNode(:a))))
   pushprepreamble!(ls, Expr(:(=), mB, Expr(:(.), bcname, QuoteNode(:b))))
-  pushprepreamble!(ls, Expr(:(=), Klen, Expr(:call, getfield, Expr(:call, :size, mB), 1)))
+  pushprepreamble!(
+    ls,
+    Expr(:(=), Klen, Expr(:call, getfield, Expr(:call, :size, mB), 1))
+  )
   pushpreamble!(ls, Expr(:(=), Krange, Expr(:call, :(:), staticexpr(1), Klen)))
   k = gensym!(ls, "k")
   add_loop!(ls, Loop(k, 1, Klen, 1, Krange, Klen), k)
@@ -346,18 +387,46 @@ function add_broadcast!(
   end
   # load A
   # loadA = add_load!(ls, gensym!(ls, :A), productref(A, mA, m, k), elementbytes)
-  loadA = add_broadcast!(ls, gensym!(ls, "A"), mA, Symbol[m, k], A, dontbc[1], elementbytes)
+  loadA = add_broadcast!(
+    ls,
+    gensym!(ls, "A"),
+    mA,
+    Symbol[m, k],
+    A,
+    dontbc[1],
+    elementbytes
+  )
   # load B
-  loadB = add_broadcast!(ls, gensym!(ls, "B"), mB, bloopsyms, B, dontbc[2], elementbytes)
+  loadB = add_broadcast!(
+    ls,
+    gensym!(ls, "B"),
+    mB,
+    bloopsyms,
+    B,
+    dontbc[2],
+    elementbytes
+  )
   # set Cₘₙ = 0
   # setC = add_constant!(ls, zero(promote_type(recursive_eltype(A), recursive_eltype(B))), cloopsyms, mC, elementbytes)
   # targetC will be used for reduce_to_add
   mCt = gensym!(ls, mC)
-  targetC =
-    add_constant!(ls, gensym!(ls, "zero"), cloopsyms, mCt, elementbytes, :numericconstant)
+  targetC = add_constant!(
+    ls,
+    gensym!(ls, "zero"),
+    cloopsyms,
+    mCt,
+    elementbytes,
+    :numericconstant
+  )
   push!(ls.preamble_zeros, (identifier(targetC), IntOrFloat))
-  setC =
-    add_constant!(ls, gensym!(ls, "zero"), cloopsyms, mC, elementbytes, :numericconstant)
+  setC = add_constant!(
+    ls,
+    gensym!(ls, "zero"),
+    cloopsyms,
+    mC,
+    elementbytes,
+    :numericconstant
+  )
   push!(ls.preamble_zeros, (identifier(setC), IntOrFloat))
   setC.reduced_children = kvec
   # compute Cₘₙ += Aₘₖ * Bₖₙ
@@ -370,7 +439,7 @@ function add_broadcast!(
     compute,
     reductdeps,
     kvec,
-    Operation[loadA, loadB, setC],
+    Operation[loadA, loadB, setC]
   )
   reductop = pushop!(ls, reductop, mC)
   reductfinal = Operation(
@@ -381,12 +450,17 @@ function add_broadcast!(
     compute,
     cloopsyms,
     kvec,
-    Operation[reductop, targetC],
+    Operation[reductop, targetC]
   )
   pushop!(ls, reductfinal, mCt)
 end
 
-function extract_all_1_array!(ls::LoopSet, bcname::Symbol, N::Int, elementbytes::Int)
+function extract_all_1_array!(
+  ls::LoopSet,
+  bcname::Symbol,
+  N::Int,
+  elementbytes::Int
+)
   refextract = gensym!(ls, bcname)
   ref = Expr(:ref, bcname)
   for _ ∈ 1:N
@@ -408,7 +482,7 @@ function add_broadcast!(
   loopsyms::Vector{Symbol},
   @nospecialize(_::Type{<:AbstractArray{T,N}}),
   @nospecialize(dontbc::NTuple{N,Bool}),
-  elementbytes::Int,
+  elementbytes::Int
 ) where {T,N}
   any(dontbc) || return extract_all_1_array!(ls, bcname, N, elementbytes)
   bcname2 = gensym!(ls, bcname)
@@ -437,7 +511,7 @@ function add_broadcast!(
   loopsyms::Vector{Symbol},
   @nospecialize(_::Type{T}),
   @nospecialize(__),
-  elementbytes::Int,
+  elementbytes::Int
 ) where {T<:Number}
   add_constant!(ls, bcname, elementbytes) # or replace elementbytes with sizeof(T) ? u
 end
@@ -448,13 +522,14 @@ function add_broadcast!(
   loopsyms::Vector{Symbol},
   @nospecialize(_::Type{Base.RefValue{T}}),
   @nospecialize(__),
-  elementbytes::Int,
+  elementbytes::Int
 ) where {T}
   refextract = gensym!(ls, bcname)
   pushprepreamble!(ls, Expr(:(=), refextract, Expr(:ref, bcname)))
   add_constant!(ls, refextract, elementbytes) # or replace elementbytes with sizeof(T) ? u
 end
-const BroadcastedArray{S<:Broadcast.AbstractArrayStyle,F,A} = Broadcasted{S,Nothing,F,A}
+const BroadcastedArray{S<:Broadcast.AbstractArrayStyle,F,A} =
+  Broadcasted{S,Nothing,F,A}
 function add_broadcast!(
   ls::LoopSet,
   destname::Symbol,
@@ -462,7 +537,7 @@ function add_broadcast!(
   loopsyms::Vector{Symbol},
   @nospecialize(B::Type{<:BroadcastedArray}),
   @nospecialize(dontbc),
-  elementbytes::Int,
+  elementbytes::Int
 )
   S, _, F, A = B.parameters
   instr = get(FUNCTIONSYMBOLS, F) do
@@ -488,7 +563,7 @@ function add_broadcast!(
       loopsyms,
       arg,
       dontbc[i],
-      elementbytes,
+      elementbytes
     )::Operation
     push!(parents, parent)
     mergesetdiffv!(deps, loopdependencies(parent), reduceddependencies(parent))
@@ -501,12 +576,16 @@ function add_broadcast!(
     compute,
     deps,
     NODEPENDENCY,
-    parents,
+    parents
   )
   pushop!(ls, op, destname)
 end
 
-function add_broadcast_loops!(ls::LoopSet, loopsyms::Vector{Symbol}, destsym::Symbol)
+function add_broadcast_loops!(
+  ls::LoopSet,
+  loopsyms::Vector{Symbol},
+  destsym::Symbol
+)
   axes_tuple = Expr(:tuple)
   pushpreamble!(ls, Expr(:(=), axes_tuple, Expr(:call, :axes, destsym)))
   for itersym ∈ loopsyms
@@ -516,11 +595,21 @@ function add_broadcast_loops!(ls::LoopSet, loopsyms::Vector{Symbol}, destsym::Sy
     Nlen = gensym!(ls, "N")
     add_loop!(ls, Loop(itersym, Nlower, Nupper, 1, Nrange, Nlen), itersym)
     push!(axes_tuple.args, Nrange)
-    pushpreamble!(ls, Expr(:(=), Nlower, Expr(:call, lv(:maybestaticfirst), Nrange)))
-    pushpreamble!(ls, Expr(:(=), Nupper, Expr(:call, lv(:maybestaticlast), Nrange)))
     pushpreamble!(
       ls,
-      Expr(:(=), Nlen, Expr(:call, GlobalRef(ArrayInterface, :static_length), Nrange)),
+      Expr(:(=), Nlower, Expr(:call, lv(:maybestaticfirst), Nrange))
+    )
+    pushpreamble!(
+      ls,
+      Expr(:(=), Nupper, Expr(:call, lv(:maybestaticlast), Nrange))
+    )
+    pushpreamble!(
+      ls,
+      Expr(
+        :(=),
+        Nlen,
+        Expr(:call, GlobalRef(ArrayInterface, :static_length), Nrange)
+      )
     )
   end
 end
@@ -532,24 +621,30 @@ function vmaterialize_fun(
   Mod,
   UNROLL,
   dontbc,
-  transpose::Bool,
+  transpose::Bool
 ) where {BC}
   # 2 + 1
   # we have an N dimensional loop.
   # need to construct the LoopSet
   ls = LoopSet(Mod)
-  inline, u₁, u₂, v, isbroadcast, _, rs, rc, cls, threads, warncheckarg, safe = UNROLL
+  inline, u₁, u₂, v, isbroadcast, _, rs, rc, cls, threads, warncheckarg, safe =
+    UNROLL
   set_hw!(ls, rs, rc, cls)
   ls.isbroadcast = isbroadcast # maybe set `false` in a DiffEq-like `@..` macro
   loopsyms = [gensym!(ls, "n") for _ ∈ 1:N]
-  transpose && pushprepreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′)))
+  transpose &&
+    pushprepreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′)))
   ret = transpose ? :dest′ : :dest
   add_broadcast_loops!(ls, loopsyms, ret)
   elementbytes = sizeofT
   add_broadcast!(ls, :destination, :bc, loopsyms, BC, dontbc, elementbytes)
   transpose && reverse!(loopsyms)
-  storeop =
-    add_simple_store!(ls, :destination, ArrayReference(:dest, loopsyms), elementbytes)
+  storeop = add_simple_store!(
+    ls,
+    :destination,
+    ArrayReference(:dest, loopsyms),
+    elementbytes
+  )
   doaddref!(ls, storeop)
   resize!(ls.loop_order, num_loops(ls)) # num_loops may be greater than N, eg Product
   # return ls
@@ -564,7 +659,7 @@ function vmaterialize_fun(
     v,
     threads % Int,
     warncheckarg,
-    safe,
+    safe
   )
   Expr(:block, Expr(:meta, :inline), sc, ret)
 end
@@ -576,7 +671,7 @@ end
   bc::BC,
   ::Val{Mod},
   ::Val{UNROLL},
-  ::Val{dontbc},
+  ::Val{dontbc}
 ) where {T<:NativeTypes,N,BC<:Union{Broadcasted,Product},Mod,UNROLL,dontbc}
   vmaterialize_fun(sizeof(T), N, BC, Mod, UNROLL, dontbc, false)
 end
@@ -585,7 +680,7 @@ end
   bc::BC,
   ::Val{Mod},
   ::Val{UNROLL},
-  ::Val{dontbc},
+  ::Val{dontbc}
 ) where {
   T<:NativeTypes,
   N,
@@ -593,25 +688,31 @@ end
   BC<:Union{Broadcasted,Product},
   Mod,
   UNROLL,
-  dontbc,
+  dontbc
 }
   vmaterialize_fun(sizeof(T), N, BC, Mod, UNROLL, dontbc, true)
 end
 # these are marked `@inline` so the `@turbo` itself can choose whether or not to inline.
 @generated function vmaterialize!(
   dest::AbstractArray{T,N},
-  bc::Broadcasted{Base.Broadcast.DefaultArrayStyle{0},Nothing,typeof(identity),Tuple{T2}},
+  bc::Broadcasted{
+    Base.Broadcast.DefaultArrayStyle{0},
+    Nothing,
+    typeof(identity),
+    Tuple{T2}
+  },
   ::Val{Mod},
   ::Val{UNROLL},
-  ::Val{dontbc},
+  ::Val{dontbc}
 ) where {T<:NativeTypes,N,T2<:Number,Mod,UNROLL,dontbc}
-  inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, threads, warncheckarg, safe = UNROLL
+  inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, threads, warncheckarg, safe =
+    UNROLL
   quote
     $(Expr(:meta, :inline))
     arg = T(first(bc.args))
     @turbo inline = $inline unroll = ($u₁, $u₂) thread = $threads vectorize = $v for i ∈
                                                                                      eachindex(
-      dest,
+      dest
     )
       dest[i] = arg
     end
@@ -620,37 +721,52 @@ end
 end
 @generated function vmaterialize!(
   dest′::Union{Adjoint{T,A},Transpose{T,A}},
-  bc::Broadcasted{Base.Broadcast.DefaultArrayStyle{0},Nothing,typeof(identity),Tuple{T2}},
+  bc::Broadcasted{
+    Base.Broadcast.DefaultArrayStyle{0},
+    Nothing,
+    typeof(identity),
+    Tuple{T2}
+  },
   ::Val{Mod},
   ::Val{UNROLL},
-  ::Val{dontbc},
+  ::Val{dontbc}
 ) where {T<:NativeTypes,N,A<:AbstractArray{T,N},T2<:Number,Mod,UNROLL,dontbc}
-  inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, threads, warncheckarg, safe = UNROLL
+  inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, threads, warncheckarg, safe =
+    UNROLL
   quote
     $(Expr(:meta, :inline))
     arg = T(first(bc.args))
     dest = parent(dest′)
     @turbo inline = $inline unroll = ($u₁, $u₂) thread = $threads vectorize = $v for i ∈
                                                                                      eachindex(
-      dest,
+      dest
     )
       dest[i] = arg
     end
     dest′
   end
 end
-@inline function vmaterialize!(dest, bc, ::Val{Mod}, ::Val{UNROLL}) where {Mod,UNROLL}
+@inline function vmaterialize!(
+  dest,
+  bc,
+  ::Val{Mod},
+  ::Val{UNROLL}
+) where {Mod,UNROLL}
   vmaterialize!(dest, bc, Val{Mod}(), Val{UNROLL}(), Val(_dontbc(bc)))
 end
 
 @inline function vmaterialize(
   bc::Broadcasted{Base.Broadcast.DefaultArrayStyle{0}},
   ::Val{Mod},
-  ::Val{UNROLL},
+  ::Val{UNROLL}
 ) where {Mod,UNROLL}
   Base.materialize(bc)
 end
-@inline function vmaterialize(bc::Broadcasted, ::Val{Mod}, ::Val{UNROLL}) where {Mod,UNROLL}
+@inline function vmaterialize(
+  bc::Broadcasted,
+  ::Val{Mod},
+  ::Val{UNROLL}
+) where {Mod,UNROLL}
   ElType = Base.Broadcast.combine_eltypes(bc.f, bc.args)
   dest = similar(bc, ElType)
   vmaterialize!(dest, bc, Val{Mod}(), Val{UNROLL}(), Val(_dontbc(bc)))
diff --git a/src/codegen/line_number_nodes.jl b/src/codegen/line_number_nodes.jl
index 81d9533f8..9d9485546 100644
--- a/src/codegen/line_number_nodes.jl
+++ b/src/codegen/line_number_nodes.jl
@@ -3,7 +3,10 @@ function extract_all_lnns(x)
   return extract_all_lnns!(lnns, x)
 end
 
-function extract_all_lnns!(lnns::AbstractVector{<:LineNumberNode}, lnn::LineNumberNode)
+function extract_all_lnns!(
+  lnns::AbstractVector{<:LineNumberNode},
+  lnn::LineNumberNode
+)
   push!(lnns, lnn)
   return lnns
 end
@@ -13,14 +16,16 @@ function extract_all_lnns!(lnns::AbstractVector{<:LineNumberNode}, ex::Expr)
   end
   return lnns
 end
-function extract_all_lnns!(lnns::AbstractVector{<:LineNumberNode}, ::Any)
-  return lnns
-end
+extract_all_lnns!(lnns::AbstractVector{<:LineNumberNode}, ::Any) = lnns
 
 function prepend_lnns!(ex::Expr, lnns::AbstractVector{<:LineNumberNode})
   return prepend_lnns!(ex, lnns, Val(ex.head))
 end
-function prepend_lnns!(ex::Expr, lnns::AbstractVector{<:LineNumberNode}, ::Val{:block})
+function prepend_lnns!(
+  ex::Expr,
+  lnns::AbstractVector{<:LineNumberNode},
+  ::Val{:block}
+)
   for lnn in lnns
     pushfirst!(ex.args, Expr(:block, lnn, :(nothing)))
   end
diff --git a/src/codegen/loopstartstopmanager.jl b/src/codegen/loopstartstopmanager.jl
index 7b36d035a..c41de4d4b 100644
--- a/src/codegen/loopstartstopmanager.jl
+++ b/src/codegen/loopstartstopmanager.jl
@@ -22,7 +22,10 @@ function uniquearrayrefs_csesummary(ls::LoopSet)
       if unique # matching name, no matching ref
         push!(uniquerefs, arrayref)
         push!(namev, length(uniquerefs))
-        push!(unique_to_name_and_op_map, Tuple{Int,Int,Int}[(j, length(namev), i)])
+        push!(
+          unique_to_name_and_op_map,
+          Tuple{Int,Int,Int}[(j, length(namev), i)]
+        )
         unique = false
       end
       break
@@ -32,7 +35,7 @@ function uniquearrayrefs_csesummary(ls::LoopSet)
       push!(name_to_array_map, Int[length(uniquerefs)])
       push!(
         unique_to_name_and_op_map,
-        Tuple{Int,Int,Int}[(length(name_to_array_map), 1, i)],
+        Tuple{Int,Int,Int}[(length(name_to_array_map), 1, i)]
       )
     end
   end
@@ -65,12 +68,15 @@ function uniquearrayrefs(ls::LoopSet)
 end
 
 otherindexunrolled(loopsym::Symbol, ind::Symbol, loopdeps::Vector{Symbol}) =
-  ((loopsym ≢ ind) & (loopsym ≢ Symbol("##undefined##"))) && (loopsym ∈ loopdeps)
+  ((loopsym ≢ ind) & (loopsym ≢ Symbol("##undefined##"))) &&
+  (loopsym ∈ loopdeps)
 function otherindexunrolled(ls::LoopSet, ind::Symbol, ref::ArrayReferenceMeta)
   us = ls.unrollspecification
   @unpack u₁loopnum, u₂loopnum, u₁, u₂ = us
   u₁sym = u₁ > 1 ? names(ls)[u₁loopnum] : Symbol("##undefined##")
-  u₂sym = ((u₂ > 1) & (u₂loopnum > 0)) ? names(ls)[u₂loopnum] : Symbol("##undefined##")
+  u₂sym =
+    ((u₂ > 1) & (u₂loopnum > 0)) ? names(ls)[u₂loopnum] :
+    Symbol("##undefined##")
   # u₁sym = names(ls)[u₁loopnum]
   # u₂sym = ((u₂loopnum > 0)) ? names(ls)[u₂loopnum] : Symbol("##undefined##")
   otherindexunrolled(u₁sym, ind, loopdependencies(ref)) ||
@@ -87,7 +93,10 @@ function multiple_with_name(n::Symbol, v::Vector{ArrayReferenceMeta})
 end
 # multiple_with_name(n::Symbol, v::Vector{ArrayReferenceMeta}) = sum(ref -> n === vptr(ref), v) > 1
 # TODO: DRY between indices_calculated_by_pointer_offsets and use_loop_induct_var
-function indices_calculated_by_pointer_offsets(ls::LoopSet, ar::ArrayReferenceMeta)
+function indices_calculated_by_pointer_offsets(
+  ls::LoopSet,
+  ar::ArrayReferenceMeta
+)
   indices = getindices(ar)
   ls.isbroadcast && return fill(false, length(indices))
   looporder = names(ls)
@@ -156,7 +165,7 @@ function set_ref_loopedindex_and_ind!(
   i::Int,
   ii::Int,
   li::Bool,
-  ind::Symbol,
+  ind::Symbol
 )
   ref.loopedindex[i] = li
   getindices(ref)[ii] = ind
@@ -168,12 +177,18 @@ function set_all_to_constant_index!(
   indop::Operation,
   allarrayrefs::Vector{ArrayReferenceMeta},
   array_refs_with_same_name::Vector{Int},
-  arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}},
+  arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}
 )
   ops = operations(ls)
   for j ∈ array_refs_with_same_name
     arrayref_to_name_op = arrayref_to_name_op_collection[j]
-    set_ref_loopedindex_and_ind!(allarrayrefs[j], i, ii, true, CONSTANTZEROINDEX)
+    set_ref_loopedindex_and_ind!(
+      allarrayrefs[j],
+      i,
+      ii,
+      true,
+      CONSTANTZEROINDEX
+    )
     for (_, __, opid) ∈ arrayref_to_name_op
       op = ops[opid]
       set_ref_loopedindex_and_ind!(op.ref, i, ii, true, CONSTANTZEROINDEX)
@@ -188,7 +203,8 @@ function set_all_to_constant_index!(
     end
   end
 end
-maybeloopvaluename(op::Operation) = isloopvalue(op) ? instruction(op).instr : name(op)
+maybeloopvaluename(op::Operation) =
+  isloopvalue(op) ? instruction(op).instr : name(op)
 function substitute_ops_all!(
   ls::LoopSet,
   i::Int,
@@ -197,7 +213,7 @@ function substitute_ops_all!(
   new_parent::Operation,
   allarrayrefs::Vector{ArrayReferenceMeta},
   array_refs_with_same_name::Vector{Int},
-  arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}},
+  arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}
 )
   newindsym = maybeloopvaluename(new_parent)
   isloopval = isloopvalue(new_parent)
@@ -303,7 +319,7 @@ end
 function isloopvalue(
   ls::LoopSet,
   ind::Symbol,
-  isrooted::Union{Nothing,Vector{Bool}} = nothing,
+  isrooted::Union{Nothing,Vector{Bool}} = nothing
 )
   for (i, op) ∈ enumerate(operations(ls))
     if (isrooted ≢ nothing)
@@ -324,12 +340,13 @@ function cse_constant_offsets!(
   allarrayrefs::Vector{ArrayReferenceMeta},
   allarrayrefsind::Int,
   name_to_array_map::Vector{Vector{Int}},
-  arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}},
+  arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}
 )
   ar = allarrayrefs[allarrayrefsind]
   # vptrar = vptr(ar)
   arrayref_to_name_op = arrayref_to_name_op_collection[allarrayrefsind]
-  array_refs_with_same_name = name_to_array_map[first(first(arrayref_to_name_op))]
+  array_refs_with_same_name =
+    name_to_array_map[first(first(arrayref_to_name_op))]
   li = ar.loopedindex
   indices = getindices(ar)
   offset = first(indices) === DISCONTIGUOUS
@@ -350,7 +367,9 @@ function cse_constant_offsets!(
           ref = allarrayrefs[j]
           refinds = getindices(ref)
           # refinds === indices && continue # fast check, should be covered by `j == position_in_array_refs_with_same_name`
-          if !((refinds[ii] === ind) & (getstrides(ar)[i] == getstrides(ref)[i]))
+          if !(
+            (refinds[ii] === ind) & (getstrides(ar)[i] == getstrides(ref)[i])
+          )
             # For now, we'll only bother with `licm` if all share the same indices
             # This is so that we can apply the same `licm` to each and keep the same array name.
             # Otherwise, we'll rely on LLVM to optimize indexing.
@@ -415,7 +434,7 @@ function cse_constant_offsets!(
                 indop,
                 allarrayrefs,
                 array_refs_with_same_name,
-                arrayref_to_name_op_collection,
+                arrayref_to_name_op_collection
               )
             else # new_parent is a new parent to replace `indop`
               ind = maybeloopvaluename(new_parent)
@@ -427,7 +446,7 @@ function cse_constant_offsets!(
                 new_parent,
                 allarrayrefs,
                 array_refs_with_same_name,
-                arrayref_to_name_op_collection,
+                arrayref_to_name_op_collection
               )
             end
           end
@@ -445,7 +464,7 @@ function cse_constant_offsets!(
                   :call,
                   GlobalRef(Base, :(-)),
                   Expr(:call, GlobalRef(Base, :(+)), gespsymbol, name(op1)),
-                  name(op2),
+                  name(op2)
                 )
               end
               gespsymbol = gensym!(ls, "#gespsym#")
@@ -458,7 +477,7 @@ function cse_constant_offsets!(
                 indop,
                 allarrayrefs,
                 array_refs_with_same_name,
-                arrayref_to_name_op_collection,
+                arrayref_to_name_op_collection
               )
             else# op1const, op2dynamic
               # won't bother with this for now
@@ -481,7 +500,7 @@ function cse_constant_offsets!(
               op1,
               allarrayrefs,
               array_refs_with_same_name,
-              arrayref_to_name_op_collection,
+              arrayref_to_name_op_collection
             )
           else
             licmoffset = false
@@ -516,12 +535,11 @@ end
 #   return nothing
 # end
 
-
 function adjust_offsets!(
   ls::LoopSet,
   i::Int,
   array_refs_with_same_name::Vector{Int},
-  arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}},
+  arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}
 )
   ops = operations(ls)
   if length(ops) ≤ 256
@@ -532,7 +550,7 @@ function adjust_offsets!(
       i,
       poffsets,
       array_refs_with_same_name,
-      arrayref_to_name_op_collection,
+      arrayref_to_name_op_collection
     )
   else
     offsetsv = similar(ops, Int8)
@@ -542,7 +560,7 @@ function adjust_offsets!(
       i,
       poffsets,
       array_refs_with_same_name,
-      arrayref_to_name_op_collection,
+      arrayref_to_name_op_collection
     )
   end
 end
@@ -551,7 +569,7 @@ function adjust_offsets!(
   i::Int,
   poffsets::Ptr{Int8},
   array_refs_with_same_name::Vector{Int},
-  arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}},
+  arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}
 )
   ops = operations(ls)
   minoffset = typemax(Int8)
@@ -595,7 +613,7 @@ function calcgespinds(
   gespindsummary::Vector{Symbol},
   shouldindbyind::Vector{Bool},
   array_refs_with_same_name::Vector{Int},
-  arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}},
+  arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}
 )
   gespinds = Expr(:tuple)
   li = ar.loopedindex
@@ -614,8 +632,12 @@ function calcgespinds(
     #   end
     # end
     # constoffset ≠ 0 &&
-    constoffset =
-      adjust_offsets!(ls, i, array_refs_with_same_name, arrayref_to_name_op_collection)
+    constoffset = adjust_offsets!(
+      ls,
+      i,
+      array_refs_with_same_name,
+      arrayref_to_name_op_collection
+    )
     index_by_index = isli ? check_shouldindbyind(ls, ind, shouldindbyind) : true
     # (stridesunequal & isli) && (@assert isknown(first(getloop(ls, ind))))
 
@@ -642,7 +664,7 @@ function calcgespinds(
       ind,
       isli,
       index_by_index,
-      true,
+      true
     )
   end
   gespinds
@@ -657,7 +679,7 @@ function pushgespind!(
   ind::Symbol,
   isli::Bool,
   index_by_index::Bool,
-  fromgsp::Bool,
+  fromgsp::Bool
 )
   if isli
     if ind === CONSTANTZEROINDEX
@@ -668,7 +690,12 @@ function pushgespind!(
       else
         push!(
           gespinds.args,
-          Expr(:call, GlobalRef(Base, :(+)), gespsymbol, staticexpr(constoffset)),
+          Expr(
+            :call,
+            GlobalRef(Base, :(+)),
+            gespsymbol,
+            staticexpr(constoffset)
+          )
         )
       end
     else
@@ -701,7 +728,10 @@ function pushgespind!(
         if gespsymbol === Symbol("")
           if isknown(first(loop))
             # @show constoffset, gethint(first(loop))
-            push!(gespinds.args, staticexpr(constoffset + stride * gethint(first(loop))))
+            push!(
+              gespinds.args,
+              staticexpr(constoffset + stride * gethint(first(loop)))
+            )
           elseif constoffset == 0
             if stride == 1
               push!(gespinds.args, getsym(first(loop)))
@@ -711,7 +741,10 @@ function pushgespind!(
           elseif stride == 1
             push!(gespinds.args, addexpr(getsym(first(loop)), constoffset))
           else
-            push!(gespinds.args, addexpr(mulexpr(getsym(first(loop)), stride), constoffset))
+            push!(
+              gespinds.args,
+              addexpr(mulexpr(getsym(first(loop)), stride), constoffset)
+            )
           end
         elseif isknown(first(loop))
           loopfirst = gethint(first(loop)) * stride + constoffset
@@ -720,7 +753,12 @@ function pushgespind!(
           else
             push!(
               gespinds.args,
-              Expr(:call, GlobalRef(Base, :(+)), gespsymbol, staticexpr(loopfirst)),
+              Expr(
+                :call,
+                GlobalRef(Base, :(+)),
+                gespsymbol,
+                staticexpr(loopfirst)
+              )
             )
           end
         else
@@ -731,7 +769,7 @@ function pushgespind!(
               :call,
               GlobalRef(Base, :(+)),
               mulexpr(stride, gespsymbol),
-              getsym(first(loop)),
+              getsym(first(loop))
             )
           end
           if constoffset == 0
@@ -739,7 +777,12 @@ function pushgespind!(
           else
             push!(
               gespinds.args,
-              Expr(:call, GlobalRef(Base, :(+)), addedstarts, staticexpr(constoffset)),
+              Expr(
+                :call,
+                GlobalRef(Base, :(+)),
+                addedstarts,
+                staticexpr(constoffset)
+              )
             )
           end
         end
@@ -758,8 +801,8 @@ function pushgespind!(
         else
           isconstantop(op) || throw(
             LoopError(
-              "Please file an issue with LoopVectorization.jl with a reproducer; tried to eliminate a non-constant operation.",
-            ),
+              "Please file an issue with LoopVectorization.jl with a reproducer; tried to eliminate a non-constant operation."
+            )
           )
           rangesym = name(op)
         end
@@ -791,7 +834,7 @@ function pushsimdims!(
   gespinds::Expr,
   rangesym::Symbol,
   gespsymbol::Symbol,
-  constoffset::Int,
+  constoffset::Int
 )
   simdimscall = Expr(:call, lv(:similardims), rangesym)
   pushgespsym!(simdimscall, gespsymbol, constoffset)
@@ -811,7 +854,7 @@ function use_loop_induct_var!(
   q::Expr,
   ar::ArrayReferenceMeta,
   allarrayrefs::Vector{ArrayReferenceMeta},
-  includeinlet::Bool,
+  includeinlet::Bool
   # array_refs_with_same_name::Vector{Int}, arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}
 )::Vector{Int}
   us = ls.unrollspecification
@@ -846,28 +889,33 @@ function use_loop_induct_var!(
     if !li[i] # if it wasn't set
       uliv[i] = 0
       push!(offsetprecalc_descript.args, 0)
-      Wisz || pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, true, false)
+      Wisz ||
+        pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, true, false)
     elseif ind === CONSTANTZEROINDEX
       uliv[i] = 0
       push!(offsetprecalc_descript.args, 0)
-      Wisz || pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, true, false)
+      Wisz ||
+        pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, true, false)
     elseif isbroadcast ||
            (
              (isone(ii) && (last(looporder) === ind)) &&
-             !(otherindexunrolled(ls, ind, ar)) || multiple_with_name(vptrar, allarrayrefs)
+             !(otherindexunrolled(ls, ind, ar)) ||
+             multiple_with_name(vptrar, allarrayrefs)
            ) ||
            (iszero(ls.vector_width) && isstaticloop(getloop(ls, ind))) ||
            (strds[i] ≤ 0)
       # Not doing normal offset indexing
       uliv[i] = -findfirst(Base.Fix2(===, ind), looporder)::Int
       push!(offsetprecalc_descript.args, 0) # not doing offset indexing, so push 0
-      Wisz || pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, true, false)
+      Wisz ||
+        pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, true, false)
     else
       uliv[i] = findfirst(Base.Fix2(===, ind), looporder)::Int
       # loop = getloop(ls, ind)
       push!(offsetprecalc_descript.args, max(5, us.u₁ + 1, us.u₂ + 1))
       use_offsetprecalc = true
-      Wisz || pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, false, false)
+      Wisz ||
+        pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, false, false)
     end
     # cases for pushgespind! and loopval!
     # if !isloopval, same as before
@@ -881,7 +929,7 @@ function use_loop_induct_var!(
         :call,
         lv(:offsetprecalc),
         vpgesped,
-        Expr(:call, Expr(:curly, :Val, offsetprecalc_descript)),
+        Expr(:call, Expr(:curly, :Val, offsetprecalc_descript))
       )
     end
     push!(q.args, Expr(:(=), vptrar, vpgesped))
@@ -890,8 +938,8 @@ function use_loop_induct_var!(
       Expr(
         :(=),
         vptr_offset(vptrar),
-        Expr(:call, GlobalRef(VectorizationBase, :increment_ptr), vptrar),
-      ),
+        Expr(:call, GlobalRef(VectorizationBase, :increment_ptr), vptrar)
+      )
     )
   end
   uliv
@@ -912,7 +960,8 @@ function add_loop_start_stop_manager!(ls::LoopSet)
   use_livs = Vector{Vector{Int}}(undef, length(arrayrefs))
   # for i ∈ eachindex(name_to_array_map)
   for i ∈ eachindex(arrayrefs)
-    use_livs[i] = use_loop_induct_var!(ls, q, arrayrefs[i], arrayrefs, includeinlet[i])
+    use_livs[i] =
+      use_loop_induct_var!(ls, q, arrayrefs[i], arrayrefs, includeinlet[i])
     #name_to_array_map[first(first(unique_to_name_and_op_map[i]))], unique_to_name_and_op_map)
   end
   # loops, sorted from outer-most to inner-most
@@ -958,7 +1007,7 @@ function pointermax(
   ar::ArrayReferenceMeta,
   n::Int,
   sub::Int,
-  isvectorized::Bool,
+  isvectorized::Bool
 )::Expr
   pointermax(ls, ar, n, sub, isvectorized, getloop(ls, names(ls)[n]))
 end
@@ -968,7 +1017,7 @@ function pointermax(
   n::Int,
   sub::Int,
   isvectorized::Bool,
-  loop::Loop,
+  loop::Loop
 )::Expr
   start = first(loop)
   stop = last(loop)
@@ -981,7 +1030,7 @@ function pointermax(
       sub,
       isvectorized,
       1 + gethint(stop) - gethint(start),
-      incr,
+      incr
     )
   end
   looplensym = isone(start) ? getsym(stop) : loop.lensym
@@ -994,7 +1043,7 @@ function pointermax_index(
   sub::Int,
   isvectorized::Bool,
   stophint::Int,
-  incr::MaybeKnown,
+  incr::MaybeKnown
 )::Tuple{Expr,Int}
   # @unpack u₁loopnum, u₂loopnum, vloopnum, u₁, u₂ = us
   loopsym = names(ls)[n]
@@ -1010,7 +1059,12 @@ function pointermax_index(
           if isone(sub)
             Expr(:call, lv(:vsub_nsw), staticexpr(stophint), VECTORWIDTHSYMBOL)
           else
-            Expr(:call, lv(:vsub_nsw), staticexpr(stophint), mulexpr(VECTORWIDTHSYMBOL, sub))
+            Expr(
+              :call,
+              lv(:vsub_nsw),
+              staticexpr(stophint),
+              mulexpr(VECTORWIDTHSYMBOL, sub)
+            )
           end
         else
           staticexpr(stophint - sub)
@@ -1041,7 +1095,7 @@ function pointermax_index(
   sub::Int,
   isvectorized::Bool,
   stopsym,
-  incr::MaybeKnown,
+  incr::MaybeKnown
 )::Tuple{Expr,Int}
   loopsym = names(ls)[n]
   index = Expr(:tuple)
@@ -1087,7 +1141,7 @@ function pointermax(
   sub::Int,
   isvectorized::Bool,
   stopsym,
-  incr::MaybeKnown,
+  incr::MaybeKnown
 )::Expr
   index = first(pointermax_index(ls, ar, n, sub, isvectorized, stopsym, incr))
   vptrar = vptr(ar)
@@ -1096,7 +1150,7 @@ function pointermax(
     GlobalRef(VectorizationBase, :increment_ptr),
     vptrar,
     vptr_offset(vptrar),
-    index,
+    index
   )
 end
 
@@ -1105,11 +1159,17 @@ function defpointermax(
   ar::ArrayReferenceMeta,
   n::Int,
   sub::Int,
-  isvectorized::Bool,
+  isvectorized::Bool
 )::Expr
   Expr(:(=), maxsym(vptr(ar), sub), pointermax(ls, ar, n, sub, isvectorized))
 end
-function offsetindex(dim::Int, ind::Int, scale::Int, isvectorized::Bool, incr::MaybeKnown)
+function offsetindex(
+  dim::Int,
+  ind::Int,
+  scale::Int,
+  isvectorized::Bool,
+  incr::MaybeKnown
+)
   index = Expr(:tuple)
   for d ∈ 1:dim
     if d ≠ ind || iszero(scale)
@@ -1134,7 +1194,7 @@ function append_pointer_maxes!(
   submax::Int,
   isvectorized::Bool,
   stopindicator,
-  incr::MaybeKnown,
+  incr::MaybeKnown
 )
   vptr_ar = vptr(ar)
   if submax < 2
@@ -1144,17 +1204,22 @@ function append_pointer_maxes!(
         Expr(
           :(=),
           maxsym(vptr_ar, sub),
-          pointermax(ls, ar, n, sub, isvectorized, stopindicator, incr),
-        ),
+          pointermax(ls, ar, n, sub, isvectorized, stopindicator, incr)
+        )
       )
     end
   else
-    index, ind = pointermax_index(ls, ar, n, submax, isvectorized, stopindicator, incr)
+    index, ind =
+      pointermax_index(ls, ar, n, submax, isvectorized, stopindicator, incr)
     pointercompbase = maxsym(vptr_ar, submax)
     ip = GlobalRef(VectorizationBase, :increment_ptr)
     push!(
       loopstart.args,
-      Expr(:(=), pointercompbase, Expr(:call, ip, vptr_ar, vptr_offset(vptr_ar), index)),
+      Expr(
+        :(=),
+        pointercompbase,
+        Expr(:call, ip, vptr_ar, vptr_offset(vptr_ar), index)
+      )
     )
     dim = length(getindicesonly(ar))
     # OFFSETPRECALCDEF = true
@@ -1166,7 +1231,7 @@ function append_pointer_maxes!(
         ip,
         vptr_ar,
         pointercompbase,
-        offsetindex(dim, ind, (submax - sub) * strd, isvectorized, incr),
+        offsetindex(dim, ind, (submax - sub) * strd, isvectorized, incr)
       )
       push!(loopstart.args, Expr(:(=), maxsym(vptr_ar, sub), ptrcmp))
     end
@@ -1178,7 +1243,7 @@ function append_pointer_maxes!(
   ar::ArrayReferenceMeta,
   n::Int,
   submax::Int,
-  isvectorized::Bool,
+  isvectorized::Bool
 )
   loop = getloop(ls, n)
   @assert loop.itersymbol == names(ls)[n]
@@ -1194,11 +1259,20 @@ function append_pointer_maxes!(
       submax,
       isvectorized,
       startstopΔ(loop) + 1,
-      incr,
+      incr
     )
   end
   looplensym = isone(start) ? getsym(stop) : loop.lensym
-  append_pointer_maxes!(loopstart, ls, ar, n, submax, isvectorized, looplensym, incr)
+  append_pointer_maxes!(
+    loopstart,
+    ls,
+    ar,
+    n,
+    submax,
+    isvectorized,
+    looplensym,
+    incr
+  )
 end
 
 function maxunroll(us::UnrollSpecification, n)
@@ -1212,8 +1286,12 @@ function maxunroll(us::UnrollSpecification, n)
   end
 end
 
-
-function startloop(ls::LoopSet, us::UnrollSpecification, n::Int, staticinit::Bool = false)
+function startloop(
+  ls::LoopSet,
+  us::UnrollSpecification,
+  n::Int,
+  staticinit::Bool = false
+)
   @unpack u₁loopnum, u₂loopnum, vloopnum, u₁, u₂ = us
   lssm = ls.lssm
   termind = lssm.terminators[n]
@@ -1229,7 +1307,14 @@ function startloop(ls::LoopSet, us::UnrollSpecification, n::Int, staticinit::Boo
   else
     isvectorized = n == vloopnum
     submax = maxunroll(us, n)
-    append_pointer_maxes!(loopstart, ls, ptrdefs[termind], n, submax, isvectorized)
+    append_pointer_maxes!(
+      loopstart,
+      ls,
+      ptrdefs[termind],
+      n,
+      submax,
+      isvectorized
+    )
   end
   loopstart
 end
@@ -1240,7 +1325,7 @@ function offset_ptr(
   n::Int,
   UF::Int,
   offsetinds::Vector{Bool},
-  loop::Loop,
+  loop::Loop
 )
   indices = getindices(ar)
   strides = getstrides(ar)
@@ -1258,8 +1343,13 @@ function offset_ptr(
     # ind == loopsym && break
   end
   vpoff = vptr_offset(ar)
-  call =
-    Expr(:call, GlobalRef(VectorizationBase, :increment_ptr), vptr(ar), vpoff, gespinds)
+  call = Expr(
+    :call,
+    GlobalRef(VectorizationBase, :increment_ptr),
+    vptr(ar),
+    vpoff,
+    gespinds
+  )
   Expr(:(=), vpoff, call)
 end
 function incrementloopcounter!(
@@ -1267,7 +1357,7 @@ function incrementloopcounter!(
   ls::LoopSet,
   us::UnrollSpecification,
   n::Int,
-  UF::Int,
+  UF::Int
 )
   @unpack u₁loopnum, u₂loopnum, vloopnum, u₁, u₂ = us
   lssm = ls.lssm
@@ -1290,7 +1380,7 @@ function terminatecondition(
   us::UnrollSpecification,
   n::Int,
   inclmask::Bool,
-  UF::Int,
+  UF::Int
 )
   lssm = ls.lssm
   termind = lssm.terminators[n]
diff --git a/src/codegen/lower_compute.jl b/src/codegen/lower_compute.jl
index 7c1cb7f2b..86a3c062b 100644
--- a/src/codegen/lower_compute.jl
+++ b/src/codegen/lower_compute.jl
@@ -1,11 +1,10 @@
 
-
 function load_constrained(
   op::Operation,
   u₁loop::Symbol,
   u₂loop::Symbol,
   innermost_loop_or_vloop::Symbol,
-  forprefetch::Bool = false,
+  forprefetch::Bool = false
 )
   dependsonu₁ = isu₁unrolled(op)
   dependsonu₂ = isu₂unrolled(op)
@@ -42,10 +41,17 @@ function check_if_remfirst(ls::LoopSet, ua::UnrollArgs)
 end
 function sub_fmas(ls::LoopSet, op::Operation, ua::UnrollArgs)
   @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, u₂max = ua
-  !(load_constrained(op, u₁loopsym, u₂loopsym, vloopsym) || check_if_remfirst(ls, ua))
+  !(
+    load_constrained(op, u₁loopsym, u₂loopsym, vloopsym) ||
+    check_if_remfirst(ls, ua)
+  )
 end
 
-function parent_unroll_status(op::Operation, u₁loop::Symbol, us::UnrollSpecification)
+function parent_unroll_status(
+  op::Operation,
+  u₁loop::Symbol,
+  us::UnrollSpecification
+)
   parentsop = parents(op)
   u2 = fill(false, length(parentsop))
   u1 = similar(u2)
@@ -60,7 +66,7 @@ function parent_unroll_status(
   u₂loop::Symbol,
   vloop::Symbol,
   u₂max::Int,
-  us::UnrollSpecification,
+  us::UnrollSpecification
 )
   u₂max == -1 && return parent_unroll_status(op, u₁loop, us)
   vparents = parents(op)
@@ -75,7 +81,13 @@ function parent_unroll_status(
   parents_u₁syms, parents_u₂syms
 end
 
-function _add_loopvalue!(ex::Expr, loopval::Symbol, vloop::Loop, u::Int, loop::Loop)
+function _add_loopvalue!(
+  ex::Expr,
+  loopval::Symbol,
+  vloop::Loop,
+  u::Int,
+  loop::Loop
+)
   vloopsym = vloop.itersymbol
   if loopval === vloopsym
     if iszero(u)
@@ -86,18 +98,30 @@ function _add_loopvalue!(ex::Expr, loopval::Symbol, vloop::Loop, u::Int, loop::L
       if isone(u) & isone(vstep)
         push!(ex.args, Expr(:call, lv(:vadd_nsw), VECTORWIDTHSYMBOL, mm))
       else
-        push!(ex.args, Expr(:call, lv(:vadd_nsw), mulexpr(VECTORWIDTHSYMBOL, u, vstep), mm))
+        push!(
+          ex.args,
+          Expr(:call, lv(:vadd_nsw), mulexpr(VECTORWIDTHSYMBOL, u, vstep), mm)
+        )
       end
     end
   elseif u == 0
     push!(ex.args, loopval)
   elseif isknown(step(loop))
-    push!(ex.args, Expr(:call, lv(:vadd_nsw), loopval, staticexpr(u * gethint(step(loop)))))
+    push!(
+      ex.args,
+      Expr(:call, lv(:vadd_nsw), loopval, staticexpr(u * gethint(step(loop))))
+    )
   else
     push!(ex.args, Expr(:call, lv(:vadd_nsw), loopval, mulexpr(step(loop), u)))
   end
 end
-function add_loopvalue!(instrcall::Expr, loopval, ua::UnrollArgs, u₁::Int, loop::Loop)
+function add_loopvalue!(
+  instrcall::Expr,
+  loopval,
+  ua::UnrollArgs,
+  u₁::Int,
+  loop::Loop
+)
   @unpack u₁loopsym, u₂loopsym, vloopsym, vloop, suffix = ua
   if loopval === u₁loopsym #parentsunrolled[n]
     if isone(u₁)
@@ -120,7 +144,14 @@ end
 
 vecunrolllen(::Type{VecUnroll{N,W,T,V}}) where {N,W,T,V} = (N::Int + 1)
 vecunrolllen(_) = -1
-function ifelselastexpr(hasf::Bool, M::Int, vargtypes, K::Int, S::Int, maskearly::Bool)
+function ifelselastexpr(
+  hasf::Bool,
+  M::Int,
+  vargtypes,
+  K::Int,
+  S::Int,
+  maskearly::Bool
+)
   q = Expr(:block, Expr(:meta, :inline))
   vargs = Vector{Symbol}(undef, K)
   for k ∈ 1:K
@@ -152,8 +183,8 @@ function ifelselastexpr(hasf::Bool, M::Int, vargtypes, K::Int, S::Int, maskearly
     else
       hasf || throw(
         ArgumentError(
-          "Argument reduction only supported for `ifelse(last/partial)(f::Function, args...)`",
-        ),
+          "Argument reduction only supported for `ifelse(last/partial)(f::Function, args...)`"
+        )
       )
       M = maxlen
       t = q
@@ -166,8 +197,8 @@ function ifelselastexpr(hasf::Bool, M::Int, vargtypes, K::Int, S::Int, maskearly
   end
   for m ∈ start:M
     call = if hasf
-      (maskearly | (m == M)) ? Expr(:call, VectorizationBase.vifelse, :f, :m) :
-      Expr(:call, :f)
+      (maskearly | (m == M)) ?
+      Expr(:call, VectorizationBase.vifelse, :f, :m) : Expr(:call, :f)
     else# m == M because !hasf
       Expr(:call, :ifelse, :m)
     end
@@ -202,7 +233,7 @@ end
   m::AbstractMask{W},
   ::StaticInt{M},
   ::StaticInt{S},
-  vargs::Vararg{Any,K},
+  vargs::Vararg{Any,K}
 ) where {F,W,K,M,S}
   ifelselastexpr(true, M, vargs, K, S, false)
 end
@@ -211,7 +242,7 @@ end
   ::StaticInt{M},
   ::StaticInt{S},
   varg_1::V1,
-  varg_2::V2,
+  varg_2::V2
 ) where {W,V1,V2,M,S}
   ifelselastexpr(false, M, (V1, V2), 2, S, false)
 end
@@ -220,7 +251,7 @@ end
   m::AbstractMask{W},
   ::StaticInt{M},
   ::StaticInt{S},
-  vargs::Vararg{Any,K},
+  vargs::Vararg{Any,K}
 ) where {F,W,K,M,S}
   ifelselastexpr(true, M, vargs, K, S, true)
 end
@@ -229,21 +260,25 @@ end
   ::StaticInt{M},
   ::StaticInt{S},
   varg_1::V1,
-  varg_2::V2,
+  varg_2::V2
 ) where {W,V1,V2,M,S}
   ifelselastexpr(false, M, (V1, V2), 2, S, true)
 end
 # @inline ifelselast(f::F, m::AbstractMask{W}, ::StaticInt{M}, ::StaticInt{S}, vargs::Vararg{NativeTypes,K}) where {F,W,K,M,S} = f(vargs...)
 # @inline ifelsepartial(f::F, m::AbstractMask{W}, ::StaticInt{M}, ::StaticInt{S}, vargs::Vararg{NativeTypes,K}) where {F,W,K,M,S} = f(vargs...)
-@generated function subset_vec_unroll(vu::VecUnroll{N}, ::StaticInt{S}) where {N,S}
+@generated function subset_vec_unroll(
+  vu::VecUnroll{N},
+  ::StaticInt{S}
+) where {N,S}
   (1 ≤ S ≤ N + 1) || throw(
     ArgumentError(
-      "`vu` isa `VecUnroll` of `$(N+1)` elements, but trying to subset $S of them.",
-    ),
+      "`vu` isa `VecUnroll` of `$(N+1)` elements, but trying to subset $S of them."
+    )
   )
   t = Expr(:tuple)
   gf = GlobalRef(Core, :getfield)
-  S == 1 && return Expr(:block, Expr(:meta, :inline), :($gf($gf(vu, 1), 1, false)))
+  S == 1 &&
+    return Expr(:block, Expr(:meta, :inline), :($gf($gf(vu, 1), 1, false)))
   for s ∈ 1:S
     push!(t.args, Expr(:call, gf, :vud, s, false))
   end
@@ -259,7 +294,7 @@ end
   default::D,
   ::StaticInt{M},
   ::StaticInt{S},
-  vargs::Vararg{Any,K},
+  vargs::Vararg{Any,K}
 ) where {F,M,K,D,S}
   lengths = Vector{Int}(undef, K)
   q = Expr(:block, Expr(:meta, :inline))
@@ -343,7 +378,7 @@ function parent_op_name!(
   u₂max,
   u₂unrolledsym,
   op,
-  tiledouterreduction,
+  tiledouterreduction
 )
   opp = parents_op[n]
   opisvectorized = isvectorized(op)
@@ -367,14 +402,15 @@ function parent_op_name!(
     if parents_u₂syms[n]
       if isu₂unrolled(op) # u₂unrolledsym ||
         parent =
-          isouterreduct ? Symbol(parent, suffix) : Symbol(parent, suffix, '_', '_', u)
+          isouterreduct ? Symbol(parent, suffix) :
+          Symbol(parent, suffix, '_', '_', u)
       elseif u₂max > 1
         t = Expr(:tuple)
         reduction = Expr(
           :call,
           GlobalRef(ArrayInterface, :reduce_tup),
           reduce_to_onevecunroll(opp),
-          t,
+          t
         )
         for u₂ ∈ 0:u₂max-1
           push!(t.args, Symbol(parent, u₂, '_', '_', u))
@@ -437,8 +473,16 @@ function getu₁forreduct(ls::LoopSet, op::Operation, u₁::Int)
   end
 end
 isidentityop(op::Operation) =
-  iscompute(op) && (instruction(op).instr === :identity) && (length(parents(op)) == 1)
-function reduce_parent!(q::Expr, ls::LoopSet, op::Operation, opp::Operation, parent::Symbol)
+  iscompute(op) &&
+  (instruction(op).instr === :identity) &&
+  (length(parents(op)) == 1)
+function reduce_parent!(
+  q::Expr,
+  ls::LoopSet,
+  op::Operation,
+  opp::Operation,
+  parent::Symbol
+)
   isvectorized(op) && return parent
   if isvectorized(opp)
     oppt = opp
@@ -449,7 +493,8 @@ function reduce_parent!(q::Expr, ls::LoopSet, op::Operation, opp::Operation, par
     return parent
   end
   reduct_class = reduction_instruction_class(oppt.instruction)
-  if (instruction(op).instr === :mul_fast) & (reduct_class == ADDITIVE_IN_REDUCTIONS)
+  if (instruction(op).instr === :mul_fast) &
+     (reduct_class == ADDITIVE_IN_REDUCTIONS)
     op.vectorized = true
     return parent
   end
@@ -457,7 +502,11 @@ function reduce_parent!(q::Expr, ls::LoopSet, op::Operation, opp::Operation, par
   if instruction(op).instr ≢ :ifelse
     push!(
       q.args,
-      Expr(:(=), newp, Expr(:call, lv(reduction_to_scalar(reduct_class)), parent)),
+      Expr(
+        :(=),
+        newp,
+        Expr(:call, lv(reduction_to_scalar(reduct_class)), parent)
+      )
     )#IfElseReducer
   else
     reductexpr = ifelse_reduction(:IfElseReducer, op) do opv
@@ -467,7 +516,13 @@ function reduce_parent!(q::Expr, ls::LoopSet, op::Operation, opp::Operation, par
   end
   newp
 end
-function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mask::Bool)
+function lower_compute!(
+  q::Expr,
+  op::Operation,
+  ls::LoopSet,
+  ua::UnrollArgs,
+  mask::Bool
+)
   @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, u₂max, suffix = ua
   var = name(op)
   instr = instruction(op)
@@ -506,7 +561,7 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas
         parentop.reduced_deps,
         parentop.parents,
         parentop.ref,
-        parentop.reduced_children,
+        parentop.reduced_children
       )
       newparentop.vectorized = false
       newparentop.u₁unrolled = false
@@ -519,14 +574,21 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas
         newparentname = Symbol(newparentname, suffix_)
       end
       if isconstant(newparentop)
-        push!(q.args, Expr(:(=), Symbol(newparentname, '_', 1), Symbol(parentname, '_', 1)))
+        push!(
+          q.args,
+          Expr(:(=), Symbol(newparentname, '_', 1), Symbol(parentname, '_', 1))
+        )
       else
         newpname = Symbol(newparentname, '_', u₁)
         push!(q.args, Expr(:(=), newpname, Symbol(parentname, '_', u₁)))
         reduce_expr!(q, newparentname, newparentop, u₁, -1, true, false)
         push!(
           q.args,
-          Expr(:(=), Symbol(newparentname, '_', 1), Symbol(newparentname, "##onevec##")),
+          Expr(
+            :(=),
+            Symbol(newparentname, '_', 1),
+            Symbol(newparentname, "##onevec##")
+          )
         )
       end
     end
@@ -544,14 +606,15 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas
     # instrfid = findfirst(isequal(instr.instr), (:vfmadd, :vfnmadd, :vfmsub, :vfnmsub))
     instrfid = findfirst(
       Base.Fix2(===, instr.instr),
-      (:vfmadd_fast, :vfnmadd_fast, :vfmsub_fast, :vfnmsub_fast),
+      (:vfmadd_fast, :vfnmadd_fast, :vfmsub_fast, :vfnmsub_fast)
     )
     # instrfid = findfirst(isequal(instr.instr), (:vfnmadd_fast, :vfmsub_fast, :vfnmsub_fast))
     # want to instcombine when parent load's deps are superset
     # also make sure opp is unrolled
     if !(instrfid === nothing) && (opunrolled && u₁ > 1) && sub_fmas(ls, op, ua)
       specific_fmas =
-        Base.libllvm_version >= v"11.0.0" ? (:vfmadd, :vfnmadd, :vfmsub, :vfnmsub) :
+        Base.libllvm_version >= v"11.0.0" ?
+        (:vfmadd, :vfnmadd, :vfmsub, :vfnmsub) :
         (:vfmadd231, :vfnmadd231, :vfmsub231, :vfnmsub231)
       # specific_fmas = Base.libllvm_version >= v"11.0.0" ? (:vfnmadd, :vfmsub, :vfnmsub) : (:vfnmadd231, :vfmsub231, :vfnmsub231)
       # specific_fmas = (:vfmadd231, :vfnmadd231, :vfmsub231, :vfnmsub231)
@@ -597,7 +660,8 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas
     if isreduct #(isanouterreduction(ls, op))
       # isouterreduct = true
       isouterreduct = isanouterreduction(ls, op)
-      u₁reduct = isouterreduct ? getu₁full(ls, u₁) : getu₁forreduct(ls, op, u₁)
+      u₁reduct =
+        isouterreduct ? getu₁full(ls, u₁) : getu₁forreduct(ls, op, u₁)
       dopartialmap = u₁reduct ≠ u₁
       Symbol(mvar, '_', u₁reduct)
     else
@@ -618,7 +682,6 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas
       if ((isvectorized(opp) && !isvectorized(op))) ||
          (parents_u₁syms[n] != u₁unrolledsym) ||
          (parents_u₂syms[n] != u₂unrolledsym)
-
         selfopname, uₚ = parent_op_name!(
           q,
           ls,
@@ -632,7 +695,7 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas
           u₂max,
           u₂unrolledsym,
           op,
-          tiledouterreduction,
+          tiledouterreduction
         )
         push!(instrcall.args, selfopname)
       else
@@ -665,7 +728,7 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas
         u₂max,
         u₂unrolledsym,
         op,
-        0,
+        0
       )
       parent = reduce_parent!(q, ls, op, opp, parent)
       if (selfdep == 0) && search_tree(parents(opp), name(op))
@@ -678,7 +741,8 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas
       end
     end
   end
-  selfdepreduce = ifelse(((!u₁unrolledsym) & isu₁unrolled(op)) & (u₁ > 1), selfdep, 0)
+  selfdepreduce =
+    ifelse(((!u₁unrolledsym) & isu₁unrolled(op)) & (u₁ > 1), selfdep, 0)
   if maskreduct
     ifelsefunc = if us.u₁ == 1
       :ifelse # don't need to be fancy
@@ -705,8 +769,8 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas
           Expr(
             :(=),
             varsym,
-            Expr(:call, lv(ifelsefunc), MASKSYMBOL, instrcall, selfopname),
-          ),
+            Expr(:call, lv(ifelsefunc), MASKSYMBOL, instrcall, selfopname)
+          )
         )
       elseif ((u₁ ≡ 1) | (selfdepreduce ≡ 0))
         # if the current unroll is 1, no need to accumulate. Same if there is no selfdepreduce, but there has to be if we're here?
@@ -723,16 +787,20 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas
               staticexpr(u₁),
               staticexpr(selfdepreduce),
               instrcall,
-              selfopname,
-            ),
-          ),
+              selfopname
+            )
+          )
         )
       else
         make_partial_map!(instrcall, selfopname, u₁, selfdepreduce)
         # partialmap accumulates
         push!(
           q.args,
-          Expr(:(=), varsym, Expr(:call, lv(:ifelse), MASKSYMBOL, instrcall, selfopname)),
+          Expr(
+            :(=),
+            varsym,
+            Expr(:call, lv(:ifelse), MASKSYMBOL, instrcall, selfopname)
+          )
         )
       end
       return
@@ -761,7 +829,11 @@ function lower_compute!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs, mas
   elseif identifier(op) ∉ ls.outer_reductions && should_broadcast_op(op)
     push!(
       q.args,
-      Expr(:(=), varsym, Expr(:call, lv(:vbroadcast), VECTORWIDTHSYMBOL, instrcall)),
+      Expr(
+        :(=),
+        varsym,
+        Expr(:call, lv(:vbroadcast), VECTORWIDTHSYMBOL, instrcall)
+      )
     )
   else
     push!(q.args, Expr(:(=), varsym, instrcall))
diff --git a/src/codegen/lower_constant.jl b/src/codegen/lower_constant.jl
index add374880..0429b75dd 100644
--- a/src/codegen/lower_constant.jl
+++ b/src/codegen/lower_constant.jl
@@ -6,8 +6,9 @@ function should_broadcast_op(op::Operation)
   true
 end
 
-
-@inline sizeequivalentfloat(::Type{T}) where {T<:Union{Float16,Float32,Float64}} = T
+@inline sizeequivalentfloat(
+  ::Type{T}
+) where {T<:Union{Float16,Float32,Float64}} = T
 @inline sizeequivalentfloat(::Type{T}) where {T<:Union{Int8,UInt8}} = Float32
 @inline sizeequivalentfloat(::Type{T}) where {T<:Union{Int16,UInt16}} = Float16
 @inline sizeequivalentfloat(::Type{T}) where {T<:Union{Int32,UInt32}} = Float32
@@ -19,8 +20,9 @@ end
 if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686)
   @inline widest_supported_integer(::True) = Int64
   @inline widest_supported_integer(::False) = Int32
-  @inline sizeequivalentint(::Type{Float64}) =
-    widest_supported_integer(VectorizationBase.has_feature(Val(:x86_64_avx512dq)))
+  @inline sizeequivalentint(::Type{Float64}) = widest_supported_integer(
+    VectorizationBase.has_feature(Val(:x86_64_avx512dq))
+  )
 else
   @inline sizeequivalentint(::Type{Float64}) = Int
 end
@@ -35,14 +37,14 @@ function typeof_sym(ls::LoopSet, op::Operation, zerotyp::NumberType)
     newtypeT = gensym(:IntType)
     pushpreamble!(
       ls,
-      Expr(:(=), newtypeT, Expr(:call, lv(:sizeequivalentint), ELTYPESYMBOL)),
+      Expr(:(=), newtypeT, Expr(:call, lv(:sizeequivalentint), ELTYPESYMBOL))
     )
     newtypeT
   elseif zerotyp == HardFloat
     newtypeT = gensym(:FloatType)
     pushpreamble!(
       ls,
-      Expr(:(=), newtypeT, Expr(:call, lv(:sizeequivalentfloat), ELTYPESYMBOL)),
+      Expr(:(=), newtypeT, Expr(:call, lv(:sizeequivalentfloat), ELTYPESYMBOL))
     )
     newtypeT
   else
@@ -55,7 +57,7 @@ function lower_zero!(
   op::Operation,
   ls::LoopSet,
   ua::UnrollArgs,
-  zerotyp::NumberType = zerotype(ls, op),
+  zerotyp::NumberType = zerotype(ls, op)
 )
   @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, vloop, u₂max, suffix = ua
   mvar, opu₁, opu₂ =
@@ -79,10 +81,16 @@ function lower_zero!(
         staticexpr(u₁),
         VECTORWIDTHSYMBOL,
         typeT,
-        staticexpr(reg_size(ls)),
+        staticexpr(reg_size(ls))
       )
     else
-      call = Expr(:call, lv(:_vzero), VECTORWIDTHSYMBOL, typeT, staticexpr(reg_size(ls)))
+      call = Expr(
+        :call,
+        lv(:_vzero),
+        VECTORWIDTHSYMBOL,
+        typeT,
+        staticexpr(reg_size(ls))
+      )
     end
   else
     call = Expr(:call, :zero, typeT)
@@ -96,7 +104,10 @@ function lower_zero!(
   end
   if (suffix == -1) && opu₂
     for u ∈ 0:u₂max-1
-      push!(q.args, Expr(:(=), Symbol(mvar, u, "__", Core.ifelse(opu₁, u₁, 1)), call))
+      push!(
+        q.args,
+        Expr(:(=), Symbol(mvar, u, "__", Core.ifelse(opu₁, u₁, 1)), call)
+      )
     end
   else
     mvar = Symbol(mvar, '_', Core.ifelse(opu₁, u₁, 1))
@@ -118,8 +129,11 @@ function getparentsreductzero(ls::LoopSet, op::Operation)::Float64
   end
   throw("Reduct zero not found for operation $(name(op)).")
 end
-vecbasefunc(f) =
-  Expr(:(.), Expr(:(.), :LoopVectorization, QuoteNode(:VectorizationBase)), QuoteNode(f))
+vecbasefunc(f) = Expr(
+  :(.),
+  Expr(:(.), :LoopVectorization, QuoteNode(:VectorizationBase)),
+  QuoteNode(f)
+)
 function lower_constant!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs)
   @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, u₂max, suffix = ua
   mvar, opu₁, opu₂ =
@@ -141,14 +155,19 @@ function lower_constant!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs)
           :call,
           vecbasefunc(:addscalar),
           Expr(:call, lv(:vzero), VECTORWIDTHSYMBOL, ELTYPESYMBOL),
-          constsym,
+          constsym
         )
       elseif instrclass == MULTIPLICATIVE_IN_REDUCTIONS
         Expr(
           :call,
           vecbasefunc(:mulscalar),
-          Expr(:call, lv(:vbroadcast), VECTORWIDTHSYMBOL, Expr(:call, :one, ELTYPESYMBOL)),
-          constsym,
+          Expr(
+            :call,
+            lv(:vbroadcast),
+            VECTORWIDTHSYMBOL,
+            Expr(:call, :one, ELTYPESYMBOL)
+          ),
+          constsym
         )
       elseif instrclass == MAX
         Expr(
@@ -158,9 +177,9 @@ function lower_constant!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs)
             :call,
             lv(:vbroadcast),
             VECTORWIDTHSYMBOL,
-            Expr(:call, :typemin, ELTYPESYMBOL),
+            Expr(:call, :typemin, ELTYPESYMBOL)
           ),
-          constsym,
+          constsym
         )
       elseif instrclass == MIN
         Expr(
@@ -170,13 +189,13 @@ function lower_constant!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs)
             :call,
             lv(:vbroadcast),
             VECTORWIDTHSYMBOL,
-            Expr(:call, :typemax, ELTYPESYMBOL),
+            Expr(:call, :typemax, ELTYPESYMBOL)
           ),
-          constsym,
+          constsym
         )
       else
         throw(
-          "Reductions of type $(reduction_zero(instrclass)) not yet supported; please file an issue as a reminder to take care of this.",
+          "Reductions of type $(reduction_zero(instrclass)) not yet supported; please file an issue as a reminder to take care of this."
         )
       end
     else
@@ -219,7 +238,8 @@ function lower_constant!(q::Expr, op::Operation, ls::LoopSet, ua::UnrollArgs)
 end
 
 isconstantop(op::Operation) =
-  (instruction(op) == LOOPCONSTANT) || (isconstant(op) && length(loopdependencies(op)) == 0)
+  (instruction(op) == LOOPCONSTANT) ||
+  (isconstant(op) && length(loopdependencies(op)) == 0)
 function isinitializedconst(op::Operation)
   if isconstant(op)
     return true
@@ -281,7 +301,11 @@ function lower_licm_constants!(ls::LoopSet)
     end
   end
   for (id, floatval) ∈ ls.preamble_symfloat
-    setop!(ls, ops[id], Expr(:call, lv(:sizeequivalentfloat), ELTYPESYMBOL, floatval))
+    setop!(
+      ls,
+      ops[id],
+      Expr(:call, lv(:sizeequivalentfloat), ELTYPESYMBOL, floatval)
+    )
   end
   for (id, typ) ∈ ls.preamble_zeros
     instruction(ops[id]) === LOOPCONSTANT || continue
diff --git a/src/codegen/lower_load.jl b/src/codegen/lower_load.jl
index 7795db04f..d9af6a55c 100644
--- a/src/codegen/lower_load.jl
+++ b/src/codegen/lower_load.jl
@@ -35,7 +35,13 @@ function prefetchisagoodidea(ls::LoopSet, op::Operation, td::UnrollArgs)
           for opp ∈ operations(ls)
             if iscompute(opp) &&
                (innermostloopsym ∈ loopdependencies(opp)) &&
-               load_constrained(opp, u₁loopsym, u₂loopsym, innermostloopsym, true)
+               load_constrained(
+                 opp,
+                 u₁loopsym,
+                 u₂loopsym,
+                 innermostloopsym,
+                 true
+               )
               return 0
             end
           end
@@ -51,7 +57,7 @@ function add_prefetches!(
   ls::LoopSet,
   op::Operation,
   td::UnrollArgs,
-  prefetchind::Int,
+  prefetchind::Int
 )
   # TODO: maybe prefetch for non-x86_64?
   ((Sys.ARCH === :x86_64) || (Sys.ARCH === :i686)) || return nothing
@@ -62,7 +68,8 @@ function add_prefetches!(
   innermostloopsym = first(names(ls))
   us = ls.unrollspecification
   prefetch_distance =
-    u₁loopsym === innermostloopsym ? us.u₁ : (u₂loopsym === innermostloopsym ? us.u₂ : 1)
+    u₁loopsym === innermostloopsym ? us.u₁ :
+    (u₂loopsym === innermostloopsym ? us.u₂ : 1)
   # prefetch_distance = u₁loopsym === innermostloopsym ? u₁ : ( u₂loopsym === innermostloopsym ? u₂max : 1 )
   prefetch_multiplier = 5
   prefetch_distance *= prefetch_multiplier
@@ -81,7 +88,7 @@ function add_prefetches!(
     false,
     0,
     ls,
-    false,
+    false
   )
   offsets[prefetchind] = inner_offset
   ptr = vptr(op)
@@ -89,14 +96,19 @@ function add_prefetches!(
   if !isknown(prefetchloop_step)
     for i ∈ eachindex(gespinds.args)
       if i == prefetchind
-        gespinds.args[i] =
-          mulexpr(getsym(prefetchloop_step), (gespinds.args[i])::Union{Symbol,Expr})
+        gespinds.args[i] = mulexpr(
+          getsym(prefetchloop_step),
+          (gespinds.args[i])::Union{Symbol,Expr}
+        )
       end
       # gespinds.args[i] = Expr(:call, lv(:data), gespinds.args[i])
     end
   end
   ip = GlobalRef(VectorizationBase, :increment_ptr)
-  push!(q.args, Expr(:(=), gptr, Expr(:call, ip, ptr, vptr_offset(ptr), gespinds)))
+  push!(
+    q.args,
+    Expr(:(=), gptr, Expr(:call, ip, ptr, vptr_offset(ptr), gespinds))
+  )
   inds = Expr(:tuple)
   indices = getindicesonly(op)
 
@@ -126,7 +138,10 @@ function add_prefetches!(
     else
       inds.args[i] = staticexpr(u)
     end
-    push!(q.args, Expr(:call, prefetch0, Expr(:call, ip, ptr, gptr, copy(inds))))
+    push!(
+      q.args,
+      Expr(:call, prefetch0, Expr(:call, ip, ptr, gptr, copy(inds)))
+    )
   end
   nothing
 end
@@ -137,8 +152,8 @@ function pushbroadcast!(q::Expr, mvar::Symbol)
     Expr(
       :(=),
       broadcastedname(mvar),
-      Expr(:call, lv(:vbroadcast), VECTORWIDTHSYMBOL, mvar),
-    ),
+      Expr(:call, lv(:vbroadcast), VECTORWIDTHSYMBOL, mvar)
+    )
   )
 end
 
@@ -168,7 +183,7 @@ function lower_load_no_optranslation!(
   op::Operation,
   td::UnrollArgs,
   mask::Bool,
-  inds_calc_by_ptr_offset::Vector{Bool},
+  inds_calc_by_ptr_offset::Vector{Bool}
 )
   @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, suffix = td
   # @assert isvectorized(op)
@@ -187,9 +202,11 @@ function lower_load_no_optranslation!(
     t = Expr(:tuple)
     sptrsym = sptr!(q, op)
     for u ∈ 1:u₁
-      inds = mem_offset_u(op, td, inds_calc_by_ptr_offset, true, u - 1, ls, false)
+      inds =
+        mem_offset_u(op, td, inds_calc_by_ptr_offset, true, u - 1, ls, false)
       loadexpr = Expr(:call, lv(:_vload), sptrsym, inds)
-      domask = mask && (isvectorized(op) & ((u == u₁) | (vloopsym !== u₁loopsym)))
+      domask =
+        mask && (isvectorized(op) & ((u == u₁) | (vloopsym !== u₁loopsym)))
       add_memory_mask!(loadexpr, op, td, domask, ls, u)
       push!(loadexpr.args, falseexpr, rs)
       push!(t.args, loadexpr)
@@ -230,7 +247,7 @@ function lower_load_for_optranslation!(
   ls::LoopSet,
   td::UnrollArgs,
   mask::Bool,
-  translationind::Int,
+  translationind::Int
 )
   @unpack u₁loop, u₂loop, vloop, u₁, u₂max, suffix = td
   # @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, u₂max, suffix = td
@@ -245,7 +262,14 @@ function lower_load_for_optranslation!(
   step₂ = gethint(step(u₂loop))
   # abs of steps are equal
   equal_steps = (step₁ == step₂) ⊻ (posindicator ≠ 0x03)
-  _td = UnrollArgs(u₁loop, u₂loop, vloop, u₁, u₂max, Core.ifelse(equal_steps, 0, u₂max - 1))
+  _td = UnrollArgs(
+    u₁loop,
+    u₂loop,
+    vloop,
+    u₁,
+    u₂max,
+    Core.ifelse(equal_steps, 0, u₂max - 1)
+  )
   gespinds = mem_offset(op, _td, inds_by_ptroff, false, ls, false)
   ptr = vptr(op)
   gptr = Symbol(ptr, "##GESPED##")
@@ -254,13 +278,16 @@ function lower_load_for_optranslation!(
       gespinds.args[i] = Expr(
         :call,
         lv(Core.ifelse(equal_steps, :firstunroll, :lastunroll)),
-        gespinds.args[i],
+        gespinds.args[i]
       )
     end
   end
   ip = GlobalRef(VectorizationBase, :increment_ptr)
   vpo = vptr_offset(gptr)
-  push!(q.args, Expr(:(=), vpo, Expr(:call, ip, ptr, vptr_offset(ptr), gespinds)))
+  push!(
+    q.args,
+    Expr(:(=), vpo, Expr(:call, ip, ptr, vptr_offset(ptr), gespinds))
+  )
   push!(q.args, Expr(:(=), gptr, ptr))#Expr(:call, GlobalRef(VectorizationBase, :reconstruct_ptr),
   fill!(inds_by_ptroff, true)
   @unpack ref, loopedindex = mref
@@ -268,9 +295,11 @@ function lower_load_for_optranslation!(
   # old_translation_index = indices[translationind]
   # indices[translationind] = u₁loop.itersymbol
   # getindicesonly returns a view of `getindices`
-  dummyref = ArrayReference(ref.array, indices, zero(getoffsets(ref)), getstrides(ref))
+  dummyref =
+    ArrayReference(ref.array, indices, zero(getoffsets(ref)), getstrides(ref))
   # loopedindex[translationind] = true
-  dummymref = ArrayReferenceMeta(dummyref, fill!(similar(loopedindex), true), gptr)
+  dummymref =
+    ArrayReferenceMeta(dummyref, fill!(similar(loopedindex), true), gptr)
   indonly = getindicesonly(dummyref)
   for i ∈ eachindex(indonly)
     if i == translationind
@@ -299,7 +328,10 @@ function lower_load_for_optranslation!(
   push!(q.args, :($variable_name_data = getfield($variable_name_u, 1)))
   if shouldbroadcast
     broadcasted_data = broadcastedname(variable_name_data)
-    push!(q.args, :($broadcasted_data = getfield($(broadcastedname(variable_name_u)), 1)))
+    push!(
+      q.args,
+      :($broadcasted_data = getfield($(broadcastedname(variable_name_u)), 1))
+    )
   end
   gf = GlobalRef(Core, :getfield)
   for u₂ ∈ 0:u₂max-1
@@ -323,7 +355,11 @@ function lower_load_for_optranslation!(
     if shouldbroadcast
       push!(
         q.args,
-        Expr(:(=), broadcastedname(variable_name_u₂), Expr(:call, lv(:VecUnroll), tb)),
+        Expr(
+          :(=),
+          broadcastedname(variable_name_u₂),
+          Expr(:call, lv(:VecUnroll), tb)
+        )
       )
     end
   end
@@ -332,11 +368,18 @@ end
 
 # TODO: this code should be rewritten to be more "orthogonal", so that we're just combining separate pieces.
 # Using sentinel values (eg, T = -1 for non tiling) in part to avoid recompilation.
-function lower_load!(q::Expr, op::Operation, ls::LoopSet, td::UnrollArgs, mask::Bool)
+function lower_load!(
+  q::Expr,
+  op::Operation,
+  ls::LoopSet,
+  td::UnrollArgs,
+  mask::Bool
+)
   @unpack u₁, u₂max, u₁loopsym, u₂loopsym, vloopsym, suffix = td
   if (suffix != -1) && ls.loadelimination
     if (u₁ > 1) & (u₂max > 1)
-      istr, ispl = isoptranslation(ls, op, UnrollSymbols(u₁loopsym, u₂loopsym, vloopsym))
+      istr, ispl =
+        isoptranslation(ls, op, UnrollSymbols(u₁loopsym, u₂loopsym, vloopsym))
       if istr ≠ 0x00
         return lower_load_for_optranslation!(q, op, ispl, ls, td, mask, istr)
       end
@@ -347,7 +390,10 @@ function lower_load!(q::Expr, op::Operation, ls::LoopSet, td::UnrollArgs, mask::
         varnew = variable_name(op, suffix)
         varold = variable_name(operations(ls)[id], suffix + mno)
         u = isu₁unrolled(op) ? u₁ : 1
-        push!(q.args, Expr(:(=), Symbol(varnew, '_', u), Symbol(varold, '_', u)))
+        push!(
+          q.args,
+          Expr(:(=), Symbol(varnew, '_', u), Symbol(varold, '_', u))
+        )
         return
       end
     end
@@ -360,13 +406,24 @@ function _lower_load!(
   op::Operation,
   td::UnrollArgs,
   mask::Bool,
-  inds_calc_by_ptr_offset::Vector{Bool} = indices_calculated_by_pointer_offsets(ls, op.ref),
+  inds_calc_by_ptr_offset::Vector{Bool} = indices_calculated_by_pointer_offsets(
+    ls,
+    op.ref
+  )
 )
   if rejectinterleave(op)
-    return lower_load_no_optranslation!(q, ls, op, td, mask, inds_calc_by_ptr_offset)
+    return lower_load_no_optranslation!(
+      q,
+      ls,
+      op,
+      td,
+      mask,
+      inds_calc_by_ptr_offset
+    )
   else
     omop = offsetloadcollection(ls)
-    @unpack opids, opidcollectionmap, batchedcollections, batchedcollectionmap = omop
+    @unpack opids, opidcollectionmap, batchedcollections, batchedcollectionmap =
+      omop
     batchid, opind = batchedcollectionmap[identifier(op)]
     for (bid, oid) ∈ batchedcollectionmap # this relies on `for op ∈ ops` in codegen/operation_evaluation_order.jl
       if bid == batchid
@@ -381,7 +438,7 @@ function _lower_load!(
             idsformap,
             td,
             mask,
-            inds_calc_by_ptr_offset,
+            inds_calc_by_ptr_offset
           )
         end
         return nothing
@@ -403,7 +460,12 @@ function rejectcurly(ls::LoopSet, op::Operation, td::UnrollArgs)
   @unpack u₁loopsym, vloopsym = td
   rejectcurly(ls, op, u₁loopsym, vloopsym)
 end
-function rejectcurly(ls::LoopSet, op::Operation, u₁loopsym::Symbol, vloopsym::Symbol)
+function rejectcurly(
+  ls::LoopSet,
+  op::Operation,
+  u₁loopsym::Symbol,
+  vloopsym::Symbol
+)
   indices = getindicesonly(op)
   li = op.ref.loopedindex
   AV = AU = false
@@ -427,7 +489,8 @@ function rejectcurly(ls::LoopSet, op::Operation, u₁loopsym::Symbol, vloopsym::
       end
       if instruction(opp).instr === :(+) || instruction(opp).instr === :add_fast
         isadd = true
-      elseif instruction(opp).instr === :(-) || instruction(opp).instr === :sub_fast
+      elseif instruction(opp).instr === :(-) ||
+             instruction(opp).instr === :sub_fast
         isadd = false
       else
         return true
@@ -445,8 +508,8 @@ function rejectcurly(ls::LoopSet, op::Operation, u₁loopsym::Symbol, vloopsym::
           isloopvalue(opp2) || return true
         end
       end
-      if (u₁loopsym === CONSTANTZEROINDEX) ? (CONSTANTZEROINDEX ∈ loopdependencies(opp)) :
-         (isu₁unrolled(opp))
+      if (u₁loopsym === CONSTANTZEROINDEX) ?
+         (CONSTANTZEROINDEX ∈ loopdependencies(opp)) : (isu₁unrolled(opp))
         AU && return true
         AU = true
       end
@@ -458,7 +521,13 @@ function rejectinterleave(
   ls::LoopSet,
   op::Operation,
   vloop::Loop,
-  idsformap::SubArray{Tuple{Int,Int},1,Vector{Tuple{Int,Int}},Tuple{UnitRange{Int}},true},
+  idsformap::SubArray{
+    Tuple{Int,Int},
+    1,
+    Vector{Tuple{Int,Int}},
+    Tuple{UnitRange{Int}},
+    true
+  }
 )
   strd = step(vloop)
   isknown(strd) || return true
@@ -497,10 +566,16 @@ function lower_load_collection!(
   q::Expr,
   ls::LoopSet,
   opidmap::Vector{Int},
-  idsformap::SubArray{Tuple{Int,Int},1,Vector{Tuple{Int,Int}},Tuple{UnitRange{Int}},true},
+  idsformap::SubArray{
+    Tuple{Int,Int},
+    1,
+    Vector{Tuple{Int,Int}},
+    Tuple{UnitRange{Int}},
+    true
+  },
   ua::UnrollArgs,
   mask::Bool,
-  inds_calc_by_ptr_offset::Vector{Bool},
+  inds_calc_by_ptr_offset::Vector{Bool}
 )
   @unpack u₁, u₁loop, u₁loopsym, u₂loopsym, vloopsym, vloop, suffix = ua
 
@@ -523,7 +598,7 @@ function lower_load_collection!(
     MaybeKnown(1024),
     MaybeKnown(1),
     Symbol(""),
-    Symbol(""),
+    Symbol("")
   )
   unrollcurl₂ = unrolled_curly(op, nouter, offset_dummy_loop, vloop, mask, 1) # interleave always 1 here
   inds = mem_offset_u(op, ua, inds_calc_by_ptr_offset, false, 0, ls, false)
@@ -532,13 +607,15 @@ function lower_load_collection!(
   rs = staticexpr(reg_size(ls))
   opu₁, opu₂ = isunrolled_sym(op, u₁loopsym, u₂loopsym, vloopsym, ls)
   manualunrollu₁ = if opu₁ && u₁ > 1 # both unrolled
-    if isknown(step(u₁loop)) && sum(Base.Fix2(===, u₁loopsym), getindicesonly(op)) == 1
+    if isknown(step(u₁loop)) &&
+       sum(Base.Fix2(===, u₁loopsym), getindicesonly(op)) == 1
       # if first(opindices) === u₁loopsym#vloopsym
       #   interleaveval = -nouter
       # else
       interleaveval = 0
       # end
-      unrollcurl₁ = unrolled_curly(op, u₁, ua.u₁loop, vloop, mask, interleaveval)
+      unrollcurl₁ =
+        unrolled_curly(op, u₁, ua.u₁loop, vloop, mask, interleaveval)
       inds = Expr(:call, unrollcurl₁, inds)
       false
     else
@@ -567,7 +644,7 @@ function lower_load_collection!(
     "##size##",
     nouter,
     "##u₁##",
-    u₁,
+    u₁
   )
   gf = GlobalRef(Core, :getfield)
   if manualunrollu₁
@@ -579,7 +656,8 @@ function lower_load_collection!(
     for u ∈ 0:u₁-1
       collectionname_u = Symbol(collectionname, :_, u)
       if u ≠ 0
-        inds = mem_offset_u(op, ua, inds_calc_by_ptr_offset, false, u, ls, false)
+        inds =
+          mem_offset_u(op, ua, inds_calc_by_ptr_offset, false, u, ls, false)
         uinds = Expr(:call, unrollcurl₂, inds)
         loadexpr = copy(loadexpr)
         loadexpr.args[3] = Expr(:call, unrollcurl₂, inds)
@@ -593,7 +671,8 @@ function lower_load_collection!(
         ext = extractedvs[i]
         if (u + 1) == u₁
           _op = ops[opidmap[opid]]
-          mvar = Symbol(variable_name(_op, Core.ifelse(opu₂, suffix, -1)), '_', u₁)
+          mvar =
+            Symbol(variable_name(_op, Core.ifelse(opu₂, suffix, -1)), '_', u₁)
           push!(q.args, Expr(:(=), mvar, Expr(:call, lv(:VecUnroll), ext)))
         end
         push!(ext.args, Expr(:call, gf, collectionname_u, i, false))
diff --git a/src/codegen/lower_memory_common.jl b/src/codegen/lower_memory_common.jl
index e41084f18..4741ba348 100644
--- a/src/codegen/lower_memory_common.jl
+++ b/src/codegen/lower_memory_common.jl
@@ -9,8 +9,14 @@ function symbolind(ind::Symbol, op::Operation, td::UnrollArgs, ls::LoopSet)
   id == -1 && return ind, op
   @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, u₂max, suffix = td
   parent = parents(op)[id]
-  pvar, u₁op, u₂op =
-    variable_name_and_unrolled(parent, u₁loopsym, u₂loopsym, vloopsym, suffix, ls)
+  pvar, u₁op, u₂op = variable_name_and_unrolled(
+    parent,
+    u₁loopsym,
+    u₂loopsym,
+    vloopsym,
+    suffix,
+    ls
+  )
   Symbol(pvar, '_', Core.ifelse(u₁op, u₁, 1)), parent
 end
 
@@ -60,12 +66,19 @@ function _addoffset!(
   indexstride::Union{Int,MaybeKnown},
   index,
   offset,
-  calcbypointeroffset::Bool,
+  calcbypointeroffset::Bool
 ) # 6 -> 5 args
   if _isone(indexstride)
     addoffset!(ret, vloopstride, index, offset, calcbypointeroffset)
   else
-    __addoffset!(ret, vloopstride, indexstride, index, offset, calcbypointeroffset)
+    __addoffset!(
+      ret,
+      vloopstride,
+      indexstride,
+      index,
+      offset,
+      calcbypointeroffset
+    )
   end
 end
 function _addoffset!(
@@ -74,9 +87,16 @@ function _addoffset!(
   indexstride,
   index,
   offset,
-  calcbypointeroffset::Bool,
+  calcbypointeroffset::Bool
 ) # 6 -> 5 args
-  ___addoffset!(ret, vloopstride, indexstride, index, offset, calcbypointeroffset)
+  ___addoffset!(
+    ret,
+    vloopstride,
+    indexstride,
+    index,
+    offset,
+    calcbypointeroffset
+  )
 end
 function __addoffset!(
   ret::Expr,
@@ -84,9 +104,16 @@ function __addoffset!(
   indexstride,
   index,
   offset,
-  calcbypointeroffset::Bool,
+  calcbypointeroffset::Bool
 ) # 6 -> 5 args
-  ___addoffset!(ret, vloopstride, indexstride, index, offset, calcbypointeroffset)
+  ___addoffset!(
+    ret,
+    vloopstride,
+    indexstride,
+    index,
+    offset,
+    calcbypointeroffset
+  )
 end
 function __addoffset!(
   ret::Expr,
@@ -94,7 +121,7 @@ function __addoffset!(
   indexstride::Union{Int,MaybeKnown},
   index,
   offset,
-  calcbypointeroffset::Bool,
+  calcbypointeroffset::Bool
 ) # 6 -> 5 args
   if isknown(vloopstride) & isknown(indexstride)
     addoffset!(
@@ -102,10 +129,17 @@ function __addoffset!(
       gethint(vloopstride) * gethint(indexstride),
       index,
       offset,
-      calcbypointeroffset,
+      calcbypointeroffset
     )
   else
-    ___addoffset!(ret, vloopstride, indexstride, index, offset, calcbypointeroffset)
+    ___addoffset!(
+      ret,
+      vloopstride,
+      indexstride,
+      index,
+      offset,
+      calcbypointeroffset
+    )
   end
 end
 function ___addoffset!(
@@ -114,9 +148,15 @@ function ___addoffset!(
   indexstride,
   index,
   offset,
-  calcbypointeroffset::Bool,
+  calcbypointeroffset::Bool
 ) # 6 -> 5 args
-  addoffset!(ret, mulexpr(vloopstride, indexstride), index, offset, calcbypointeroffset)
+  addoffset!(
+    ret,
+    mulexpr(vloopstride, indexstride),
+    index,
+    offset,
+    calcbypointeroffset
+  )
 end
 # multiply `index` by `indexstride`
 function addoffset!(
@@ -125,12 +165,19 @@ function addoffset!(
   indexstride,
   index,
   offset,
-  calcbypointeroffset::Bool,
+  calcbypointeroffset::Bool
 ) # 6 -> (5 or 6) args
   if _isone(indexstride)
     addoffset!(ret, vloopstride, index, offset, calcbypointeroffset) # 5
   elseif calcbypointeroffset # `ind` is getting dropped, no need to allocate via `mulexpr`
-    _addoffset!(ret, vloopstride, indexstride, index, offset, calcbypointeroffset) # 6
+    _addoffset!(
+      ret,
+      vloopstride,
+      indexstride,
+      index,
+      offset,
+      calcbypointeroffset
+    ) # 6
   else # multiply index by stride
     _addoffset!(
       ret,
@@ -138,12 +185,11 @@ function addoffset!(
       indexstride,
       mulexpr(index, indexstride),
       offset,
-      calcbypointeroffset,
+      calcbypointeroffset
     ) # 6
   end
 end
 
-
 function addoffset!(
   ret::Expr,
   indvectorized::Bool,
@@ -151,14 +197,27 @@ function addoffset!(
   indexstride,
   index,
   offset,
-  calcbypointeroffset::Bool,
+  calcbypointeroffset::Bool
 ) # 7 -> (5 or 6) args
   if indvectorized
-    addoffset!(ret, vloopstride, indexstride, index, offset, calcbypointeroffset)
+    addoffset!(
+      ret,
+      vloopstride,
+      indexstride,
+      index,
+      offset,
+      calcbypointeroffset
+    )
   elseif _isone(indexstride)
     addoffset!(ret, 0, index, offset, calcbypointeroffset)
   else
-    addoffset!(ret, 0, lazymulexpr(index, indexstride), offset, calcbypointeroffset)
+    addoffset!(
+      ret,
+      0,
+      lazymulexpr(index, indexstride),
+      offset,
+      calcbypointeroffset
+    )
   end
 end
 
@@ -170,7 +229,7 @@ function addvectoroffset!(
   indexstride,
   index,
   offset,
-  calcbypointeroffset::Bool,
+  calcbypointeroffset::Bool
 ) # 8 -> 7 args
   # if _iszero(unrolledsteps) # if no steps, pass through; should be unreachable
   #     addoffset!(ret, indvectorized, vloopstride, indexstride, index, offset, calcbypointeroffset)
@@ -184,7 +243,7 @@ function addvectoroffset!(
         indexstride,
         VECTORWIDTHSYMBOL,
         offset,
-        false,
+        false
       )
     else
       addoffset!(
@@ -194,7 +253,7 @@ function addvectoroffset!(
         indexstride,
         mulexpr(VECTORWIDTHSYMBOL, unrolledsteps),
         offset,
-        false,
+        false
       )
     end
   elseif _isone(unrolledsteps) # add the step to the index
@@ -205,7 +264,7 @@ function addvectoroffset!(
       indexstride,
       addexpr(VECTORWIDTHSYMBOL, index),
       offset,
-      false,
+      false
     )
   else
     addoffset!(
@@ -215,7 +274,7 @@ function addvectoroffset!(
       indexstride,
       addexpr(mulexpr(VECTORWIDTHSYMBOL, unrolledsteps), index),
       offset,
-      false,
+      false
     )
   end
 end
@@ -230,10 +289,18 @@ function addvectoroffset!(
   index,
   offset::Integer,
   calcbypointeroffset::Bool,
-  indvectorized::Bool,
+  indvectorized::Bool
 ) # 10 -> (7 or 8) args
   if unrolledsteps == 0 # neither unrolledloopstride or indexstride can be 0
-    addoffset!(ret, mm, vloopstride, indexstride, index, offset, calcbypointeroffset) # 7 arg
+    addoffset!(
+      ret,
+      mm,
+      vloopstride,
+      indexstride,
+      index,
+      offset,
+      calcbypointeroffset
+    ) # 7 arg
   elseif indvectorized
     unrolledsteps *= indexstride
     if isknown(unrolledloopstride)
@@ -245,7 +312,7 @@ function addvectoroffset!(
         indexstride,
         index,
         offset,
-        calcbypointeroffset,
+        calcbypointeroffset
       ) # 8 arg
     elseif unrolledsteps == 1
       addvectoroffset!(
@@ -256,7 +323,7 @@ function addvectoroffset!(
         indexstride,
         index,
         offset,
-        calcbypointeroffset,
+        calcbypointeroffset
       ) # 8 arg
     else
       addvectoroffset!(
@@ -267,7 +334,7 @@ function addvectoroffset!(
         indexstride,
         index,
         offset,
-        calcbypointeroffset,
+        calcbypointeroffset
       ) # 8 arg
     end
   elseif _isone(unrolledloopstride)
@@ -278,7 +345,7 @@ function addvectoroffset!(
       indexstride,
       index,
       offset + unrolledsteps,
-      calcbypointeroffset,
+      calcbypointeroffset
     ) # 7 arg
   else
     addoffset!(
@@ -288,7 +355,7 @@ function addvectoroffset!(
       mulexpr(unrolledloopstride, indexstride),
       index,
       addexpr(offset, lazymulexpr(unrolledloopstride, unrolledsteps)),
-      calcbypointeroffset,
+      calcbypointeroffset
     ) # 7 arg
   end
 end
@@ -304,7 +371,7 @@ function mem_offset(
   inds_calc_by_ptr_offset::Vector{Bool},
   _mm::Bool,
   ls::LoopSet,
-  preserve_vecunroll::Bool,
+  preserve_vecunroll::Bool
 )
   # @assert accesses_memory(op) "Computing memory offset only makes sense for operations that access memory."
   ret = Expr(:tuple)
@@ -347,7 +414,12 @@ function mem_offset(
 end
 function sptr(op::Operation)
   vp = vptr(op)
-  Expr(:call, GlobalRef(VectorizationBase, :reconstruct_ptr), vp, vptr_offset(vp))
+  Expr(
+    :call,
+    GlobalRef(VectorizationBase, :reconstruct_ptr),
+    vp,
+    vptr_offset(vp)
+  )
 end
 function sptr!(q::Expr, op::Operation)
   vp = vptr(op)
@@ -365,7 +437,7 @@ function unrolled_curly(
   u₁loop::Loop,
   vloop::Loop,
   mask::Bool,
-  interleave::Int = 0,
+  interleave::Int = 0
 )
   u₁loopsym = u₁loop.itersymbol
   vloopsym = vloop.itersymbol
@@ -396,14 +468,16 @@ function unrolled_curly(
       end
       # if (u₁loopsym === CONSTANTZEROINDEX) ? (CONSTANTZEROINDEX ∈ loopdependencies(opp)) : (isu₁unrolled(opp) || (ind === u₁loopsym))
       # can't check isu₁unrolled(opp) because we may be lying.
-      if (u₁loopsym === CONSTANTZEROINDEX) ? (CONSTANTZEROINDEX ∈ loopdependencies(opp)) :
+      if (u₁loopsym === CONSTANTZEROINDEX) ?
+         (CONSTANTZEROINDEX ∈ loopdependencies(opp)) :
          (u₁loopsym ∈ loopdependencies(opp) || (ind === u₁loopsym))
         @assert AU == -1
         AU = n
       end
     end
   end
-  AU == -1 && throw(LoopError("Failed to find $(u₁loopsym) in args of $(repr(op))."))
+  AU == -1 &&
+    throw(LoopError("Failed to find $(u₁loopsym) in args of $(repr(op))."))
   vecnotunrolled = AU != AV
   conditional_memory_op = isconditionalmemop(op)
   if mask || conditional_memory_op
@@ -458,7 +532,7 @@ function unrolled_curly(
       0,
       1,
       M,
-      1,
+      1
     )
   end
 end
@@ -467,10 +541,11 @@ function unrolledindex(
   td::UnrollArgs,
   mask::Bool,
   inds_calc_by_ptr_offset::Vector{Bool},
-  ls::LoopSet,
+  ls::LoopSet
 )
   @unpack u₁, u₁loopsym, u₁loop, vloop = td
-  isone(u₁) && return mem_offset_u(op, td, inds_calc_by_ptr_offset, true, 0, ls, false)
+  isone(u₁) &&
+    return mem_offset_u(op, td, inds_calc_by_ptr_offset, true, 0, ls, false)
   any(==(u₁loopsym), getindicesonly(op)) ||
     return mem_offset_u(op, td, inds_calc_by_ptr_offset, true, 0, ls, true)
 
@@ -486,7 +561,7 @@ function mem_offset_u(
   _mm::Bool,
   incr₁::Int,
   ls::LoopSet,
-  preserve_vecunroll::Bool,
+  preserve_vecunroll::Bool
 )
   @assert accesses_memory(op) "Computing memory offset only makes sense for operations that access memory."
   @unpack u₁loopsym, u₂loopsym, vloopsym, u₁step, u₂step, vstep, suffix = td
@@ -500,7 +575,14 @@ function mem_offset_u(
   # allbasezero = all(inds_calc_by_ptr_offset) && all(iszero, offsets)
   loopedindex = op.ref.loopedindex
   if iszero(incr₁) & iszero(incr₂)
-    return mem_offset(op, td, inds_calc_by_ptr_offset, _mm, ls, preserve_vecunroll)
+    return mem_offset(
+      op,
+      td,
+      inds_calc_by_ptr_offset,
+      _mm,
+      ls,
+      preserve_vecunroll
+    )
     # append_inds!(ret, indices, loopedindex)
   else
     for (n, ind) ∈ enumerate(indices)
@@ -522,7 +604,7 @@ function mem_offset_u(
           ind,
           offset,
           ind_by_offset,
-          indvectorized,
+          indvectorized
         ) # 9 arg
       elseif ind === u₂loopsym
         addvectoroffset!(
@@ -535,10 +617,18 @@ function mem_offset_u(
           ind,
           offset,
           ind_by_offset,
-          indvectorized,
+          indvectorized
         ) # 9 arg
       elseif loopedindex[n]
-        addoffset!(ret, indvectorizedmm, vstep, stride, ind, offset, ind_by_offset) # 7 arg
+        addoffset!(
+          ret,
+          indvectorizedmm,
+          vstep,
+          stride,
+          ind,
+          offset,
+          ind_by_offset
+        ) # 7 arg
       else
         offset += (stride - 1) # 1 -> 0-based indexing
         newname, parent = symbolind(ind, op, td, ls)
@@ -551,7 +641,7 @@ function mem_offset_u(
             newname_unmm = Expr(
               :call,
               lv(:unmm),
-              Expr(:call, gf, Expr(:call, gf, newname, 1), incr₁ + 1, false),
+              Expr(:call, gf, Expr(:call, gf, newname, 1), incr₁ + 1, false)
             )
           else
             newname_unmm = Expr(:call, lv(:unmm), newname)
@@ -562,7 +652,8 @@ function mem_offset_u(
           addoffset!(ret, 0, newname_unmm, offset, false)
         elseif (isu₁unrolled(parent) & (td.u₁ > 1)) && !preserve_vecunroll
           gf = GlobalRef(Core, :getfield)
-          firstnew = Expr(:call, gf, Expr(:call, gf, newname, 1), incr₁ + 1, false)
+          firstnew =
+            Expr(:call, gf, Expr(:call, gf, newname, 1), incr₁ + 1, false)
           if stride ≠ 1
             firstnew = mulexpr(firstnew, stride)
           end
@@ -591,7 +682,6 @@ end
   q
 end
 
-
 isconditionalmemop(op::Operation) =
   (instruction(op).instr === :conditionalload) ||
   (instruction(op).instr === :conditionalstore!)
@@ -601,14 +691,21 @@ function add_memory_mask!(
   td::UnrollArgs,
   mask::Bool,
   ls::LoopSet,
-  u₁ᵢ::Int,
+  u₁ᵢ::Int
 )
   @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, u₂max, suffix = td
   if isconditionalmemop(op)
     condop = last(parents(op))
     opu₂ = (suffix ≠ -1) && isu₂unrolled(op)
-    condvar, condu₁unrolled =
-      condvarname_and_unroll(condop, u₁loopsym, u₂loopsym, vloopsym, suffix, opu₂, ls)
+    condvar, condu₁unrolled = condvarname_and_unroll(
+      condop,
+      u₁loopsym,
+      u₂loopsym,
+      vloopsym,
+      suffix,
+      opu₂,
+      ls
+    )
     # if it isn't unrolled, then `m`
     u = condu₁unrolled ? u₁ : 1
     # u = isu₁unrolled(condop) ? u₁ : 1
@@ -626,7 +723,10 @@ function add_memory_mask!(
       if (u₁ᵢ == 0) | (u == 1)
         push!(memopexpr.args, condvar)
       else
-        push!(memopexpr.args, :($getfield($getfield($condvar, 1), $(u₁ᵢ), false)))
+        push!(
+          memopexpr.args,
+          :($getfield($getfield($condvar, 1), $(u₁ᵢ), false))
+        )
       end
     elseif (u₁loopsym ≢ vloopsym) | (u₁ == 1) # mask all equivalenetly
       push!(memopexpr.args, Expr(:call, lv(:&), condvar, MASKSYMBOL))
@@ -648,12 +748,17 @@ function add_memory_mask!(
     elseif u₁ᵢ == u₁ # mask
       push!(
         memopexpr.args,
-        Expr(:call, lv(:&), :($getfield($getfield(condvar, 1), $u₁ᵢ, false)), MASKSYMBOL),
+        Expr(
+          :call,
+          lv(:&),
+          :($getfield($getfield(condvar, 1), $u₁ᵢ, false)),
+          MASKSYMBOL
+        )
       )
     else
       push!(
         memopexpr.args,
-        Expr(:call, lv(:&), :($getfield($getfield(condvar, 1), $u₁ᵢ, false))),
+        Expr(:call, lv(:&), :($getfield($getfield(condvar, 1), $u₁ᵢ, false)))
       )
     end
   elseif mask && isvectorized(op)
@@ -669,7 +774,7 @@ function condvarname_and_unroll(
   vloop::Symbol,
   suffix::Int,
   opu₂::Bool,
-  ls::LoopSet,
+  ls::LoopSet
 )
   condvar, condu₁, condu₂ = variable_name_and_unrolled(
     cond,
@@ -677,7 +782,7 @@ function condvarname_and_unroll(
     u₂loop,
     vloop,
     Core.ifelse(opu₂, suffix, -1),
-    ls,
+    ls
   )
   condvar, condu₁
 end
diff --git a/src/codegen/lower_store.jl b/src/codegen/lower_store.jl
index fb6862448..58fad9fef 100644
--- a/src/codegen/lower_store.jl
+++ b/src/codegen/lower_store.jl
@@ -34,7 +34,12 @@ function storeinstr_preprend(op::Operation, vloopsym::Symbol)
   # end
 end
 
-function reduce_expr_u₂(toreduct::Symbol, op::Operation, u₂::Int, suffix::Symbol)
+function reduce_expr_u₂(
+  toreduct::Symbol,
+  op::Operation,
+  u₂::Int,
+  suffix::Symbol
+)
   t = Expr(:tuple)
   for u ∈ 0:u₂-1
     push!(t.args, Symbol(toreduct, u, suffix))
@@ -48,11 +53,14 @@ function reduce_expr!(
   u₁::Int,
   u₂::Int,
   isu₁unrolled::Bool,
-  isu₂unrolled::Bool,
+  isu₂unrolled::Bool
 )
   if isu₂unrolled# u₂ != -1
     _toreduct = Symbol(toreduct, 0)
-    push!(q.args, Expr(:(=), _toreduct, reduce_expr_u₂(toreduct, op, u₂, Symbol(""))))
+    push!(
+      q.args,
+      Expr(:(=), _toreduct, reduce_expr_u₂(toreduct, op, u₂, Symbol("")))
+    )
   else#if u₂ == -1
     _toreduct = Symbol(toreduct, '_', u₁)
   end
@@ -64,8 +72,8 @@ function reduce_expr!(
       Expr(
         :(=),
         Symbol(toreduct, "##onevec##"),
-        Expr(:call, reduction_to_single_vector(op), _toreduct),
-      ),
+        Expr(:call, reduction_to_single_vector(op), _toreduct)
+      )
     )
   else
     fifelse = let u₁ = u₁
@@ -78,8 +86,8 @@ function reduce_expr!(
       Expr(
         :(=),
         Symbol(toreduct, "##onevec##"),
-        Expr(:call, fifelse, _toreduct, staticexpr(1)),
-      ),
+        Expr(:call, fifelse, _toreduct, staticexpr(1))
+      )
     )
   end
   nothing
@@ -91,7 +99,7 @@ function lower_store_collection!(
   op::Operation,
   ua::UnrollArgs,
   mask::Bool,
-  inds_calc_by_ptr_offset::Vector{Bool},
+  inds_calc_by_ptr_offset::Vector{Bool}
 )
   omop = offsetloadcollection(ls)
   batchid, _ = omop.batchedcollectionmap[identifier(op)]
@@ -124,7 +132,7 @@ function lower_store_collection!(
     MaybeKnown(1024),
     MaybeKnown(1),
     Symbol(""),
-    Symbol(""),
+    Symbol("")
   )
   unrollcurl₂ = unrolled_curly(op, nouter, offset_dummy_loop, vloop, mask, 1)
   inds = mem_offset_u(op, ua, inds_calc_by_ptr_offset, false, 0, ls, false)
@@ -136,7 +144,10 @@ function lower_store_collection!(
     # unrollcurl₂ is unrolled along `first(getindices(op))` by factor of `nouter`
     # 
     # if isknown(step(u₁loop)) && sum(Base.Fix2(===,u₁loopsym), getindicesonly(op)) == 1
-    if (isknown(step(u₁loop)) && sum(Base.Fix2(===, u₁loopsym), getindicesonly(op)) == 1)# && (isone(step(u₁loop)) | (first(getindices(op)) ≢ u₁loopsym))
+    if (
+      isknown(step(u₁loop)) &&
+      sum(Base.Fix2(===, u₁loopsym), getindicesonly(op)) == 1
+    )# && (isone(step(u₁loop)) | (first(getindices(op)) ≢ u₁loopsym))
       # if first(getindices(op)) === u₁loopsym#vloopsym
       #   interleaveval = -nouter
       # else
@@ -153,7 +164,8 @@ function lower_store_collection!(
   end
   uinds = Expr(:call, unrollcurl₂, inds)
   sptrsym = sptr!(q, op)
-  storeexpr = Expr(:call, lv(:_vstore!), sptrsym, Expr(:call, lv(:VecUnroll), t), uinds)
+  storeexpr =
+    Expr(:call, lv(:_vstore!), sptrsym, Expr(:call, lv(:VecUnroll), t), uinds)
   # not using `add_memory_mask!(storeexpr, op, ua, mask, ls)` because we checked `isconditionalmemop` earlier in `lower_load_collection!`
   u₁vectorized = u₁loopsym === vloopsym
   if mask# && isvectorized(op))
@@ -190,7 +202,7 @@ function lower_store_collection!(
         storeexpr_tmp.args[4] = Expr(
           :call,
           unrollcurl₂,
-          mem_offset_u(op, ua, inds_calc_by_ptr_offset, false, u, ls, false),
+          mem_offset_u(op, ua, inds_calc_by_ptr_offset, false, u, ls, false)
         )
       end
       push!(q.args, storeexpr_tmp)
@@ -208,13 +220,14 @@ function lower_store!(
   ua::UnrollArgs,
   mask::Bool,
   reductfunc::Symbol = storeinstr_preprend(op, ua.vloop.itersymbol),
-  inds_calc_by_ptr_offset = indices_calculated_by_pointer_offsets(ls, op.ref),
+  inds_calc_by_ptr_offset = indices_calculated_by_pointer_offsets(ls, op.ref)
 )
   @unpack u₁, u₁loopsym, u₂loopsym, vloopsym, vloop, u₂max, suffix = ua
   omop = offsetloadcollection(ls)
   batchid, opind = omop.batchedcollectionmap[identifier(op)]
   if ((batchid ≠ 0) && isvectorized(op)) && (!rejectinterleave(op))
-    (opind == 1) && lower_store_collection!(q, ls, op, ua, mask, inds_calc_by_ptr_offset)
+    (opind == 1) &&
+      lower_store_collection!(q, ls, op, ua, mask, inds_calc_by_ptr_offset)
     return
   end
   falseexpr = Expr(:call, lv(:False))
@@ -222,7 +235,10 @@ function lower_store!(
   # trueexpr = Expr(:call, lv(:True));
   rs = staticexpr(reg_size(ls))
   opp = first(parents(op))
-  if ((opp.instruction.instr === reductfunc) || (opp.instruction.instr === :identity))
+  if (
+    (opp.instruction.instr === reductfunc) ||
+    (opp.instruction.instr === :identity)
+  )
     parents_opp = parents(opp)
     opppstate = Base.iterate(parents_opp)
     if opppstate ≢ nothing
@@ -267,19 +283,28 @@ function lower_store!(
       data_u₁ && push!(q.args, Expr(:(=), mvard, Expr(:call, lv(:data), mvar)))
       sptrsym = sptr!(q, op)
       for u ∈ 1:u₁
-        inds = mem_offset_u(op, ua, inds_calc_by_ptr_offset, true, u - 1, ls, false)
+        inds =
+          mem_offset_u(op, ua, inds_calc_by_ptr_offset, true, u - 1, ls, false)
         storeexpr = if data_u₁
           if reductfunc === Symbol("")
             Expr(:call, lv(:_vstore!), sptrsym, gf(mvard, u), inds)
           else
-            Expr(:call, lv(:_vstore!), lv(reductfunc), sptrsym, gf(mvard, u), inds)
+            Expr(
+              :call,
+              lv(:_vstore!),
+              lv(reductfunc),
+              sptrsym,
+              gf(mvard, u),
+              inds
+            )
           end
         elseif reductfunc === Symbol("")
           Expr(:call, lv(:_vstore!), sptrsym, mvar, inds)
         else
           Expr(:call, lv(:_vstore!), lv(reductfunc), sptrsym, mvar, inds)
         end
-        domask = mask && (isvectorized(op) & ((u == u₁) | (vloopsym !== u₁loopsym)))
+        domask =
+          mask && (isvectorized(op) & ((u == u₁) | (vloopsym !== u₁loopsym)))
         add_memory_mask!(storeexpr, op, ua, domask, ls, u)# & ((u == u₁) | isvectorized(op)))
         push!(storeexpr.args, falseexpr, aliasexpr, falseexpr, rs)
         push!(q.args, storeexpr)
@@ -307,7 +332,7 @@ function lower_tiled_store!(
   unrollsyms::UnrollSymbols,
   u₁::Int,
   u₂::Int,
-  mask::Bool,
+  mask::Bool
 )
   ua = UnrollArgs(ls, u₁, unrollsyms, u₂, 0)
   for opsv ∈ (opsv1, opsv2)
@@ -317,7 +342,12 @@ function lower_tiled_store!(
   end
 end
 
-function donot_tile_store(ls::LoopSet, op::Operation, reductfunc::Symbol, u₂::Int)
+function donot_tile_store(
+  ls::LoopSet,
+  op::Operation,
+  reductfunc::Symbol,
+  u₂::Int
+)
   (
     (!((reductfunc === Symbol("")) && all(op.ref.loopedindex))) ||
     (u₂ ≤ 1) ||
@@ -340,7 +370,7 @@ function lower_tiled_store!(
   ua::UnrollArgs,
   u₁::Int,
   u₂::Int,
-  mask::Bool,
+  mask::Bool
 )
   @unpack u₁loopsym, u₂loopsym, vloopsym, u₁loop, u₂loop, vloop = ua
   reductfunc = storeinstr_preprend(op, vloopsym)
@@ -352,7 +382,15 @@ function lower_tiled_store!(
     @unpack u₁, u₂max = ua
     for t ∈ 0:u₂-1
       unrollargs = UnrollArgs(u₁loop, u₂loop, vloop, u₁, u₂max, t)
-      lower_store!(blockq, ls, op, unrollargs, mask, reductfunc, inds_calc_by_ptr_offset)
+      lower_store!(
+        blockq,
+        ls,
+        op,
+        unrollargs,
+        mask,
+        reductfunc,
+        inds_calc_by_ptr_offset
+      )
     end
     return
   end
@@ -360,8 +398,8 @@ function lower_tiled_store!(
   if (opp.instruction.instr === reductfunc) && isone(length(parents(opp)))
     throw(
       LoopError(
-        "Operation $opp's instruction is $reductfunc, shouldn't be able to reach here.",
-      ),
+        "Operation $opp's instruction is $reductfunc, shouldn't be able to reach here."
+      )
     )
     # opp = only(parents(opp))
   end
diff --git a/src/codegen/lower_threads.jl b/src/codegen/lower_threads.jl
index 3f0a86d34..e1c0d770e 100644
--- a/src/codegen/lower_threads.jl
+++ b/src/codegen/lower_threads.jl
@@ -4,7 +4,7 @@ struct TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV} <: Function end
 # hopefully shouldn't add much to compile time.
 
 function (::TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV})(
-  p::Ptr{UInt},
+  p::Ptr{UInt}
 ) where {UNROLL,OPS,ARF,AM,LPSYM,K,LBV,FLBV<:Tuple{Vararg{Any,K}}}
   (_, _vargs) = ThreadingUtilities.load(p, FLBV, 2 * sizeof(UInt))
   # Main.VARGS[Threads.threadid()] = first(_vargs)
@@ -16,14 +16,14 @@ function (::TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV})(
     Val{AM}(),
     Val{LPSYM}(),
     Val{LBV}(),
-    _vargs...,
+    _vargs...
   )
   ThreadingUtilities.store!(p, ret, Int(register_size()))
   ThreadingUtilities._atomic_store!(p, ThreadingUtilities.SPIN)
   nothing
 end
 @generated function Base.pointer(
-  ::TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV},
+  ::TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV}
 ) where {UNROLL,OPS,ARF,AM,LPSYM,K,LBV,FLBV<:Tuple{Vararg{Any,K}}}
   f = TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV}()
   precompile(f, (Ptr{UInt},))
@@ -36,7 +36,7 @@ end
 @inline function setup_turbo_threads!(
   p::Ptr{UInt},
   fptr::Ptr{Cvoid},
-  args::LBV,
+  args::LBV
 ) where {K,LBV<:Tuple{Vararg{Any,K}}}
   offset = ThreadingUtilities.store!(p, fptr, sizeof(UInt))
   offset = ThreadingUtilities.store!(p, args, offset)
@@ -54,13 +54,13 @@ struct StaticType{T} end
   ::Val{LPSYM},
   ::StaticType{LBV},
   fargs::FARGS,
-  tid,
+  tid
 ) where {UNROLL,OPS,ARF,AM,LPSYM,K,LBV<:Tuple{Vararg{Any,K}},FARGS}
   ThreadingUtilities.launch(
     setup_turbo_threads!,
     tid,
     pointer(TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FARGS}()),
-    fargs,
+    fargs
   )
 end
 
@@ -73,7 +73,11 @@ end
   t
 end
 @inline cld_fast(x, y) = Base.udiv_int(vsub_nw(vadd_nw(x, y), one(y)), y)
-@inline function choose_num_blocks(MoW::UInt, ::StaticInt{U}, ::StaticInt{T}) where {U,T}
+@inline function choose_num_blocks(
+  MoW::UInt,
+  ::StaticInt{U},
+  ::StaticInt{T}
+) where {U,T}
   factors = calc_factors(StaticInt{T}())
   for i ∈ 1:length(factors)-1
     # miter decreases in each iteration of factors
@@ -107,7 +111,7 @@ end
   M::UInt,
   ::StaticInt{U},
   nt,
-  ::StaticInt{NTMAX},
+  ::StaticInt{NTMAX}
 ) where {U,NTMAX}
   if NTMAX == 2 # `nt` must be `2`
     return quote
@@ -119,7 +123,7 @@ end
   ifq = Expr(
     :if,
     :(nt == $NTMAX),
-    :(choose_num_blocks(M, StaticInt{$U}(), StaticInt{$NTMAX}())),
+    :(choose_num_blocks(M, StaticInt{$U}(), StaticInt{$NTMAX}()))
   )
   add_bisecting_if_branches!(ifq, 2, NTMAX - 1, U, false)
   push!(q.args, ifq)
@@ -151,24 +155,26 @@ scale_cost(c, looplen) = scale_cost(@fastmath c / looplen)
 @inline function choose_num_threads(
   C::T,
   NT::UInt,
-  x::Base.BitInteger,
+  x::Base.BitInteger
 ) where {T<:Union{Float32,Float64}}
   _choose_num_threads(scale_cost(T(C)), NT, x)
 end
 @inline function _choose_num_threads(
   C::T,
   NT::UInt,
-  x::Base.BitInteger,
+  x::Base.BitInteger
 ) where {T<:Union{Float32,Float64}}
   max(
     min(
       Base.fptoui(
         UInt,
-        Base.ceil_llvm(Base.mul_float_fast(C, Base.sqrt_llvm_fast(Base.uitofp(T, x)))),
+        Base.ceil_llvm(
+          Base.mul_float_fast(C, Base.sqrt_llvm_fast(Base.uitofp(T, x)))
+        )
       ),
-      NT,
+      NT
     ),
-    one(UInt),
+    one(UInt)
   )
 end
 function push_loop_length_expr!(q::Expr, ls::LoopSet)
@@ -213,11 +219,15 @@ function outer_reduct_combine_expressions(ls::LoopSet, retv)
     :block,
     :(
       var"#load#thread#ret#" = $gf(
-        ThreadingUtilities.load(var"#thread#ptr#", typeof($retv), $(reg_size(ls))),
+        ThreadingUtilities.load(
+          var"#thread#ptr#",
+          typeof($retv),
+          $(reg_size(ls))
+        ),
         2,
-        false,
+        false
       )
-    ),
+    )
   )
   # push!(q.args, :(@show var"#load#thread#ret#"))
   for (i, or) ∈ enumerate(ls.outer_reductions)
@@ -235,9 +245,16 @@ function outer_reduct_combine_expressions(ls::LoopSet, retv)
           otherarg = Expr(
             :call,
             lv(:vecmemaybe),
-            Expr(:call, GlobalRef(Core, :getfield), Symbol("#load#thread#ret#"), j, false),
+            Expr(
+              :call,
+              GlobalRef(Core, :getfield),
+              Symbol("#load#thread#ret#"),
+              j,
+              false
+            )
           )
-          Expr(:call, lv(:vecmemaybe), Symbol(mangledvar(opv), "##onevec##")), (otherarg,)
+          Expr(:call, lv(:vecmemaybe), Symbol(mangledvar(opv), "##onevec##")),
+          (otherarg,)
         end
       end
       Expr(:call, reductexpr)
@@ -249,11 +266,14 @@ function outer_reduct_combine_expressions(ls::LoopSet, retv)
         Expr(
           :call,
           lv(:vecmemaybe),
-          Expr(:call, gf, Symbol("#load#thread#ret#"), i, false),
-        ),
+          Expr(:call, gf, Symbol("#load#thread#ret#"), i, false)
+        )
       )
     else
-      push!(instrcall.args, Expr(:call, lv(:vecmemaybe), Symbol("#load#thread#ret#")))
+      push!(
+        instrcall.args,
+        Expr(:call, lv(:vecmemaybe), Symbol("#load#thread#ret#"))
+      )
     end
     push!(q.args, Expr(:(=), out, Expr(:call, :data, instrcall)))
     # push!(q.args, Expr(:(=), out, :(@show $data($instrcall))))
@@ -265,7 +285,7 @@ function thread_loop_summary!(
   ls::LoopSet,
   ua::UnrollArgs,
   threadedloop::Loop,
-  issecondthreadloop::Bool,
+  issecondthreadloop::Bool
 )
   W = ls.vector_width
   @unpack u₁loop, u₂loop, vloop, u₁, u₂max = ua
@@ -285,7 +305,7 @@ function thread_loop_summary!(
     :(
       $num_unroll_sym = Base.udiv_int(
         vadd_nw($lensym, $(UInt(unroll_factor - 1))),
-        $(UInt(unroll_factor)),
+        $(UInt(unroll_factor))
       )
     )
   end
@@ -309,10 +329,13 @@ function thread_loop_summary!(
         unroll_factor,
         threadedloop,
         iterstop_sym,
-        true,
+        true
       )
     else
-      iterstop = :($iterstop_sym::Int = vadd_nsw($iterstart_sym, vmul_nw($blksz_sym, $mf)))
+      iterstop = :(
+        $iterstop_sym::Int =
+          vadd_nsw($iterstart_sym, vmul_nw($blksz_sym, $mf))
+      )
       looprange = :($iterstart_sym:StaticInt{$mf}())
       lastrange = :($iterstart_sym:StaticInt{$mf}())
       push_loopbound_ends!(
@@ -321,14 +344,16 @@ function thread_loop_summary!(
         unroll_factor,
         threadedloop,
         :(vsub_nsw($iterstop_sym, one($iterstop_sym))),
-        false,
+        false
       )
     end
   else
     stepthread_sym = Symbol("#step#thread#$threadloopnumtag#")
     pushpreamble!(ls, :($stepthread_sym = $(getsym(step(threadedloop)))))
-    iterstop =
-      :($iterstop_sym = vadd_nsw($iterstart_sym, vmul_nw($blksz_sym, $stepthread_sym)))
+    iterstop = :(
+      $iterstop_sym =
+        vadd_nsw($iterstart_sym, vmul_nw($blksz_sym, $stepthread_sym))
+    )
     looprange = :($iterstart_sym:$stepthread_sym)
     lastrange = :($iterstart_sym:$stepthread_sym)
     push_loopbound_ends!(
@@ -337,7 +362,7 @@ function thread_loop_summary!(
       unroll_factor,
       threadedloop,
       :(vsub_nsw($iterstop_sym, one($iterstop_sym))),
-      false,
+      false
     )
   end
   define_len, define_num_unrolls, loopstart, iterstop, looprange, lastrange
@@ -347,7 +372,7 @@ function push_last_bound!(
   lastrange::Expr,
   lastexpr,
   iterstop,
-  unroll_factor::Int,
+  unroll_factor::Int
 )
   push!(lastrange.args, lastexpr)
   unroll_factor ≠ 1 && push!(looprange.args, :(min($lastexpr, $iterstop)))
@@ -359,7 +384,7 @@ function push_loopbound_ends!(
   unroll_factor::Int,
   threadedloop::Loop,
   iterstop,
-  offsetlast::Bool,
+  offsetlast::Bool
 )
   if unroll_factor == 1
     push!(looprange.args, iterstop)
@@ -370,7 +395,7 @@ function push_loopbound_ends!(
       lastrange,
       gethint(last(threadedloop)) + offsetlast,
       iterstop,
-      unroll_factor,
+      unroll_factor
     )
   else
     lastsym = getsym(last(threadedloop))
@@ -380,7 +405,7 @@ function push_loopbound_ends!(
         lastrange,
         :(vadd_nsw($lastsym, one($lastsym))),
         iterstop,
-        unroll_factor,
+        unroll_factor
       )
     else
       push_last_bound!(looprange, lastrange, lastsym, iterstop, unroll_factor)
@@ -398,7 +423,8 @@ function define_block_size(threadedloop, vloop, tn, W)
   if threadedloop === vloop
     quote
       $baseblocksizeuint, $nrem = divrem_fast($num_unroll, $thread_factor)
-      $baseblocksizeint = ($baseblocksizeuint << $(VectorizationBase.intlog2(W))) % Int
+      $baseblocksizeint =
+        ($baseblocksizeuint << $(VectorizationBase.intlog2(W))) % Int
       $remstep = $(Int(W))
     end
   else
@@ -419,7 +445,7 @@ function thread_one_loops_expr(
   OPS::Expr,
   ARF::Expr,
   AM::Expr,
-  LPSYM::Expr,
+  LPSYM::Expr
 )
   looplen = looplengthprod(ls)
   c = scale_cost(c, looplen)
@@ -427,11 +453,16 @@ function thread_one_loops_expr(
     _num_threads = _choose_num_threads(c, ntmax, Int64(looplen))::UInt
     _num_threads > 1 || return avx_body(ls, UNROLL)
     ntcallexpr = Expr(:call, %, Expr(:call, Threads.nthreads), UInt)
-    choose_nthread =
-      Expr(:(=), Symbol("#nthreads#"), Expr(:call, min, ntcallexpr, _num_threads))
+    choose_nthread = Expr(
+      :(=),
+      Symbol("#nthreads#"),
+      Expr(:call, min, ntcallexpr, _num_threads)
+    )
   else
-    choose_nthread =
-      :(_choose_num_threads($(Float32(c)), min(Threads.nthreads() % UInt, $ntmax)))
+    choose_nthread = :(_choose_num_threads(
+      $(Float32(c)),
+      min(Threads.nthreads() % UInt, $ntmax)
+    ))
     push_loop_length_expr!(choose_nthread, ls)
     choose_nthread = Expr(:(=), Symbol("#nthreads#"), choose_nthread)
   end
@@ -458,7 +489,7 @@ function thread_one_loops_expr(
     $AM,
     $LPSYM,
     Val(typeof(var"#avx#call#args#")),
-    flatten_to_tuple(var"#avx#call#args#")...,
+    flatten_to_tuple(var"#avx#call#args#")...
   ))
   update_return_values = if length(ls.outer_reductions) > 0
     retv = loopset_return_value(ls, Val(false))
@@ -467,7 +498,8 @@ function thread_one_loops_expr(
   else
     nothing
   end
-  retexpr = length(ls.outer_reductions) > 0 ? :(return $retv) : :(return nothing)
+  retexpr =
+    length(ls.outer_reductions) > 0 ? :(return $retv) : :(return nothing)
   iterdef = define_block_size(threadedloop, ua.vloop, 0, ls.vector_width)
   q = quote
     $choose_nthread # UInt
@@ -487,23 +519,27 @@ function thread_one_loops_expr(
         var"#thread#id#" = 0x00000000
         var"##do#thread##" = false
         for var"#threads#" in var"#threads#tuple#"
-
           var"#thread#launch#count#" = 0x00000000
           var"#thread#mask#" = PolyesterWeave.mask(var"#threads#")
           var"#nrequest#" = length(var"#threads#")
           var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#"
           var"##do#thread##" |= var"#threads#remain#"
           while var"#threads#remain#"
-            VectorizationBase.assume(var"#thread#mask#" ≠ zero(var"#thread#mask#"))
-            var"#trailzing#zeros#" = Base.trailing_zeros(var"#thread#mask#") % UInt32
+            VectorizationBase.assume(
+              var"#thread#mask#" ≠ zero(var"#thread#mask#")
+            )
+            var"#trailzing#zeros#" =
+              Base.trailing_zeros(var"#thread#mask#") % UInt32
             var"#nblock#size#thread#0#" = Core.ifelse(
               var"#thread#launch#count#" < (var"#nrem#thread#0#" % UInt32),
               vadd_nw(var"#base#block#size#thread#0#", var"#block#rem#step#0#"),
-              var"#base#block#size#thread#0#",
+              var"#base#block#size#thread#0#"
             )
-            var"#trailzing#zeros#" = vadd_nw(var"#trailzing#zeros#", 0x00000001)
+            var"#trailzing#zeros#" =
+              vadd_nw(var"#trailzing#zeros#", 0x00000001)
             $iterstop
-            var"#thread#id#" = vadd_nw(var"#thread#id#", var"#trailzing#zeros#")
+            var"#thread#id#" =
+              vadd_nw(var"#thread#id#", var"#trailzing#zeros#")
 
             var"##lbvargs#to_launch##" = ($loopboundexpr, var"#vargs#")
             avx_launch(
@@ -514,14 +550,16 @@ function thread_one_loops_expr(
               $LPSYM,
               StaticType{typeof(var"##lbvargs#to_launch##")}(),
               flatten_to_tuple(var"##lbvargs#to_launch##"),
-              var"#thread#id#",
+              var"#thread#id#"
             )
 
             var"#thread#mask#" >>>= var"#trailzing#zeros#"
 
             var"#iter#start#0#" = var"#iter#stop#0#"
-            var"#thread#launch#count#" = vadd_nw(var"#thread#launch#count#", 0x00000001)
-            var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#"
+            var"#thread#launch#count#" =
+              vadd_nw(var"#thread#launch#count#", 0x00000001)
+            var"#threads#remain#" =
+              var"#thread#launch#count#" ≠ var"#nrequest#"
           end
           var"#nrem#thread#0#" -= var"#nrequest#"
         end
@@ -544,8 +582,10 @@ function thread_one_loops_expr(
         (var"#thread#mask#" ≠ zero(var"#thread#mask#"))
       while var"#threads#remain#"
         VectorizationBase.assume(var"#thread#mask#" ≠ zero(var"#thread#mask#"))
-        var"#trailzing#zeros#" =
-          vadd_nw(Base.trailing_zeros(var"#thread#mask#") % UInt32, 0x00000001)
+        var"#trailzing#zeros#" = vadd_nw(
+          Base.trailing_zeros(var"#thread#mask#") % UInt32,
+          0x00000001
+        )
         var"#thread#mask#" >>>= var"#trailzing#zeros#"
         var"#thread#id#" = vadd_nw(var"#thread#id#", var"#trailzing#zeros#")
         var"#thread#ptr#" = ThreadingUtilities.taskpointer(var"#thread#id#")
@@ -570,16 +610,31 @@ function define_vthread_blocks(vloop, u₁loop, u₂loop, u₁, u₂, ntmax, tn)
   sntmax = staticexpr(ntmax % Int)
   if vloop === u₁loop
     :(
-      $lhs =
-        _choose_num_blocks($loopunrollname, StaticInt{$u₁}(), var"#nthreads#", $sntmax)
+      $lhs = _choose_num_blocks(
+        $loopunrollname,
+        StaticInt{$u₁}(),
+        var"#nthreads#",
+        $sntmax
+      )
     )
   elseif vloop === u₂loop
     :(
-      $lhs =
-        _choose_num_blocks($loopunrollname, StaticInt{$u₂}(), var"#nthreads#", $sntmax)
+      $lhs = _choose_num_blocks(
+        $loopunrollname,
+        StaticInt{$u₂}(),
+        var"#nthreads#",
+        $sntmax
+      )
     )
   else
-    :($lhs = _choose_num_blocks($loopunrollname, StaticInt{1}(), var"#nthreads#", $sntmax))
+    :(
+      $lhs = _choose_num_blocks(
+        $loopunrollname,
+        StaticInt{1}(),
+        var"#nthreads#",
+        $sntmax
+      )
+    )
   end
 end
 function define_thread_blocks(
@@ -590,7 +645,7 @@ function define_thread_blocks(
   u₂loop,
   u₁,
   u₂,
-  ntmax,
+  ntmax
 )
   if vloop === threadedloop1
     define_vthread_blocks(threadedloop1, u₁loop, u₂loop, u₁, u₂, ntmax, 0)
@@ -613,7 +668,7 @@ function thread_two_loops_expr(
   OPS::Expr,
   ARF::Expr,
   AM::Expr,
-  LPSYM::Expr,
+  LPSYM::Expr
 )
   looplen = looplengthprod(ls)
   # c = 0.0225 * c / looplen
@@ -622,11 +677,16 @@ function thread_two_loops_expr(
     _num_threads = _choose_num_threads(c, ntmax, Int64(looplen))::UInt
     _num_threads > 1 || return avx_body(ls, UNROLL)
     ntcallexpr = Expr(:call, %, Expr(:call, Threads.nthreads), UInt)
-    choose_nthread =
-      Expr(:(=), Symbol("#nthreads#"), Expr(:call, min, ntcallexpr, _num_threads))
+    choose_nthread = Expr(
+      :(=),
+      Symbol("#nthreads#"),
+      Expr(:call, min, ntcallexpr, _num_threads)
+    )
   else
-    choose_nthread =
-      :(_choose_num_threads($(Float32(c)), min(Threads.nthreads() % UInt, $ntmax)))
+    choose_nthread = :(_choose_num_threads(
+      $(Float32(c)),
+      min(Threads.nthreads() % UInt, $ntmax)
+    ))
     push_loop_length_expr!(choose_nthread, ls)
     choose_nthread = Expr(:(=), Symbol("#nthreads#"), choose_nthread)
   end
@@ -644,10 +704,18 @@ function thread_two_loops_expr(
   u₂ = u₂max
   threadedloop1 = getloop(ls, threadedid1)
   threadedloop2 = getloop(ls, threadedid2)
-  define_len1, define_num_unrolls1, loopstart1, iterstop1, looprange1, lastrange1 =
-    thread_loop_summary!(ls, ua, threadedloop1, false)
-  define_len2, define_num_unrolls2, loopstart2, iterstop2, looprange2, lastrange2 =
-    thread_loop_summary!(ls, ua, threadedloop2, true)
+  define_len1,
+  define_num_unrolls1,
+  loopstart1,
+  iterstop1,
+  looprange1,
+  lastrange1 = thread_loop_summary!(ls, ua, threadedloop1, false)
+  define_len2,
+  define_num_unrolls2,
+  loopstart2,
+  iterstop2,
+  looprange2,
+  lastrange2 = thread_loop_summary!(ls, ua, threadedloop2, true)
   loopboundexpr = Expr(:tuple)
   lastboundexpr = Expr(:tuple)
   for loop ∈ ls.loops
@@ -670,7 +738,7 @@ function thread_two_loops_expr(
     $AM,
     $LPSYM,
     Val(typeof(var"#avx#call#args#")),
-    flatten_to_tuple(var"#avx#call#args#")...,
+    flatten_to_tuple(var"#avx#call#args#")...
   ))
   update_return_values = if length(ls.outer_reductions) > 0
     retv = loopset_return_value(ls, Val(false))
@@ -679,11 +747,20 @@ function thread_two_loops_expr(
   else
     nothing
   end
-  blockdef =
-    define_thread_blocks(threadedloop1, threadedloop2, vloop, u₁loop, u₂loop, u₁, u₂, ntmax)
+  blockdef = define_thread_blocks(
+    threadedloop1,
+    threadedloop2,
+    vloop,
+    u₁loop,
+    u₂loop,
+    u₁,
+    u₂,
+    ntmax
+  )
   iterdef1 = define_block_size(threadedloop1, vloop, 0, ls.vector_width)
   iterdef2 = define_block_size(threadedloop2, vloop, 1, ls.vector_width)
-  retexpr = length(ls.outer_reductions) > 0 ? :(return $retv) : :(return nothing)
+  retexpr =
+    length(ls.outer_reductions) > 0 ? :(return $retv) : :(return nothing)
   q = quote
     $choose_nthread # UInt
     $loopstart1
@@ -701,32 +778,38 @@ function thread_two_loops_expr(
         var"#thread#factor#0#" = var"#num#unrolls#thread#0#"
         var"#thread#factor#1#" = var"#num#unrolls#thread#1#"
       else
-        var"##thread#0##excess##" = var"#num#unrolls#thread#0#" ≥ var"#nthreads#"
-        var"##thread#1##excess##" = var"#num#unrolls#thread#1#" ≥ var"#nthreads#"
+        var"##thread#0##excess##" =
+          var"#num#unrolls#thread#0#" ≥ var"#nthreads#"
+        var"##thread#1##excess##" =
+          var"#num#unrolls#thread#1#" ≥ var"#nthreads#"
         if var"##thread#0##excess##" & var"##thread#1##excess##"
           $blockdef
         elseif var"##thread#0##excess##" # var"#num#unrolls#thread#1#" is small but var"#num#unrolls#thread#0#" is not; we want to place a small one in front
-          (var"#thread#factor#1#", var"#thread#factor#0#") = _choose_num_blocks(
-            var"#num#unrolls#thread#1#",
-            StaticInt{1}(),
-            var"#nthreads#",
-            $(staticexpr(ntmax % Int)),
-          )
+          (var"#thread#factor#1#", var"#thread#factor#0#") =
+            _choose_num_blocks(
+              var"#num#unrolls#thread#1#",
+              StaticInt{1}(),
+              var"#nthreads#",
+              $(staticexpr(ntmax % Int))
+            )
         else # var"#num#unrolls#thread#0#" is small, and var"#num#unrolls#thread#1#" may or may not be
-          (var"#thread#factor#0#", var"#thread#factor#1#") = _choose_num_blocks(
-            var"#num#unrolls#thread#0#",
-            StaticInt{1}(),
-            var"#nthreads#",
-            $(staticexpr(ntmax % Int)),
-          )
+          (var"#thread#factor#0#", var"#thread#factor#1#") =
+            _choose_num_blocks(
+              var"#num#unrolls#thread#0#",
+              StaticInt{1}(),
+              var"#nthreads#",
+              $(staticexpr(ntmax % Int))
+            )
         end
-        var"#thread#factor#0#" = min(var"#thread#factor#0#", var"#num#unrolls#thread#0#")
-        var"#thread#factor#1#" = min(var"#thread#factor#1#", var"#num#unrolls#thread#1#")
+        var"#thread#factor#0#" =
+          min(var"#thread#factor#0#", var"#num#unrolls#thread#0#")
+        var"#thread#factor#1#" =
+          min(var"#thread#factor#1#", var"#num#unrolls#thread#1#")
       end
       # @show (var"#thread#factor#0#", var"#thread#factor#1#")
       var"#nrequest#" = vsub_nsw(
         vmul_nsw(var"#thread#factor#0#", var"#thread#factor#1#" % UInt32),
-        0x00000001,
+        0x00000001
       )
       var"#loop#1#start#init#" = var"#iter#start#0#"
       var"##do#thread##" = var"#nrequest#" ≠ 0x00000000
@@ -748,22 +831,27 @@ function thread_two_loops_expr(
           var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#"
           var"##do#thread##" |= var"#threads#remain#"
           while var"#threads#remain#"
-            VectorizationBase.assume(var"#thread#mask#" ≠ zero(var"#thread#mask#"))
-            var"#trailzing#zeros#" = Base.trailing_zeros(var"#thread#mask#") % UInt32
+            VectorizationBase.assume(
+              var"#thread#mask#" ≠ zero(var"#thread#mask#")
+            )
+            var"#trailzing#zeros#" =
+              Base.trailing_zeros(var"#thread#mask#") % UInt32
             var"#nblock#size#thread#0#" = Core.ifelse(
               var"#thread#launch#count#0#" < (var"#nrem#thread#0#" % UInt32),
               vadd_nw(var"#base#block#size#thread#0#", var"#block#rem#step#0#"),
-              var"#base#block#size#thread#0#",
+              var"#base#block#size#thread#0#"
             )
             var"#nblock#size#thread#1#" = Core.ifelse(
               var"#thread#launch#count#1#" < (var"#nrem#thread#1#" % UInt32),
               vadd_nw(var"#base#block#size#thread#1#", var"#block#rem#step#1#"),
-              var"#base#block#size#thread#1#",
+              var"#base#block#size#thread#1#"
             )
-            var"#trailzing#zeros#" = vadd_nw(var"#trailzing#zeros#", 0x00000001)
+            var"#trailzing#zeros#" =
+              vadd_nw(var"#trailzing#zeros#", 0x00000001)
             $iterstop1
             $iterstop2
-            var"#thread#id#" = vadd_nw(var"#thread#id#", var"#trailzing#zeros#")
+            var"#thread#id#" =
+              vadd_nw(var"#thread#id#", var"#trailzing#zeros#")
             # @show var"#thread#id#" $loopboundexpr
             var"##lbvargs#to_launch##" = ($loopboundexpr, var"#vargs#")
             avx_launch(
@@ -774,33 +862,39 @@ function thread_two_loops_expr(
               $LPSYM,
               StaticType{typeof(var"##lbvargs#to_launch##")}(),
               flatten_to_tuple(var"##lbvargs#to_launch##"),
-              var"#thread#id#",
+              var"#thread#id#"
             )
             var"#thread#mask#" >>>= var"#trailzing#zeros#"
 
             var"##end#inner##" =
-              var"#thread#launch#count#0#" == vsub_nw(var"#thread#factor#0#", 0x00000001)
+              var"#thread#launch#count#0#" ==
+              vsub_nw(var"#thread#factor#0#", 0x00000001)
             var"#thread#launch#count#0#" = Core.ifelse(
               var"##end#inner##",
               0x00000000,
-              vadd_nw(var"#thread#launch#count#0#", 0x00000001),
+              vadd_nw(var"#thread#launch#count#0#", 0x00000001)
             )
             var"#thread#launch#count#1#" = Core.ifelse(
               var"##end#inner##",
               var"#thread#launch#count#1#" + 0x00000001,
-              var"#thread#launch#count#1#",
+              var"#thread#launch#count#1#"
             )
 
             var"#iter#start#0#" = Core.ifelse(
               var"##end#inner##",
               var"#loop#1#start#init#",
-              var"#iter#stop#0#",
+              var"#iter#stop#0#"
+            )
+            var"#iter#start#1#" = Core.ifelse(
+              var"##end#inner##",
+              var"#iter#stop#1#",
+              var"#iter#start#1#"
             )
-            var"#iter#start#1#" =
-              Core.ifelse(var"##end#inner##", var"#iter#stop#1#", var"#iter#start#1#")
 
-            var"#thread#launch#count#" = vadd_nw(var"#thread#launch#count#", 0x00000001)
-            var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#"
+            var"#thread#launch#count#" =
+              vadd_nw(var"#thread#launch#count#", 0x00000001)
+            var"#threads#remain#" =
+              var"#thread#launch#count#" ≠ var"#nrequest#"
           end
         end
       else# eliminate undef var errors that the compiler should be able to figure out are unreachable, but doesn't
@@ -822,8 +916,10 @@ function thread_two_loops_expr(
         (var"#thread#mask#" ≠ zero(var"#thread#mask#"))
       while var"#threads#remain#"
         VectorizationBase.assume(var"#thread#mask#" ≠ zero(var"#thread#mask#"))
-        var"#trailzing#zeros#" =
-          vadd_nw(Base.trailing_zeros(var"#thread#mask#") % UInt32, 0x00000001)
+        var"#trailzing#zeros#" = vadd_nw(
+          Base.trailing_zeros(var"#thread#mask#") % UInt32,
+          0x00000001
+        )
         var"#thread#mask#" >>>= var"#trailzing#zeros#"
         var"#thread#id#" = vadd_nw(var"#thread#id#", var"#trailzing#zeros#")
         var"#thread#ptr#" = ThreadingUtilities.taskpointer(var"#thread#id#")
@@ -839,7 +935,8 @@ function thread_two_loops_expr(
 end
 
 function valid_thread_loops(ls::LoopSet)
-  order, u₁loop, u₂loop, vectorized, u₁, u₂, c, shouldinline = choose_order_cost(ls)
+  order, u₁loop, u₂loop, vectorized, u₁, u₂, c, shouldinline =
+    choose_order_cost(ls)
   # NOTE: `names` are being placed in the opposite order here versus normal lowering!
   copyto!(names(ls), order)
   init_loop_map!(ls)
@@ -877,15 +974,37 @@ function avx_threads_expr(
   OPS::Expr,
   ARF::Expr,
   AM::Expr,
-  LPSYM::Expr,
+  LPSYM::Expr
 )
   valid_thread_loop, ua, c = valid_thread_loops(ls)
   num_candiates = sum(valid_thread_loop)
   if (num_candiates == 0) || (nt ≤ 1) # it was called from `avx_body` but now `nt` was set to `1`
     avx_body(ls, UNROLL)
   elseif (num_candiates == 1) || (nt ≤ 3)
-    thread_one_loops_expr(ls, ua, valid_thread_loop, nt, c, UNROLL, OPS, ARF, AM, LPSYM)
+    thread_one_loops_expr(
+      ls,
+      ua,
+      valid_thread_loop,
+      nt,
+      c,
+      UNROLL,
+      OPS,
+      ARF,
+      AM,
+      LPSYM
+    )
   else # requires at least 4 threads
-    thread_two_loops_expr(ls, ua, valid_thread_loop, nt, c, UNROLL, OPS, ARF, AM, LPSYM)
+    thread_two_loops_expr(
+      ls,
+      ua,
+      valid_thread_loop,
+      nt,
+      c,
+      UNROLL,
+      OPS,
+      ARF,
+      AM,
+      LPSYM
+    )
   end
 end
diff --git a/src/codegen/lowering.jl b/src/codegen/lowering.jl
index c857d51cf..a058bc0b3 100644
--- a/src/codegen/lowering.jl
+++ b/src/codegen/lowering.jl
@@ -1,5 +1,4 @@
 
-
 # the `lowernonstore` and `lowerstore` options are there as a means of lowering all non-store operations before lowering the stores.
 function lower!(
   q::Expr,
@@ -11,7 +10,7 @@ function lower!(
   suffix::Int,
   mask::Bool,
   lowernonstore::Bool,
-  lowerstore::Bool,
+  lowerstore::Bool
 )
   ua = UnrollArgs(ls, u₁, unrollsyms, u₂, suffix)
   for op ∈ ops
@@ -37,7 +36,13 @@ end
 function isu₂invalidstorereorder(ls::LoopSet, us::UnrollSpecification)
   us.u₂ == -1 ? false : ls.validreorder[ls.loopordermap[us.u₂loopnum]] ≠ 0x03
 end
-function lower_block(ls::LoopSet, us::UnrollSpecification, n::Int, mask::Bool, UF::Int)
+function lower_block(
+  ls::LoopSet,
+  us::UnrollSpecification,
+  n::Int,
+  mask::Bool,
+  UF::Int
+)
   @unpack u₁loopnum, u₂loopnum, vloopnum, u₁, u₂ = us
   ops = oporder(ls)
   order = names(ls)
@@ -51,7 +56,18 @@ function lower_block(ls::LoopSet, us::UnrollSpecification, n::Int, mask::Bool, U
   cannot_reorder_u₂ = isu₂invalidstorereorder(ls, us)
   for prepost ∈ 1:2
     # !u₁ && !u₂
-    lower!(blockq, ops[1, 1, prepost, n], ls, unrollsyms, u₁, u₂, -1, mask, true, true)
+    lower!(
+      blockq,
+      ops[1, 1, prepost, n],
+      ls,
+      unrollsyms,
+      u₁,
+      u₂,
+      -1,
+      mask,
+      true,
+      true
+    )
     # isu₁unrolled, isu₂unrolled, after_loop, n
     opsv1 = ops[1, 2, prepost, n]
     opsv2 = ops[2, 2, prepost, n]
@@ -61,7 +77,18 @@ function lower_block(ls::LoopSet, us::UnrollSpecification, n::Int, mask::Bool, U
       iszero(length(opsv2)) || (nstores += sum(isstore, opsv2))
       # if nstores
       if (length(opsv1) + length(opsv2) == nstores) && u₂ > 1 # all_u₂_ops_store
-        lower!(blockq, ops[2, 1, prepost, n], ls, unrollsyms, u₁, u₂, -1, mask, true, true) # for u ∈ 0:u₁-1
+        lower!(
+          blockq,
+          ops[2, 1, prepost, n],
+          ls,
+          unrollsyms,
+          u₁,
+          u₂,
+          -1,
+          mask,
+          true,
+          true
+        ) # for u ∈ 0:u₁-1
         lower_tiled_store!(blockq, opsv1, opsv2, ls, unrollsyms, u₁, u₂, mask)
       else
         for store ∈ (false, true)
@@ -84,7 +111,7 @@ function lower_block(ls::LoopSet, us::UnrollSpecification, n::Int, mask::Bool, U
               t,
               mask & !(dontmaskfirsttiles & (t < u₂ - 1)),
               lowernonstore,
-              lowerstore,
+              lowerstore
             )
             if iszero(t) && !store #  u₁ && !u₂
               # for u ∈ 0:u₁-1
@@ -98,7 +125,7 @@ function lower_block(ls::LoopSet, us::UnrollSpecification, n::Int, mask::Bool, U
                 -1,
                 mask,
                 true,
-                true,
+                true
               )
               # end
             end
@@ -114,7 +141,7 @@ function lower_block(ls::LoopSet, us::UnrollSpecification, n::Int, mask::Bool, U
               t,
               mask & !(dontmaskfirsttiles & (t < u₂ - 1)),
               lowernonstore,
-              lowerstore,
+              lowerstore
             )
             # end
           end
@@ -122,11 +149,44 @@ function lower_block(ls::LoopSet, us::UnrollSpecification, n::Int, mask::Bool, U
         end
       end
     elseif cannot_reorder_u₂
-      lower!(blockq, ops[2, 1, prepost, n], ls, unrollsyms, u₁, u₂, -1, mask, true, true)
+      lower!(
+        blockq,
+        ops[2, 1, prepost, n],
+        ls,
+        unrollsyms,
+        u₁,
+        u₂,
+        -1,
+        mask,
+        true,
+        true
+      )
     else
       # for u ∈ 0:u₁-1     #  u₁ && !u₂
-      lower!(blockq, ops[2, 1, prepost, n], ls, unrollsyms, u₁, u₂, -1, mask, true, false)
-      lower!(blockq, ops[2, 1, prepost, n], ls, unrollsyms, u₁, u₂, -1, mask, false, true)
+      lower!(
+        blockq,
+        ops[2, 1, prepost, n],
+        ls,
+        unrollsyms,
+        u₁,
+        u₂,
+        -1,
+        mask,
+        true,
+        false
+      )
+      lower!(
+        blockq,
+        ops[2, 1, prepost, n],
+        ls,
+        unrollsyms,
+        u₁,
+        u₂,
+        -1,
+        mask,
+        false,
+        true
+      )
       # end
     end
     if n > 1 && prepost == 1
@@ -177,7 +237,12 @@ function allinteriorunrolled(ls::LoopSet, us::UnrollSpecification, N)
   unroll_total ≤ 16
 end
 
-function lower_no_unroll(ls::LoopSet, us::UnrollSpecification, n::Int, inclmask::Bool)
+function lower_no_unroll(
+  ls::LoopSet,
+  us::UnrollSpecification,
+  n::Int,
+  inclmask::Bool
+)
   nisvectorized = isvectorized(us, n)
   loop = getloop(ls, n)
   tc = terminatecondition(ls, us, n, inclmask, 1)
@@ -224,7 +289,7 @@ function lower_unrolled_dynamic(
   ls::LoopSet,
   us::UnrollSpecification,
   n::Int,
-  inclmask::Bool,
+  inclmask::Bool
 )
   UF = unrollfactor(us, n)
   isone(UF) && return lower_no_unroll(ls, us, n, inclmask)
@@ -240,7 +305,8 @@ function lower_unrolled_dynamic(
   if W ≠ 0 & isknown(first(loop)) & isknown(step(loop))
     loopisstatic = isknown(last(loop))
     # something other than the default hint currently means an UpperBoundedInteger was passed as an argument
-    loopisbounded = (looplength < UFW) & (loopisstatic | (gethint(last(loop)) ≠ 1024))
+    loopisbounded =
+      (looplength < UFW) & (loopisstatic | (gethint(last(loop)) ≠ 1024))
   else
     loopisstatic = false
     loopisbounded = false
@@ -251,14 +317,19 @@ function lower_unrolled_dynamic(
     UFWnew = cld(looplength, cld(looplength, UFW))
     UF = cld(UFWnew, W)
     UFW = UF * W
-    us = nisunrolled ? UnrollSpecification(us, UF, u₂) : UnrollSpecification(us, u₁, UF)
+    us =
+      nisunrolled ? UnrollSpecification(us, UF, u₂) :
+      UnrollSpecification(us, u₁, UF)
   end
   remmask = inclmask | nisvectorized
   sl = startloop(ls, us, n, false)
   UFt = loopisstatic ? cld(looplength % UFW, W) : 1
   # Don't place remainder first if we're going to have to mask this loop (i.e., if this loop is vectorized)
   remfirst =
-    loopisstatic & (!nisvectorized) & (UFt > 0) & !(unsigned(Ureduct) < unsigned(UF))
+    loopisstatic &
+    (!nisvectorized) &
+    (UFt > 0) &
+    !(unsigned(Ureduct) < unsigned(UF))
   tc = terminatecondition(ls, us, n, inclmask, remfirst ? 1 : UF)
   # Don't need to create the body if loop is dynamic and bounded
   dynamicbounded = ((!loopisstatic) & loopisbounded)
@@ -322,19 +393,30 @@ function lower_unrolled_dynamic(
       if length(loop) < UF * W
         Expr(:block)
       else
-        Expr(:block, add_upper_outer_reductions(ls, q, Ureduct, UF, loop, nisvectorized))
+        Expr(
+          :block,
+          add_upper_outer_reductions(ls, q, Ureduct, UF, loop, nisvectorized)
+        )
       end
     else
-      Expr(:block, add_upper_outer_reductions(ls, q, Ureduct, UF, loop, nisvectorized))
+      Expr(
+        :block,
+        add_upper_outer_reductions(ls, q, Ureduct, UF, loop, nisvectorized)
+      )
     end
     if add_cleanup
       cleanup_expr = Expr(blockhead)
-      blockhead === :block ||
-        push!(cleanup_expr.args, terminatecondition(ls, us, n, inclmask, UF_cleanup))
+      blockhead === :block || push!(
+        cleanup_expr.args,
+        terminatecondition(ls, us, n, inclmask, UF_cleanup)
+      )
       us_cleanup =
         nisunrolled ? UnrollSpecification(us, UF_cleanup, u₂) :
         UnrollSpecification(us, u₁, UF_cleanup)
-      push!(cleanup_expr.args, lower_block(ls, us_cleanup, n, inclmask, UF_cleanup))
+      push!(
+        cleanup_expr.args,
+        lower_block(ls, us_cleanup, n, inclmask, UF_cleanup)
+      )
       push!(_q.args, cleanup_expr)
     end
     UFt > 0 && push!(_q.args, remblock)
@@ -357,13 +439,19 @@ function lower_unrolled_dynamic(
     UF = rem_uf >> 1
     UFt = rem_uf - UF
     ust =
-      nisunrolled ? UnrollSpecification(us, UFt, u₂) : UnrollSpecification(us, u₁, UFt)
+      nisunrolled ? UnrollSpecification(us, UFt, u₂) :
+      UnrollSpecification(us, u₁, UFt)
     newblock = lower_block(ls, ust, n, remmask, UFt)
     # comparison = unrollremcomparison(ls, loop, UFt, n, nisvectorized, remfirst)
     comparison = terminatecondition(ls, us, n, inclmask, UFt)
     UFt = 1
     UF += 1 - iseven(rem_uf)
-    Expr(:block, q, Expr(iseven(rem_uf) ? :while : :if, comparison, newblock), remblock)
+    Expr(
+      :block,
+      q,
+      Expr(iseven(rem_uf) ? :while : :if, comparison, newblock),
+      remblock
+    )
   else
     Expr(:block, q, remblock)
   end
@@ -372,7 +460,8 @@ function lower_unrolled_dynamic(
     iforelseif = :if
     while true
       ust =
-        nisunrolled ? UnrollSpecification(us, UFt, u₂) : UnrollSpecification(us, u₁, UFt)
+        nisunrolled ? UnrollSpecification(us, UFt, u₂) :
+        UnrollSpecification(us, u₁, UFt)
       newblock = lower_block(ls, ust, n, remmask, UFt)
       if (UFt ≥ UF - 1 + nisvectorized) || UFt == Ureduct || loopisstatic
         if isone(num_loops(ls)) && isone(UFt) && isone(Ureduct)
@@ -381,10 +470,14 @@ function lower_unrolled_dynamic(
         push!(remblock.args, newblock)
         break
       end
-      comparison = unrollremcomparison(ls, loop, UFt, n, nisvectorized, remfirst)
+      comparison =
+        unrollremcomparison(ls, loop, UFt, n, nisvectorized, remfirst)
       if isone(num_loops(ls)) && isone(UFt)
         remblocknew = Expr(:if, comparison, newblock)
-        push!(remblock.args, Expr(:block, Expr(:let, definemask(loop), remblocknew)))
+        push!(
+          remblock.args,
+          Expr(:block, Expr(:let, definemask(loop), remblocknew))
+        )
         remblock = remblocknew
       else
         remblocknew = Expr(iforelseif, comparison, newblock)
@@ -408,7 +501,7 @@ function lower_unrolled_dynamic(
       ls,
       order[u₁loopnum],
       order[us.u₂loopnum],
-      vectorized,
+      vectorized
     )
     Expr(:block, pre, Expr(:let, sl, q), post)
   else
@@ -421,7 +514,7 @@ function unrollremcomparison(
   UFt::Int,
   n::Int,
   nisvectorized::Bool,
-  remfirst::Bool,
+  remfirst::Bool
 )
   termind = ls.lssm.terminators[n]
   if iszero(termind)
@@ -430,7 +523,12 @@ function unrollremcomparison(
     pointerremcomparison(ls, termind, UFt, n, nisvectorized, remfirst, loop)
   end
 end
-function loopvarremcomparison(loop::Loop, UFt::Int, nisvectorized::Bool, remfirst::Bool)
+function loopvarremcomparison(
+  loop::Loop,
+  UFt::Int,
+  nisvectorized::Bool,
+  remfirst::Bool
+)
   loopsym = loop.itersymbol
   loopstep = loop.step
   if nisvectorized
@@ -442,7 +540,7 @@ function loopvarremcomparison(loop::Loop, UFt::Int, nisvectorized::Bool, remfirs
       :call,
       GlobalRef(Base, :<),
       loopsym,
-      gethint(first(loop)) + UFt * gethint(loopstep) - 1,
+      gethint(first(loop)) + UFt * gethint(loopstep) - 1
     )
   elseif isknown(last(loop))
     if isknown(loopstep)
@@ -450,21 +548,21 @@ function loopvarremcomparison(loop::Loop, UFt::Int, nisvectorized::Bool, remfirs
         :call,
         GlobalRef(Base, :>),
         loopsym,
-        gethint(last(loop)) - UFt * gethint(loopstep),
+        gethint(last(loop)) - UFt * gethint(loopstep)
       )
     elseif isone(UFt)
       Expr(
         :call,
         GlobalRef(Base, :>),
         loopsym,
-        subexpr(gethint(last(loop)), getsym(loopstep)),
+        subexpr(gethint(last(loop)), getsym(loopstep))
       )
     else
       Expr(
         :call,
         GlobalRef(Base, :>),
         loopsym,
-        subexpr(gethint(last(loop)), mulexpr(getsym(loopstep), UFt)),
+        subexpr(gethint(last(loop)), mulexpr(getsym(loopstep), UFt))
       )
     end
   else
@@ -473,21 +571,26 @@ function loopvarremcomparison(loop::Loop, UFt::Int, nisvectorized::Bool, remfirs
         :call,
         GlobalRef(Base, :>),
         loopsym,
-        Expr(:call, lv(:vsub_nsw), getsym(last(loop)), UFt * gethint(loopstep)),
+        Expr(:call, lv(:vsub_nsw), getsym(last(loop)), UFt * gethint(loopstep))
       )
     elseif isone(UFt)
       Expr(
         :call,
         GlobalRef(Base, :>),
         loopsym,
-        Expr(:call, lv(:vsub_nsw), getsym(last(loop)), getsym(loopstep)),
+        Expr(:call, lv(:vsub_nsw), getsym(last(loop)), getsym(loopstep))
       )
     else
       Expr(
         :call,
         GlobalRef(Base, :>),
         loopsym,
-        Expr(:call, lv(:vsub_nsw), getsym(last(loop)), mulexpr(getsym(loopstep), UFt)),
+        Expr(
+          :call,
+          lv(:vsub_nsw),
+          getsym(last(loop)),
+          mulexpr(getsym(loopstep), UFt)
+        )
       )
     end
   end
@@ -499,7 +602,7 @@ function pointerremcomparison(
   n::Int,
   nisvectorized::Bool,
   remfirst::Bool,
-  loop::Loop,
+  loop::Loop
 )
   lssm = ls.lssm
   termar = lssm.incrementedptrs[n][termind]
@@ -508,14 +611,24 @@ function pointerremcomparison(
   ptroff = vptr_offset(ptr)
   if remfirst
     cmp = GlobalRef(VectorizationBase, :vlt)
-    Expr(:call, cmp, ptroff, pointermax(ls, ptrdef, n, 1 - UFt, nisvectorized, loop), ptr)
+    Expr(
+      :call,
+      cmp,
+      ptroff,
+      pointermax(ls, ptrdef, n, 1 - UFt, nisvectorized, loop),
+      ptr
+    )
   else
     cmp = GlobalRef(VectorizationBase, :vge)
     Expr(:call, cmp, ptroff, maxsym(ptr, UFt), ptr)
   end
 end
 
-@generated function of_same_size(::Type{T}, ::Type{S}, ::StaticInt{R}) where {T,S,R}
+@generated function of_same_size(
+  ::Type{T},
+  ::Type{S},
+  ::StaticInt{R}
+) where {T,S,R}
   sizeof_S = sizeof(S)
   sizeof_T = sizeof(T)
   if T <: Integer
@@ -546,7 +659,8 @@ end
   of_same_size(
     T,
     S,
-    VectorizationBase.register_size() ÷ VectorizationBase.simd_integer_register_size(),
+    VectorizationBase.register_size() ÷
+    VectorizationBase.simd_integer_register_size()
   )
 end
 function outer_reduction_zero(
@@ -554,7 +668,7 @@ function outer_reduction_zero(
   u₁u::Bool,
   Umax::Int,
   reduct_class::Float64,
-  rs::Union{Expr,StaticInt},
+  rs::Union{Expr,StaticInt}
 )
   isifelse = instruction(op).instr === :ifelse
   reduct_zero = if isifelse
@@ -565,7 +679,8 @@ function outer_reduction_zero(
   end
   # Tsym = outer_reduct_init_typename(op)
   # Tsym = ELTYPESYMBOL
-  Tsym = Expr(:call, lv(:of_same_size), outer_reduct_init_typename(op), ELTYPESYMBOL)
+  Tsym =
+    Expr(:call, lv(:of_same_size), outer_reduct_init_typename(op), ELTYPESYMBOL)
   if isvectorized(op)
     if Umax == 1 || !u₁u
       if reduct_zero === :zero
@@ -578,12 +693,19 @@ function outer_reduction_zero(
           lv(:_vbroadcast),
           VECTORWIDTHSYMBOL,
           Expr(:call, lv(reduct_zero), Tsym),
-          rs,
+          rs
         )
       end
     else
       if reduct_zero === :zero
-        Expr(:call, lv(:zero_vecunroll), staticexpr(Umax), VECTORWIDTHSYMBOL, Tsym, rs)
+        Expr(
+          :call,
+          lv(:zero_vecunroll),
+          staticexpr(Umax),
+          VECTORWIDTHSYMBOL,
+          Tsym,
+          rs
+        )
       elseif isifelse
         Expr(
           :call,
@@ -591,7 +713,7 @@ function outer_reduction_zero(
           staticexpr(Umax),
           VECTORWIDTHSYMBOL,
           reduct_zero,
-          rs,
+          rs
         )
       else
         Expr(
@@ -600,7 +722,7 @@ function outer_reduction_zero(
           staticexpr(Umax),
           VECTORWIDTHSYMBOL,
           Expr(:call, reduct_zero, Tsym),
-          rs,
+          rs
         )
       end
     end
@@ -618,7 +740,7 @@ function initialize_outer_reductions!(
   op::Operation,
   _Umax::Int,
   us::UnrollSpecification,
-  rs::Union{Expr,StaticInt},
+  rs::Union{Expr,StaticInt}
 )
   @unpack u₁, u₂ = us
   Umax = u₂ == -1 ? _Umax : u₁
@@ -628,9 +750,15 @@ function initialize_outer_reductions!(
     getloop(ls, us.u₁loopnum).itersymbol,
     getloop(ls, us.u₂loopnum).itersymbol,
     getloop(ls, us.vloopnum).itersymbol,
-    ls,
+    ls
   )#, u₂)
-  z = outer_reduction_zero(op, u₁u, Umax, reduction_instruction_class(instruction(op)), rs)
+  z = outer_reduction_zero(
+    op,
+    u₁u,
+    Umax,
+    reduction_instruction_class(instruction(op)),
+    rs
+  )
   mvar = variable_name(op, -1)
   if (u₂ == -1)
     push!(q.args, Expr(:(=), Symbol(mvar, '_', _Umax), z))
@@ -645,7 +773,11 @@ function initialize_outer_reductions!(
   end
   nothing
 end
-function initialize_outer_reductions!(q::Expr, ls::LoopSet, Umax::Union{Int,StaticInt})
+function initialize_outer_reductions!(
+  q::Expr,
+  ls::LoopSet,
+  Umax::Union{Int,StaticInt}
+)
   rs = staticexpr(reg_size(ls))
   us = ls.unrollspecification
   for or ∈ ls.outer_reductions
@@ -655,18 +787,22 @@ end
 initialize_outer_reductions!(ls::LoopSet, Umax::Int) =
   initialize_outer_reductions!(ls.preamble, ls, Umax)
 function add_upper_comp_check(unrolledloop, loopbuffer)
-
   if isstaticloop(unrolledloop)
     Expr(:call, Base.GlobalRef(Base, :≥), length(unrolledloop), loopbuffer)
   elseif isknown(first(unrolledloop))
     if isone(first(unrolledloop))
-      Expr(:call, Base.GlobalRef(Base, :≥), getsym(last(unrolledloop)), loopbuffer)
+      Expr(
+        :call,
+        Base.GlobalRef(Base, :≥),
+        getsym(last(unrolledloop)),
+        loopbuffer
+      )
     else
       Expr(
         :call,
         Base.GlobalRef(Base, :≥),
         getsym(last(unrolledloop)),
-        addexpr(loopbuffer, gethint(first(unrolledloop)) - 1),
+        addexpr(loopbuffer, gethint(first(unrolledloop)) - 1)
       )
     end
   elseif isknown(last(unrolledloop))
@@ -677,9 +813,9 @@ function add_upper_comp_check(unrolledloop, loopbuffer)
         :call,
         lv(:vsub_nsw),
         gethint(last(unrolledloop)) + 1,
-        getsym(first(unrolledloop)),
+        getsym(first(unrolledloop))
       ),
-      loopbuffer,
+      loopbuffer
     )
   else# both are given by symbols
     Expr(
@@ -689,9 +825,9 @@ function add_upper_comp_check(unrolledloop, loopbuffer)
         :call,
         lv(:vsub_nsw),
         getsym(last(unrolledloop)),
-        Expr(:call, lv(:vsub_nsw), getsym(first(unrolledloop)), staticexpr(1)),
+        Expr(:call, lv(:vsub_nsw), getsym(first(unrolledloop)), staticexpr(1))
       ),
-      loopbuffer,
+      loopbuffer
     )
   end
 end
@@ -701,7 +837,7 @@ function add_upper_outer_reductions(
   Ulow::Int,
   Uhigh::Int,
   unrolledloop::Loop,
-  reductisvectorized::Bool,
+  reductisvectorized::Bool
 )
   ifq = Expr(:block)
   ifqlet = Expr(:block)
@@ -737,7 +873,10 @@ function add_upper_outer_reductions(
     end
   end
   ncomparison = if reductisvectorized
-    add_upper_comp_check(unrolledloop, mulexpr(VECTORWIDTHSYMBOL, Uhigh, step(unrolledloop)))
+    add_upper_comp_check(
+      unrolledloop,
+      mulexpr(VECTORWIDTHSYMBOL, Uhigh, step(unrolledloop))
+    )
   elseif isknown(step(unrolledloop))
     add_upper_comp_check(unrolledloop, Uhigh * gethint(step(unrolledloop)))
   else
@@ -776,8 +915,13 @@ function reduce_expr!(q::Expr, ls::LoopSet, U::Int)
             Expr(
               :(=),
               var,
-              Expr(:call, reduction_scalar_combine(op), Symbol(mvar, "##onevec##"), var),
-            ),
+              Expr(
+                :call,
+                reduction_scalar_combine(op),
+                Symbol(mvar, "##onevec##"),
+                var
+              )
+            )
           )
         else
           reductexpr = ifelse_reduction(:IfElseReduced, op) do opv
@@ -785,7 +929,11 @@ function reduce_expr!(q::Expr, ls::LoopSet, U::Int)
           end
           push!(
             q.args,
-            Expr(:(=), var, Expr(:call, reductexpr, Symbol(mvar, "##onevec##"), var)),
+            Expr(
+              :(=),
+              var,
+              Expr(:call, reductexpr, Symbol(mvar, "##onevec##"), var)
+            )
           )
         end
       else
@@ -801,7 +949,7 @@ function reinit_push_preblockpost!(
   post::Expr,
   z::Expr,
   s::Symbol,
-  reduct::Symbol,
+  reduct::Symbol
 )
   push!(letblock.args, Expr(:(=), s, z))
   tempsym = gensym(s) # placeholder
@@ -816,7 +964,7 @@ function reinit_and_update_tiled_outer_reduct!(
   ls::LoopSet,
   u₁loopsym::Symbol,
   u₂loopsym::Symbol,
-  vloopsym::Symbol,
+  vloopsym::Symbol
 )
   rs = staticexpr(reg_size(ls))
   usorig = ls.unrollspecification
@@ -840,14 +988,29 @@ function reinit_and_update_tiled_outer_reduct!(
         post,
         z,
         Symbol(mvar, '_', usorig.u₁),
-        reduct,
+        reduct
       )
     else # it's u₂unrolled
       for u ∈ 0:Umax-1
-        reinit_push_preblockpost!(letblock, pre, block, post, z, Symbol(mvar, u), reduct)
+        reinit_push_preblockpost!(
+          letblock,
+          pre,
+          block,
+          post,
+          z,
+          Symbol(mvar, u),
+          reduct
+        )
       end
     end
-    initialize_outer_reductions!(letblock, ls, ls.operations[or], ureduct(ls), usorig, rs)
+    initialize_outer_reductions!(
+      letblock,
+      ls,
+      ls.operations[or],
+      ureduct(ls),
+      usorig,
+      rs
+    )
   end
   pre, post
 end
@@ -918,8 +1081,12 @@ function init_remblock(unrolledloop::Loop, lssm::LoopStartStopManager, n::Int)#u
   if iszero(termind)
     rangesym = unrolledloop.rangesym
     if rangesym === Symbol("")
-      condition =
-        Expr(:call, lv(:cmpend), unrolledloop.itersymbol, staticloopexpr(unrolledloop))
+      condition = Expr(
+        :call,
+        lv(:cmpend),
+        unrolledloop.itersymbol,
+        staticloopexpr(unrolledloop)
+      )
     else
       condition = Expr(:call, lv(:cmpend), unrolledloop.itersymbol, rangesym)
     end
@@ -927,7 +1094,13 @@ function init_remblock(unrolledloop::Loop, lssm::LoopStartStopManager, n::Int)#u
     termar = lssm.incrementedptrs[n][termind]
     ptr = vptr(termar)
     ptroff = vptr_offset(ptr)
-    condition = Expr(:call, GlobalRef(VectorizationBase, :vlt), ptroff, maxsym(ptr, 0), ptr)
+    condition = Expr(
+      :call,
+      GlobalRef(VectorizationBase, :vlt),
+      ptroff,
+      maxsym(ptr, 0),
+      ptr
+    )
   end
   Expr(:if, condition)
 end
@@ -967,7 +1140,12 @@ function definemask(loop::Loop)
     maskexpr(addexpr(lenexpr, 1))
   end
 end
-function define_eltype_vec_width!(q::Expr, ls::LoopSet, vectorized, ortypdefined::Bool)
+function define_eltype_vec_width!(
+  q::Expr,
+  ls::LoopSet,
+  vectorized,
+  ortypdefined::Bool
+)
   push!(q.args, Expr(:(=), ELTYPESYMBOL, determine_eltype(ls, ortypdefined)))
   push!(q.args, Expr(:(=), VECTORWIDTHSYMBOL, determine_width(ls, vectorized)))
   nothing
@@ -998,15 +1176,13 @@ function setup_preamble!(ls::LoopSet, us::UnrollSpecification, Ureduct::Int)
         getloop(ls, us.vloopnum),
         u₁,
         u₂,
-        -1,
+        -1
       )
       lower_compute!(ls.preamble, op, ls, ua, false)
     end
   end
 end
-function lsexpr(ls::LoopSet, q)
-  Expr(:block, ls.preamble, q)
-end
+lsexpr(ls::LoopSet, q) = Expr(:block, ls.preamble, q)
 
 function isanouterreduction(ls::LoopSet, op::Operation)
   opname = name(op)
@@ -1058,8 +1234,8 @@ function calc_Ureduct!(ls::LoopSet, us::UnrollSpecification)
       elseif !((u₁ui == Int(u₁u)) & (u₂ui == Int(u₁u)))
         throw(
           ArgumentError(
-            "Doesn't currenly handle differently unrolled reductions yet, please file an issue with an example.",
-          ),
+            "Doesn't currently handle differently unrolled reductions yet, please file an issue with an example."
+          )
         )
       end
     end
@@ -1079,16 +1255,27 @@ function lower_unrollspec(ls::LoopSet)
   Ureduct = calc_Ureduct!(ls, us)
   setup_preamble!(ls, us, Ureduct)
   initgesps = add_loop_start_stop_manager!(ls)
-  q = Expr(:let, initgesps, lower_unrolled_dynamic(ls, us, num_loops(ls), false))
+  q =
+    Expr(:let, initgesps, lower_unrolled_dynamic(ls, us, num_loops(ls), false))
   q = gc_preserve(ls, Expr(:block, q))
   reduce_expr!(q, ls, Ureduct)
   lsexpr(ls, q)
 end
 
-function lower(ls::LoopSet, order, u₁loop, u₂loop, vectorized, u₁, u₂, inline::Bool)
+function lower(
+  ls::LoopSet,
+  order,
+  u₁loop,
+  u₂loop,
+  vectorized,
+  u₁,
+  u₂,
+  inline::Bool
+)
   cacheunrolled!(ls, u₁loop, u₂loop, vectorized)
   fillorder!(ls, order, u₁loop, u₂loop, u₂, vectorized)
-  ls.unrollspecification = UnrollSpecification(ls, u₁loop, u₂loop, vectorized, u₁, u₂)
+  ls.unrollspecification =
+    UnrollSpecification(ls, u₁loop, u₂loop, vectorized, u₁, u₂)
   q = lower_unrollspec(ls)
   inline && pushfirst!(q.args, Expr(:meta, :inline))
   q
@@ -1096,8 +1283,18 @@ end
 
 function lower(ls::LoopSet, inline::Int = -1)
   fill_offset_memop_collection!(ls)
-  order, u₁loop, u₂loop, vectorized, u₁, u₂, c, shouldinline = choose_order_cost(ls)
-  lower(ls, order, u₁loop, u₂loop, vectorized, u₁, u₂, inlinedecision(inline, shouldinline))
+  order, u₁loop, u₂loop, vectorized, u₁, u₂, c, shouldinline =
+    choose_order_cost(ls)
+  lower(
+    ls,
+    order,
+    u₁loop,
+    u₂loop,
+    vectorized,
+    u₁,
+    u₂,
+    inlinedecision(inline, shouldinline)
+  )
 end
 function lower(ls::LoopSet, u₁::Int, u₂::Int, v::Int, inline::Int)
   fill_offset_memop_collection!(ls)
@@ -1108,13 +1305,15 @@ function lower(ls::LoopSet, u₁::Int, u₂::Int, v::Int, inline::Int)
     copyto!(ls.loop_order.bestorder, order)
   elseif u₁ > 0
     u₂ = -1
-    order, vectorized, c = choose_unroll_order(ls, Inf, store_load_deps(operations(ls)), v)
+    order, vectorized, c =
+      choose_unroll_order(ls, Inf, store_load_deps(operations(ls)), v)
     u₁loop = first(order)
     u₂loop = Symbol("##undefined##")
     shouldinline = true
     copyto!(ls.loop_order.bestorder, order)
   else
-    order, u₁loop, u₂loop, vectorized, u₁, u₂, c, shouldinline = choose_order_cost(ls, v)
+    order, u₁loop, u₂loop, vectorized, u₁, u₂, c, shouldinline =
+      choose_order_cost(ls, v)
   end
   doinline = inlinedecision(inline, shouldinline)
   lower(ls, order, u₁loop, u₂loop, vectorized, u₁, u₂, doinline)
@@ -1143,16 +1342,14 @@ isunrolled_sym(op, u₁loop, u₂loop)
 It returns `true`/`false` for each loop, indicating whether they're unrolled.
 
 If there is a third argument, it will avoid unrolling that symbol along reductions if said symbol is part of the reduction chain.
-
 """
 function isunrolled_sym(
   op::Operation,
   u₁loop::Symbol,
   u₂loop::Symbol,
   vloop::Symbol,
-  (u₁ild, u₂ild)::Tuple{Bool,Bool} = (isu₁unrolled(op), isu₂unrolled(op)),
+  (u₁ild, u₂ild)::Tuple{Bool,Bool} = (isu₁unrolled(op), isu₂unrolled(op))
 )
-
   (accesses_memory(op) | isloopvalue(op)) && return (u₁ild, u₂ild)
   if isconstant(op)
     if length(loopdependencies(op)) == 0
@@ -1193,7 +1390,7 @@ function isunrolled_sym(
   u₁loop::Symbol,
   u₂loop::Symbol,
   vloop::Symbol,
-  ls::LoopSet,
+  ls::LoopSet
 )
   us = ls.unrollspecification
   isunrolled_sym(op, u₁loop, u₂loop, vloop, us)
@@ -1203,12 +1400,13 @@ function isunrolled_sym(
   u₁loop::Symbol,
   u₂loop::Symbol,
   vloop::Symbol,
-  us::UnrollSpecification,
+  us::UnrollSpecification
 )
   @unpack u₁, u₂ = us
   u₁u = (u₁ > 1) & isu₁unrolled(op)
   u₂u = (u₂ > 1) & isu₂unrolled(op)
-  ((u₂ > 1) | accesses_memory(op)) ? isunrolled_sym(op, u₁loop, u₂loop, vloop, (u₁u, u₂u)) :
+  ((u₂ > 1) | accesses_memory(op)) ?
+  isunrolled_sym(op, u₁loop, u₂loop, vloop, (u₁u, u₂u)) :
   (isunrolled_sym(op, u₁loop, u₁u), false)
 end
 
@@ -1218,7 +1416,11 @@ function isunrolled_sym(op::Operation, u₁loop::Symbol, us::UnrollSpecification
   u₁u = (us.u₁ > 1) & isu₁unrolled(op)
   isunrolled_sym(op, u₁loop, u₁u)
 end
-function isunrolled_sym(op::Operation, u₁loop::Symbol, u₁u::Bool = isu₁unrolled(op))
+function isunrolled_sym(
+  op::Operation,
+  u₁loop::Symbol,
+  u₁u::Bool = isu₁unrolled(op)
+)
   u₁u || (isconstant(op) & (u₁loop ∈ reducedchildren(op)))
 end
 
@@ -1227,14 +1429,13 @@ function isunrolled_sym(
   u₁loop::Symbol,
   u₂loop::Symbol,
   vloop::Symbol,
-  u₂max::Int,
+  u₂max::Int
 )
-  ((u₂max > 1) | accesses_memory(op)) ? isunrolled_sym(op, u₁loop, u₂loop, vloop) :
+  ((u₂max > 1) | accesses_memory(op)) ?
+  isunrolled_sym(op, u₁loop, u₂loop, vloop) :
   (isunrolled_sym(op, u₁loop), false)
 end
 
-
-
 function variable_name(op::Operation, suffix::Int)
   mvar = mangledvar(op)
   suffix == -1 ? mvar : Symbol(mvar, suffix, :_)
@@ -1246,7 +1447,7 @@ function variable_name_and_unrolled(
   u₂loop::Symbol,
   vloop::Symbol,
   u₂iter::Int,
-  ls::LoopSet,
+  ls::LoopSet
 )
   u₁op, u₂op = isunrolled_sym(op, u₁loop, u₂loop, vloop, ls)
   mvar = u₂op ? variable_name(op, u₂iter) : mangledvar(op)
diff --git a/src/codegen/operation_evaluation_order.jl b/src/codegen/operation_evaluation_order.jl
index c0af709fb..02ab7b905 100644
--- a/src/codegen/operation_evaluation_order.jl
+++ b/src/codegen/operation_evaluation_order.jl
@@ -29,7 +29,7 @@ function isnopidentity(
   u₁loop::Symbol,
   u₂loop::Symbol,
   vectorized::Symbol,
-  u₂max::Int,
+  u₂max::Int
 )
   parents_op = parents(op)
   if iscompute(op) && instruction(op).instr === :identity
@@ -42,7 +42,8 @@ function isnopidentity(
     Base.iterate(parents_op, state) === nothing || return false
     name(opp) === name(op) || return false
     # @show op opp isu₁unrolled(op), isu₁unrolled(opp), isu₂unrolled(op), isu₂unrolled(opp)
-    (isu₁unrolled(op) == isu₁unrolled(opp)) & (isu₂unrolled(op) == isu₂unrolled(opp))
+    (isu₁unrolled(op) == isu₁unrolled(opp)) &
+    (isu₂unrolled(op) == isu₂unrolled(opp))
   else
     false
   end
@@ -53,7 +54,7 @@ function set_upstream_family!(
   op::Operation,
   val::T,
   ld::Vector{Symbol},
-  id::Int,
+  id::Int
 ) where {T}
   adal[identifier(op)] == val && return # must already have been set
   if ld != loopdependencies(op) || id == identifier(op)
@@ -68,7 +69,7 @@ function search_for_reductinit!(
   op::Operation,
   opswap::Operation,
   var::Symbol,
-  loopdeps::Vector{Symbol},
+  loopdeps::Vector{Symbol}
 )
   for (i, opp) ∈ enumerate(parents(op))
     if (name(opp) === var) &&
@@ -95,7 +96,7 @@ function addoptoorder!(
   u₁loop::Symbol,
   u₂loop::Symbol,
   vectorized::Symbol,
-  u₂max::Int,
+  u₂max::Int
 )
   lo = ls.loop_order
   id = identifier(op)
@@ -112,7 +113,7 @@ function addoptoorder!(
       u₁loop,
       u₂loop,
       vectorized,
-      u₂max,
+      u₂max
     )
   end
   included_vars[id] || return nothing
@@ -133,14 +134,20 @@ function addoptoorder!(
   # @show op, after_loop
   # isloopvalue(op) || push!(lo[isunrolled,istiled,after_loop,_n], op)
   # all(opp -> iszero(length(reduceddependencies(opp))), parents(op)) &&
-  set_upstream_family!(place_after_loop, op, false, loopdependencies(op), identifier(op)) # parents that have already been included are not moved, so no need to check included_vars to filter
+  set_upstream_family!(
+    place_after_loop,
+    op,
+    false,
+    loopdependencies(op),
+    identifier(op)
+  ) # parents that have already been included are not moved, so no need to check included_vars to filter
   nothing
 end
 function replace_reduct_init!(
   ls::LoopSet,
   op::Operation,
   opsub::Operation,
-  opcheck::Operation,
+  opcheck::Operation
 )
   deleteat!(parents(op), 2)
   op.variable = opcheck.variable
@@ -154,7 +161,7 @@ function nounrollreduction(
   op::Operation,
   u₁loop::Symbol,
   u₂loop::Symbol,
-  vectorized::Symbol,
+  vectorized::Symbol
 )
   reduceddeps = reduceddependencies(op)
   (vectorized ∉ reduceddeps) && (u₁loop ∉ reduceddeps) && (u₂loop ∉ reduceddeps)
@@ -163,7 +170,7 @@ function load_short_static_reduction_first!(
   ls::LoopSet,
   u₁loop::Symbol,
   u₂loop::Symbol,
-  vectorized::Symbol,
+  vectorized::Symbol
 )
   for op ∈ operations(ls)
     iscompute(op) || continue
@@ -188,8 +195,12 @@ function load_short_static_reduction_first!(
         opsub = parents(op)[2]
         length(children(opsub)) == 1 || continue
         opsearch = parents(op)[1]
-        opcheck =
-          search_for_reductinit!(opsearch, opsub, name(opsearch), loopdependencies(op))
+        opcheck = search_for_reductinit!(
+          opsearch,
+          opsub,
+          name(opsearch),
+          loopdependencies(op)
+        )
         opcheck === opsearch || replace_reduct_init!(ls, op, opsub, opcheck)
       end
     elseif (instruction(op).instr === :add_fast) &&
@@ -199,11 +210,17 @@ function load_short_static_reduction_first!(
          (length(vecloop) ≤ 16) &&
          nounrollreduction(op, u₁loop, u₂loop, vectorized)
         opsub = parents(op)[2]
-        ((length(reduceddependencies(opsub)) == 0) & (length(children(opsub)) == 1)) ||
-          continue
+        (
+          (length(reduceddependencies(opsub)) == 0) &
+          (length(children(opsub)) == 1)
+        ) || continue
         opsearch = parents(op)[1]
-        opcheck =
-          search_for_reductinit!(opsearch, opsub, name(opsearch), loopdependencies(op))
+        opcheck = search_for_reductinit!(
+          opsearch,
+          opsub,
+          name(opsearch),
+          loopdependencies(op)
+        )
         opcheck === opsearch || replace_reduct_init!(ls, op, opsub, opcheck)
       end
     end
@@ -216,7 +233,7 @@ function fillorder!(
   u₁loop::Symbol,
   u₂loop::Symbol,
   u₂max::Int,
-  vectorized::Symbol,
+  vectorized::Symbol
 )
   load_short_static_reduction_first!(ls, u₁loop, u₂loop, vectorized)
   lo = ls.loop_order
@@ -248,7 +265,7 @@ function fillorder!(
         u₁loop,
         u₂loop,
         vectorized,
-        u₂max,
+        u₂max
       )
     end
   end
diff --git a/src/codegen/split_loops.jl b/src/codegen/split_loops.jl
index 3440704a1..af1eafba8 100644
--- a/src/codegen/split_loops.jl
+++ b/src/codegen/split_loops.jl
@@ -1,12 +1,11 @@
 
-
 function add_operation!(
   ls_new::LoopSet,
   included::Vector{Int},
   ls::LoopSet,
   op::Operation,
   ids::Vector{Int},
-  issecond::Bool,
+  issecond::Bool
 )
   newid = included[identifier(op)]
   iszero(newid) || return operations(ls_new)[newid]
@@ -32,7 +31,7 @@ function add_operation!(
             reduceddependencies(opc),
             parentsnew,
             opc.ref,
-            reducedchildren(opc),
+            reducedchildren(opc)
           )
           addsetv!(ls_new.includedactualarrays, vptr(opc.ref))
           push!(operations(ls_new), opnew)
@@ -40,7 +39,14 @@ function add_operation!(
           for i ∈ 2:length(parentsopc)
             push!(
               parentsnew,
-              add_operation!(ls_new, included, ls, parentsopc[i], ids, issecond),
+              add_operation!(
+                ls_new,
+                included,
+                ls,
+                parentsopc[i],
+                ids,
+                issecond
+              )
             )
           end
           included[identifier(opp)] = identifier(opnew)
@@ -62,7 +68,7 @@ function add_operation!(
     reduceddependencies(op),
     vparents,
     op.ref,
-    reducedchildren(op),
+    reducedchildren(op)
   )
   accesses_memory(op) && addsetv!(ls_new.includedactualarrays, vptr(op.ref))
   push!(operations(ls_new), opnew)
@@ -70,12 +76,10 @@ function add_operation!(
   opnew
 end
 
-function append_if_included!(vnew, vold, included)
-  for (i, v) ∈ vold
+append_if_included!(vnew, vold, included) = for (i, v) ∈ vold
     id = included[i]
     iszero(id) || push!(vnew, (id, v))
   end
-end
 
 function split_loopset(ls::LoopSet, ids::Vector{Int}, issecond::Bool)
   ls_new = LoopSet(:LoopVectorization)
@@ -95,7 +99,11 @@ function split_loopset(ls::LoopSet, ids::Vector{Int}, issecond::Bool)
   append_if_included!(ls_new.preamble_symint, ls.preamble_symint, included)
   append_if_included!(ls_new.preamble_symfloat, ls.preamble_symfloat, included)
   append_if_included!(ls_new.preamble_zeros, ls.preamble_zeros, included)
-  append_if_included!(ls_new.preamble_funcofeltypes, ls.preamble_funcofeltypes, included)
+  append_if_included!(
+    ls_new.preamble_funcofeltypes,
+    ls.preamble_funcofeltypes,
+    included
+  )
   for i ∈ ls.outer_reductions
     id = included[i]
     iszero(id) || push!(ls_new.outer_reductions, id)
@@ -147,29 +155,61 @@ function lower_and_split_loops(ls::LoopSet, inline::Int)
   for (ind, i) ∈ enumerate(split_candidates)
     split_1[1] = i
     ls_1 = split_loopset(ls, split_1, false)
-    order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, cost_1, shouldinline_1 =
-      choose_order_cost(ls_1)
+    order_1,
+    unrolled_1,
+    tiled_1,
+    vectorized_1,
+    U_1,
+    T_1,
+    cost_1,
+    shouldinline_1 = choose_order_cost(ls_1)
     remaining_ops[1:ind-1] .= @view(split_candidates[1:ind-1])
     remaining_ops[ind:end] .= @view(split_candidates[ind+1:end])
     ls_2 = split_loopset(ls, remaining_ops, true)
-    order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2, shouldinline_2 =
-      choose_order_cost(ls_2)
+    order_2,
+    unrolled_2,
+    tiled_2,
+    vectorized_2,
+    U_2,
+    T_2,
+    cost_2,
+    shouldinline_2 = choose_order_cost(ls_2)
     # U_1 = T_1 = U_2 = T_2 = 2
-    if cost_1 + cost_2 + looplenpen * (looplengthprod(ls_1) + looplengthprod(ls_2)) ≤
+    if cost_1 +
+       cost_2 +
+       looplenpen * (looplengthprod(ls_1) + looplengthprod(ls_2)) ≤
        muladd(0.9, cost_fused, ls_looplen)
       ls_2_lowered = if length(remaining_ops) > 1
         inline = iszero(inline) ? (shouldinline_1 % Int) : inline
         lower_and_split_loops(ls_2, inline)
       else
         doinline = inlinedecision(inline, shouldinline_1 | shouldinline_2)
-        lower(ls_2, order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, doinline)
+        lower(
+          ls_2,
+          order_2,
+          unrolled_2,
+          tiled_2,
+          vectorized_2,
+          U_2,
+          T_2,
+          doinline
+        )
       end
       return Expr(
         :block,
         ls.preamble,
-        lower(ls_1, order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, false),
+        lower(
+          ls_1,
+          order_1,
+          unrolled_1,
+          tiled_1,
+          vectorized_1,
+          U_1,
+          T_1,
+          false
+        ),
         ls_2_lowered,
-        nothing,
+        nothing
       )
     end
     length(split_candidates) == 2 && break
@@ -183,6 +223,6 @@ function lower_and_split_loops(ls::LoopSet, inline::Int)
     vectorized_fused,
     U_fused,
     T_fused,
-    doinline,
+    doinline
   )
 end
diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl
index ed25d0453..b984f2c92 100644
--- a/src/condense_loopset.jl
+++ b/src/condense_loopset.jl
@@ -1,4 +1,5 @@
-@enum IndexType::UInt8 NotAnIndex = 0 LoopIndex = 1 ComputedIndex = 2 SymbolicIndex = 3
+@enum IndexType::UInt8 NotAnIndex = 0 LoopIndex = 1 ComputedIndex = 2 SymbolicIndex =
+  3
 
 Base.:|(u::Unsigned, it::IndexType) = u | UInt8(it)
 Base.:(==)(u::Unsigned, it::IndexType) = (u % UInt8) == UInt8(it)
@@ -44,7 +45,10 @@ function rebuild_fields(offset::Int, ::Type{T}) where {T}
     elseif fieldcount(TF) ≡ 0
       push!(call.args, Expr(:call, getfield, :t, (offset += 1)))
     elseif TF <: DataType
-      push!(call.args, Expr(:call, lv(:gettype), Expr(:call, getfield, :t, (offset += 1))))
+      push!(
+        call.args,
+        Expr(:call, lv(:gettype), Expr(:call, getfield, :t, (offset += 1)))
+      )
     else
       arg, offset = rebuild_fields(offset, TF)
       push!(call.args, arg)
@@ -78,7 +82,8 @@ struct ArrayRefStruct{array,ptr}
   offsets::UInt128
   strides::UInt128
 end
-array_and_ptr(@nospecialize(ar::ArrayRefStruct{a,p})) where {a,p} = (a::Symbol, p::Symbol)
+array_and_ptr(@nospecialize(ar::ArrayRefStruct{a,p})) where {a,p} =
+  (a::Symbol, p::Symbol)
 # array(@nospecialize(ar::ArrayRefStruct{a,p})) where {a,p} = a::Symbol
 # ptr(@nospecialize(ar::ArrayRefStruct{a,p})) where {a,p}   = p::Symbol
 
@@ -92,7 +97,7 @@ function ArrayRefStruct(
   ls::LoopSet,
   mref::ArrayReferenceMeta,
   arraysymbolinds::Vector{Symbol},
-  ids::Vector{Int},
+  ids::Vector{Int}
 )
   index_types = zero(UInt128)
   indices = zero(UInt128)
@@ -128,7 +133,12 @@ function ArrayRefStruct(
       # end
     end
   end
-  ArrayRefStruct{mref.ref.array,mref.ptr}(index_types, indices, offsets, strides)
+  ArrayRefStruct{mref.ref.array,mref.ptr}(
+    index_types,
+    indices,
+    offsets,
+    strides
+  )
 end
 
 """
@@ -159,7 +169,8 @@ function findmatchingarray(ls::LoopSet, mref::ArrayReferenceMeta)
   end
   0x0000
 end
-filled_8byte_chunks(u::T) where {T<:Unsigned} = sizeof(T) - (leading_zeros(u) >>> 3)
+filled_8byte_chunks(u::T) where {T<:Unsigned} =
+  sizeof(T) - (leading_zeros(u) >>> 3)
 
 function shifted_loopset(ls::LoopSet, loopsyms::Vector{Symbol})
   ld = zero(UInt128) # leading_zeros(ld) >> 2 yields the number of loopdeps
@@ -169,9 +180,12 @@ function shifted_loopset(ls::LoopSet, loopsyms::Vector{Symbol})
   end
   ld
 end
-loopdeps_uint(ls::LoopSet, op::Operation) = shifted_loopset(ls, loopdependencies(op))
-reduceddeps_uint(ls::LoopSet, op::Operation) = shifted_loopset(ls, reduceddependencies(op))
-childdeps_uint(ls::LoopSet, op::Operation) = shifted_loopset(ls, reducedchildren(op))
+loopdeps_uint(ls::LoopSet, op::Operation) =
+  shifted_loopset(ls, loopdependencies(op))
+reduceddeps_uint(ls::LoopSet, op::Operation) =
+  shifted_loopset(ls, reduceddependencies(op))
+childdeps_uint(ls::LoopSet, op::Operation) =
+  shifted_loopset(ls, reducedchildren(op))
 function parents_uint(oppv::AbstractVector{Operation})
   p = zero(UInt128)
   for parent ∈ oppv
@@ -218,7 +232,7 @@ function OperationStruct!(
   varnames::Vector{Symbol},
   ids::Vector{Int},
   ls::LoopSet,
-  op::Operation,
+  op::Operation
 )
   ld = loopdeps_uint(ls, op)
   rd = reduceddeps_uint(ls, op)
@@ -239,8 +253,9 @@ end
   Zero():static_step(r):(maybestaticlast(r)-maybestaticfirst(r))
 @inline zerorangestart(r::CartesianIndices) =
   CartesianIndices(map(zerorangestart, r.indices))
-@inline zerorangestart(r::ArrayInterface.OptionallyStaticUnitRange{StaticInt{1}}) =
-  CloseOpen(maybestaticlast(r))
+@inline zerorangestart(
+  r::ArrayInterface.OptionallyStaticUnitRange{StaticInt{1}}
+) = CloseOpen(maybestaticlast(r))
 
 function loop_boundary!(q::Expr, loop::Loop, shouldindbyind::Bool)
   if isstaticloop(loop) || loop.rangesym === Symbol("")
@@ -289,7 +304,7 @@ function argmeta_and_consts_description(ls::LoopSet, arraysymbolinds)
     tuple_expr(ls.preamble_symint),
     tuple_expr(ls.preamble_symfloat),
     tuple_expr(ls.preamble_zeros),
-    tuple_expr(ls.preamble_funcofeltypes),
+    tuple_expr(ls.preamble_funcofeltypes)
   )
 end
 @inline vdata(v::Vec) = getfield(v, :data)
@@ -315,7 +330,10 @@ function loopset_return_value(ls::LoopSet, ::Val{extract}) where {extract}
     for or ∈ ls.outer_reductions
       op = ops[or]
       if extract
-        push!(ret.args, Expr(:call, :vdata, Symbol(mangledvar(op), "##onevec##")))
+        push!(
+          ret.args,
+          Expr(:call, :vdata, Symbol(mangledvar(op), "##onevec##"))
+        )
       else
         push!(ret.args, Symbol(mangledvar(ops[or]), "##onevec##"))
       end
@@ -359,40 +377,46 @@ end
 val(x) = Expr(:call, Expr(:curly, :Val, x))
 
 @inline gespf1(x, i) = gesp(x, i)
-@inline gespf1(x::StridedPointer{T,1}, i::Tuple{I}) where {T,I<:Union{Integer,StaticInt}} =
-  gesp(x, i)
+@inline gespf1(
+  x::StridedPointer{T,1},
+  i::Tuple{I}
+) where {T,I<:Union{Integer,StaticInt}} = gesp(x, i)
 @inline gespf1(
   x::StridedBitPointer{T,1},
-  i::Tuple{I},
+  i::Tuple{I}
 ) where {T,I<:Union{Integer,StaticInt}} = gesp(x, i)
 @inline gespf1(x::StridedPointer{T,1}, i::Tuple{Zero}) where {T} = x
 @inline gespf1(x::StridedBitPointer{T,1}, i::Tuple{Zero}) where {T} = x
 @generated function gespf1(
   x::AbstractStridedPointer{T,N,C,B,R},
-  i::Tuple{I},
+  i::Tuple{I}
 ) where {T,N,I<:Union{Integer,StaticInt},C,B,R}
   ri = argmin(R)
   quote
     $(Expr(:meta, :inline))
-    p, li = VectorizationBase.tdot(x, (vsub_nsw(getfield(i, 1), one($I)),), strides(x))
+    p, li = VectorizationBase.tdot(
+      x,
+      (vsub_nsw(getfield(i, 1), one($I)),),
+      strides(x)
+    )
     ptr = gep(p, li)
     si = ArrayInterface.StrideIndex{1,$(R[ri],),$(C === 1 ? 1 : 0)}(
       (getfield(strides(x), $ri),),
-      (Zero(),),
+      (Zero(),)
     )
     stridedpointer(ptr, si, StaticInt{$(B === 1 ? 1 : 0)}())
   end
 end
 @generated function gespf1(
   x::AbstractStridedPointer{T,N,C,B,R},
-  ::Tuple{VectorizationBase.NullStep},
+  ::Tuple{VectorizationBase.NullStep}
 ) where {T,N,C,B,R}
   ri = argmin(R)
   quote
     $(Expr(:meta, :inline))
     si = ArrayInterface.StrideIndex{1,$(R[ri],),$(C === 1 ? 1 : 0)}(
       (getfield(strides(x), $ri),),
-      (getfield(offsets(x), $ri),),
+      (getfield(offsets(x), $ri),)
     )
     stridedpointer(pointer(x), si, StaticInt{$(B == 1 ? 1 : 0)}())
   end
@@ -407,7 +431,7 @@ function should_zerorangestart(
   ls::LoopSet,
   allarrayrefs::Vector{ArrayReferenceMeta},
   name_to_array_map::Vector{Vector{Int}},
-  isrooted::Vector{Bool},
+  isrooted::Vector{Bool}
 )
   loops = ls.loops
   shouldindbyind = fill(false, length(loops))
@@ -423,7 +447,8 @@ function should_zerorangestart(
       baseref = allarrayrefs[first(namev)]
       # firstcontainsind relies on stripping of duplicate inds in parsing
       firstcontainsind = findfirstcontaining(baseref, ind)
-      basestride = firstcontainsind == 0 ? 0 : getstrides(baseref)[firstcontainsind]
+      basestride =
+        firstcontainsind == 0 ? 0 : getstrides(baseref)[firstcontainsind]
       allsame = true
       # The idea here is that if any ref to the same array doesn't have `ind`,
       # we can't offset that dimension because different inds will clash.
@@ -431,8 +456,10 @@ function should_zerorangestart(
       # to be consistent, and check that all arrays are valid first.
       for j ∈ @view(namev[2:end])
         ref = allarrayrefs[j]
-        if (firstcontainsind ≠ findfirstcontaining(ref, ind)) ||
-           ((firstcontainsind ≠ 0) && (basestride ≠ getstrides(ref)[firstcontainsind]))
+        if (firstcontainsind ≠ findfirstcontaining(ref, ind)) || (
+          (firstcontainsind ≠ 0) &&
+          (basestride ≠ getstrides(ref)[firstcontainsind])
+        )
           allsame = false
           break
         end
@@ -445,17 +472,22 @@ function should_zerorangestart(
   end
   return shouldindbyind
 end
-function check_shouldindbyind(ls::LoopSet, ind::Symbol, shouldindbyind::Vector{Bool})
+function check_shouldindbyind(
+  ls::LoopSet,
+  ind::Symbol,
+  shouldindbyind::Vector{Bool}
+)
   for (i, loop) ∈ enumerate(ls.loops)
     loop.itersymbol === ind && return shouldindbyind[i]
   end
   true
 end
 
-
 @inline densewrapper(sp, A) = sp
-@inline densewrapper(sp::AbstractStridedPointer{T,N}, A::AbstractArray{T,N}) where {T,N} =
-  _densewrapper(sp, VectorizationBase.val_dense_dims(A))
+@inline densewrapper(
+  sp::AbstractStridedPointer{T,N},
+  A::AbstractArray{T,N}
+) where {T,N} = _densewrapper(sp, VectorizationBase.val_dense_dims(A))
 @inline _densewrapper(sp, ::Nothing) = sp
 @inline _densewrapper(sp::AbstractStridedPointer, ::Val{D}) where {D} =
   VectorizationBase.DensePointerWrapper{D}(sp)
@@ -501,7 +533,7 @@ function add_grouped_strided_pointer!(extra_args::Expr, ls::LoopSet)
           allarrayrefs,
           k,
           name_to_array_map,
-          unique_to_name_and_op_map,
+          unique_to_name_and_op_map
         )
         push!(gespsummaries, (k, gespindsummary))
         found = true
@@ -512,7 +544,8 @@ function add_grouped_strided_pointer!(extra_args::Expr, ls::LoopSet)
     push!(preserve, presbufsym(ref.ref.array))
   end
   roots = getroots(ls)
-  shouldindbyind = should_zerorangestart(ls, allarrayrefs, name_to_array_map, roots)
+  shouldindbyind =
+    should_zerorangestart(ls, allarrayrefs, name_to_array_map, roots)
   for (k, gespindsummary) ∈ gespsummaries
     ref = allarrayrefs[k]
     gespinds = calcgespinds(
@@ -521,7 +554,7 @@ function add_grouped_strided_pointer!(extra_args::Expr, ls::LoopSet)
       gespindsummary,
       shouldindbyind,
       name_to_array_map[first(first(unique_to_name_and_op_map[k]))],
-      unique_to_name_and_op_map,
+      unique_to_name_and_op_map
     )
     push!(
       tgarrays.args,
@@ -529,8 +562,8 @@ function add_grouped_strided_pointer!(extra_args::Expr, ls::LoopSet)
         :call,
         lv(:densewrapper),
         Expr(:call, lv(:gespf1), vptr(ref), gespinds),
-        name(ref),
-      ),
+        name(ref)
+      )
     )
   end
   push!(gsp.args, tgarrays)
@@ -557,11 +590,25 @@ end
   ::StaticInt{RS},
   ::StaticInt{AR},
   ::StaticInt{NT},
-  ::StaticInt{CLS},
+  ::StaticInt{CLS}
 ) where {CNFARG,W,RS,AR,CLS,NT}
   inline, u₁, u₂, v, BROADCAST, thread, warncheckarg, safe = CNFARG
   nt = min(thread % UInt, NT % UInt)
-  t = Expr(:tuple, inline, u₁, u₂, v, BROADCAST, W, RS, AR, CLS, nt, warncheckarg, safe)
+  t = Expr(
+    :tuple,
+    inline,
+    u₁,
+    u₂,
+    v,
+    BROADCAST,
+    W,
+    RS,
+    AR,
+    CLS,
+    nt,
+    warncheckarg,
+    safe
+  )
   length(CNFARG) == 7 && push!(t.args, CNFARG[7])
   Expr(:call, Expr(:curly, :Val, t))
 end
@@ -572,13 +619,15 @@ end
     register_size(),
     available_registers(),
     num_cores(), #FIXME
-    cache_linesize(),
+    cache_linesize()
   )
 end
 function find_samename_constparent(op::Operation, opname::Symbol)
   for opp ∈ parents(op)
-    (((isconstant(opp) && instruction(opp) == LOOPCONSTANT) && (name(opp) === opname))) &&
-      return opp
+    ((
+      (isconstant(opp) && instruction(opp) == LOOPCONSTANT) &&
+      (name(opp) === opname)
+    )) && return opp
     opptemp = find_samename_constparent(opp, opname)
     opptemp === opp || return opptemp
   end
@@ -595,8 +644,6 @@ function remove_outer_reducts!(roots::Vector{Bool}, ls::LoopSet)
   end
 end
 
-
-
 function split_ifelse!(
   ls::LoopSet,
   preserve::Vector{Symbol},
@@ -608,7 +655,7 @@ function split_ifelse!(
   thread::UInt,
   warncheckarg::Int,
   safe::Bool,
-  debug::Bool,
+  debug::Bool
 )
   roots[k] = false
   op = operations(ls)[k]
@@ -667,7 +714,7 @@ function split_ifelse!(
         thread,
         warncheckarg,
         safe,
-        debug,
+        debug
       ))
     else
       $(generate_call_split(
@@ -680,7 +727,7 @@ function split_ifelse!(
         thread,
         warncheckarg,
         safe,
-        debug,
+        debug
       ))
     end
   )
@@ -694,7 +741,7 @@ function generate_call(
   thread::UInt,
   warncheckarg::Int,
   safe::Bool,
-  debug::Bool,
+  debug::Bool
 )
   extra_args = Expr(:tuple)
   fill_children!(ls)
@@ -709,7 +756,7 @@ function generate_call(
     thread,
     warncheckarg,
     safe,
-    debug,
+    debug
   )
 end
 function generate_call_split(
@@ -722,7 +769,7 @@ function generate_call_split(
   thread::UInt,
   warncheckarg::Int,
   safe::Bool,
-  debug::Bool,
+  debug::Bool
 )
   for (k, op) ∈ enumerate(operations(ls))
     parents_op = parents(op)
@@ -740,7 +787,7 @@ function generate_call_split(
         thread,
         warncheckarg,
         safe,
-        debug,
+        debug
       )
     end
   end
@@ -754,7 +801,7 @@ function generate_call_split(
     thread,
     warncheckarg,
     safe,
-    debug,
+    debug
   )
 end
 
@@ -769,7 +816,7 @@ function generate_call_types(
   thread::UInt,
   warncheckarg::Int,
   safe::Bool,
-  debug::Bool,
+  debug::Bool
 )
   # good place to check for split  
   operation_descriptions = Expr(:tuple)
@@ -794,7 +841,10 @@ function generate_call_types(
   for (j, ref) ∈ enumerate(ls.refs_aliasing_syms)
     # duplicate_ref[j] ≠ 0 && continue
     duplicate_ref[j] && continue
-    push!(arrayref_descriptions.args, ArrayRefStruct(ls, ref, arraysymbolinds, ids))
+    push!(
+      arrayref_descriptions.args,
+      ArrayRefStruct(ls, ref, arraysymbolinds, ids)
+    )
   end
   argmeta = argmeta_and_consts_description(ls, arraysymbolinds)
   loop_bounds = loop_boundaries(ls, shouldindbyind)
@@ -818,7 +868,8 @@ function generate_call_types(
   end
   manyarg = !debug && (argcestimate > 16)
   func =
-    debug ? lv(:_turbo_loopset_debug) : (manyarg ? lv(:_turbo_manyarg!) : lv(:_turbo_!))
+    debug ? lv(:_turbo_loopset_debug) :
+    (manyarg ? lv(:_turbo_manyarg!) : lv(:_turbo_!))
   q = Expr(
     :call,
     func,
@@ -826,7 +877,7 @@ function generate_call_types(
     val(operation_descriptions),
     val(arrayref_descriptions),
     val(argmeta),
-    val(loop_syms),
+    val(loop_syms)
   )
   vecwidthdefq = if debug
     push!(q.args, Expr(:tuple, lbarg, extra_args))
@@ -835,7 +886,11 @@ function generate_call_types(
     vargsym = gensym(:vargsym)
     push!(
       q.args,
-      Expr(:call, GlobalRef(Base, :Val), Expr(:call, GlobalRef(Base, :typeof), vargsym)),
+      Expr(
+        :call,
+        GlobalRef(Base, :Val),
+        Expr(:call, GlobalRef(Base, :typeof), vargsym)
+      )
     )
     if manyarg
       push!(q.args, Expr(:call, lv(:flatten_to_tuple), vargsym))
@@ -869,7 +924,6 @@ end
 """
     check_args(::Vararg{AbstractArray})
 
-
 LoopVectorization will optimize an `@turbo` loop if `check_args` on each on the indexed abstract arrays returns true.
 It returns true for `AbstractArray{T}`s when `check_type(T) == true` and the array or its parent is a `StridedArray` or `AbstractRange`.
 
@@ -886,7 +940,8 @@ end
   # @info "`LoopVectorization.check_args(::$(typeof(x))) == false`, therefore compiling a probably slow `@inbounds @fastmath` fallback loop." maxlog=1
   false
 end
-@inline check_args(A, B, C::Vararg{Any,K}) where {K} = check_args(A) && check_args(B, C...)
+@inline check_args(A, B, C::Vararg{Any,K}) where {K} =
+  check_args(A) && check_args(B, C...)
 @inline check_args(::AbstractRange{T}) where {T} = check_type(T)
 @inline check_args(::UpTri) = false
 @inline check_args(::LoTri) = false
@@ -916,7 +971,7 @@ end
 struct RetVec2Int end
 (::RetVec2Int)(_) = Vec{2,Int}
 """
-  can_turbo(f::Function, ::Val{NARGS})
+    can_turbo(f::Function, ::Val{NARGS})
 
 Check whether a given function with a specified number of arguments
 can be used inside a `@turbo` loop.
@@ -962,10 +1017,18 @@ function check_turbo_safe(ls::LoopSet)
   q
 end
 
-make_fast(q) =
-  Expr(:macrocall, Symbol("@fastmath"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), q)
-make_crashy(q) =
-  Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), q)
+make_fast(q) = Expr( # expression form of `@fastmath q`
+  :macrocall,
+  Symbol("@fastmath"),
+  LineNumberNode(@__LINE__, Symbol(@__FILE__)), # location recorded is this line of this file
+  q
+)
+make_crashy(q) = Expr( # expression form of `@inbounds q`
+  :macrocall,
+  Symbol("@inbounds"),
+  LineNumberNode(@__LINE__, Symbol(@__FILE__)), # location recorded is this line of this file
+  q
+)
 
 @inline vecmemaybe(x::NativeTypes) = x
 @inline vecmemaybe(x::VectorizationBase._Vec) = Vec(x)
@@ -998,24 +1061,39 @@ end
 #   call, preserve = generate_call_split(ls, (inline,u₁,u₂), thread % UInt, false)
 #   setup_call_ret!(ls, call, preserve)
 # end
-setup_outerreduct_preserve_mangler(op::Operation) = Symbol(mangledvar(op), "##onevec##")
+setup_outerreduct_preserve_mangler(op::Operation) =
+  Symbol(mangledvar(op), "##onevec##")
 
-function outer_reduction_to_scalar_reduceq!(q::Expr, op::Operation, var = name(op))
+function outer_reduction_to_scalar_reduceq!(
+  q::Expr,
+  op::Operation,
+  var = name(op)
+)
   instr = instruction(op)
   out = setup_outerreduct_preserve_mangler(op)
   if instr.instr ≢ :ifelse
-    Expr(:call, reduction_scalar_combine(op), Expr(:call, lv(:vecmemaybe), out), var)
+    Expr(
+      :call,
+      reduction_scalar_combine(op),
+      Expr(:call, lv(:vecmemaybe), out),
+      var
+    )
   else
     opinstr = ifelse_reduction(:IfElseReduced, op) do opv
       opvname = name(opv)
       oporig = gensym(opvname)
       pushfirst!(q.args, Expr(:(=), oporig, opvname))
-      Expr(:call, lv(:vecmemaybe), setup_outerreduct_preserve_mangler(opv)), (oporig,)
+      Expr(:call, lv(:vecmemaybe), setup_outerreduct_preserve_mangler(opv)),
+      (oporig,)
     end
     Expr(:call, opinstr, Expr(:call, lv(:vecmemaybe), out), var)
   end
 end
-function setup_outerreduct_preserve(ls::LoopSet, call::Expr, preserve::Vector{Symbol})
+function setup_outerreduct_preserve(
+  ls::LoopSet,
+  call::Expr,
+  preserve::Vector{Symbol}
+)
   iszero(length(ls.outer_reductions)) && return gc_preserve(call, preserve)
   retv = loopset_return_value(ls, Val(false))
   q = Expr(:block, gc_preserve(Expr(:(=), retv, call), preserve))
@@ -1033,7 +1111,14 @@ function setup_call_final(ls::LoopSet, q::Expr)
   return ls.preamble
 end
 function setup_call_debug(ls::LoopSet)
-  generate_call(ls, (false, zero(Int8), zero(Int8), zero(Int8)), zero(UInt), 1, true, true)
+  generate_call( # build the call for `ls` with the trailing `debug` flag set
+    ls,
+    (false, zero(Int8), zero(Int8), zero(Int8)), # presumably (inline, u₁, u₂, v) — confirm against generate_call
+    zero(UInt), # thread
+    1, # warncheckarg
+    true, # safe
+    true # debug
+  )
 end
 function setup_call(
   ls::LoopSet,
@@ -1046,7 +1131,7 @@ function setup_call(
   v::Int8,
   thread::Int,
   warncheckarg::Int,
-  safe::Bool,
+  safe::Bool
 )
   # We outline/inline at the macro level by creating/not creating an anonymous function.
   # The old API instead was based on inlining or not inline the generated function, but
diff --git a/src/constructors.jl b/src/constructors.jl
index 80340e92f..121f8c290 100644
--- a/src/constructors.jl
+++ b/src/constructors.jl
@@ -23,7 +23,7 @@ function add_ci_call!(
   syms::Vector{Symbol},
   i::Int,
   @nospecialize(valarg) = nothing,
-  @nospecialize(mod) = nothing,
+  @nospecialize(mod) = nothing
 )
   call = if f isa Core.SSAValue
     Expr(:call, syms[f.id])
@@ -52,14 +52,15 @@ function substitute_broadcast(
   v::Int8,
   threads::Int,
   warncheckarg::Int,
-  safe::Bool,
+  safe::Bool
 )
   ci = first(Meta.lower(LoopVectorization, q).args).code
   nargs = length(ci) - 1
   lb = Expr(:block)
   syms = Vector{Symbol}(undef, nargs)
   configarg = (inline, u₁, u₂, v, true, threads, warncheckarg, safe)
-  unroll_param_tup = Expr(:call, lv(:avx_config_val), :(Val{$configarg}()), staticexpr(0))
+  unroll_param_tup =
+    Expr(:call, lv(:avx_config_val), :(Val{$configarg}()), staticexpr(0))
   for n ∈ 1:nargs
     _ciₙ = ci[n]
     if _ciₙ isa Symbol
@@ -72,9 +73,25 @@ function substitute_broadcast(
       if ciₙ.head === :(=)
         push!(lb.args, Expr(:(=), f, syms[((ciₙargs[2])::Core.SSAValue).id]))
       elseif isglobalref(f, Base, :materialize!)
-        add_ci_call!(lb, lv(:vmaterialize!), ciₙargs, syms, n, unroll_param_tup, mod)
+        add_ci_call!(
+          lb,
+          lv(:vmaterialize!),
+          ciₙargs,
+          syms,
+          n,
+          unroll_param_tup,
+          mod
+        )
       elseif isglobalref(f, Base, :materialize)
-        add_ci_call!(lb, lv(:vmaterialize), ciₙargs, syms, n, unroll_param_tup, mod)
+        add_ci_call!(
+          lb,
+          lv(:vmaterialize),
+          ciₙargs,
+          syms,
+          n,
+          unroll_param_tup,
+          mod
+        )
       else
         add_ci_call!(lb, f, ciₙargs, syms, n)
       end
@@ -87,7 +104,6 @@ function substitute_broadcast(
   esc(Expr(:let, lb, Expr(:block, ret)))
 end
 
-
 function LoopSet(q::Expr, mod::Symbol = :Main)
   ls = LoopSet(mod)
   check_inputs!(q, ls.prepreamble)
@@ -113,7 +129,7 @@ function check_macro_kwarg(
   v::Int8,
   threads::Int,
   warncheckarg::Int,
-  safe::Bool,
+  safe::Bool
 )
   ((arg.head === :(=)) && (length(arg.args) == 2)) ||
     throw(ArgumentError("macro kwarg should be of the form `argname = value`."))
@@ -128,7 +144,9 @@ function check_macro_kwarg(
       u₁ = convert(Int8, value.args[1])::Int8
       u₂ = convert(Int8, value.args[2])::Int8
     else
-      throw(ArgumentError("Don't know how to process argument in `unroll=$value`."))
+      throw(
+        ArgumentError("Don't know how to process argument in `unroll=$value`.")
+      )
     end
   elseif kw === :vectorize
     v = convert(Int8, value)
@@ -140,7 +158,9 @@ function check_macro_kwarg(
     elseif value isa Integer
       threads = max(1, convert(Int, value)::Int)
     else
-      throw(ArgumentError("Don't know how to process argument in `thread=$value`."))
+      throw(
+        ArgumentError("Don't know how to process argument in `thread=$value`.")
+      )
     end
   elseif kw === :warn_check_args
     warncheckarg = convert(Int, value)::Int
@@ -149,8 +169,8 @@ function check_macro_kwarg(
   else
     throw(
       ArgumentError(
-        "Received unrecognized keyword argument $kw. Recognized arguments include:\n`inline`, `unroll`, `check_empty`, and `thread`.",
-      ),
+        "Received unrecognized keyword argument $kw. Recognized arguments include:\n`inline`, `unroll`, `check_empty`, and `thread`."
+      )
     )
   end
   inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe
@@ -164,11 +184,21 @@ function process_args(
   v::Int8 = zero(Int8),
   threads::Int = 1,
   warncheckarg::Int = 1,
-  safe::Bool = true,
+  safe::Bool = true
 )
   for arg ∈ args
     inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe =
-      check_macro_kwarg(arg, inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe)
+      check_macro_kwarg(
+        arg,
+        inline,
+        check_empty,
+        u₁,
+        u₂,
+        v,
+        threads,
+        warncheckarg,
+        safe
+      )
   end
   inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe
 end
@@ -223,8 +253,10 @@ function replace_single_enumerate!(q, prepreamble, i = nothing)
       indsym = itersyms.args[1]::Symbol
       _replace_looprange!(q, i, indsym, iter)
     elseif itersyms isa Symbol # if itersyms are not unbox in loop range
-      throw(ArgumentError("`for $itersyms in enumerate($r)` is not supported,
-        please use `for ($(itersyms)_i, $(itersyms)_v) in enumerate($r)` instead."))
+      throw( # message is concatenated so re-indenting the source cannot alter its text
+        ArgumentError("`for $itersyms in enumerate($r)` is not supported, " *
+        "please use `for ($(itersyms)_i, $(itersyms)_v) in enumerate($r)` instead.")
+      )
     else
       throw(ArgumentError("Don't know how to handle expression `$itersyms`."))
     end
@@ -240,12 +272,37 @@ function turbo_macro(mod, src, q, args...)
   q = macroexpand(mod, q)
   if q.head === :for
     ls = LoopSet(q, mod)
-    inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe = process_args(args)
-    esc(setup_call(ls, q, src, inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe))
+    inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe =
+      process_args(args)
+    esc(
+      setup_call(
+        ls,
+        q,
+        src,
+        inline,
+        check_empty,
+        u₁,
+        u₂,
+        v,
+        threads,
+        warncheckarg,
+        safe
+      )
+    )
   else
     inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe =
-      process_args(args, inline = true)
-    substitute_broadcast(q, Symbol(mod), inline, u₁, u₂, v, threads, warncheckarg, safe)
+      process_args(args; inline = true)
+    substitute_broadcast(
+      q,
+      Symbol(mod),
+      inline,
+      u₁,
+      u₂,
+      v,
+      threads,
+      warncheckarg,
+      safe
+    )
   end
 end
 """
@@ -267,10 +324,10 @@ The macro models the set of nested loops, and chooses an ordering of the three l
 
 Current limitations:
 
-1. It assumes that loop iterations are independent.
-2. It does not perform bounds checks.
-3. It assumes that each loop iterates at least once. (Use `@turbo check_empty=true` to lift this assumption.)
-4. That there is only one loop at each level of the nest.
+ 1. It assumes that loop iterations are independent.
+ 2. It does not perform bounds checks.
+ 3. It assumes that each loop iterates at least once. (Use `@turbo check_empty=true` to lift this assumption.)
+ 4. That there is only one loop at each level of the nest.
 
 It may also apply to broadcasts:
 
@@ -295,7 +352,7 @@ Advanced users can customize the implementation of the `@turbo`-annotated block
 using keyword arguments:
 
 ```julia
-@turbo inline=false unroll=2 thread=4 body
+@turbo inline=false unroll=2 thread=4 body
 ```
 
 where `body` is the code of the block (e.g., `for ... end`).
@@ -355,13 +412,22 @@ Note that later arguments take precendence.
 Meant for convenience, as `@tturbo` is shorter than `@turbo thread=true`.
 """
 macro tturbo(args...)
-  turbo_macro(__module__, __source__, last(args), :(thread = true), Base.front(args)...)
+  turbo_macro( # delegate to the shared macro implementation with threading requested
+    __module__,
+    __source__,
+    last(args), # the final macro argument is the code to transform
+    :(thread = true), # placed before the user's options so a later `thread=...` overrides it
+    Base.front(args)... # every argument except the last, i.e. the user's options
+  )
 end
 
 function def_outer_reduct_types!(ls::LoopSet)
   for or ∈ ls.outer_reductions
     op = operations(ls)[or]
-    pushpreamble!(ls, Expr(:(=), outer_reduct_init_typename(op), typeof_expr(op)))
+    pushpreamble!(
+      ls,
+      Expr(:(=), outer_reduct_init_typename(op), typeof_expr(op))
+    )
   end
 end
 """
@@ -383,8 +449,17 @@ end
 macro _turbo(arg, q)
   @assert q.head === :for
   q = macroexpand(__module__, q)
-  inline, check_empty, u₁, u₂, v =
-    check_macro_kwarg(arg, false, false, zero(Int8), zero(Int8), zero(Int8), 1, 0, true)
+  inline, check_empty, u₁, u₂, v = check_macro_kwarg(
+    arg,
+    false,
+    false,
+    zero(Int8),
+    zero(Int8),
+    zero(Int8),
+    1,
+    0,
+    true
+  )
   ls = LoopSet(q, __module__)
   set_hw!(ls)
   def_outer_reduct_types!(ls)
diff --git a/src/getconstindexes.jl b/src/getconstindexes.jl
index d23d545b3..596a107f5 100644
--- a/src/getconstindexes.jl
+++ b/src/getconstindexes.jl
@@ -37,7 +37,7 @@ const EXTRACTFUNS = (
   :thirtysixth,
   :thirtyseventh,
   :thirtyeighth,
-  :last,
+  :last
 )
 
 for (i, f) ∈ enumerate(EXTRACTFUNS)
diff --git a/src/modeling/costs.jl b/src/modeling/costs.jl
index 712c15bc4..8c2c2bbb5 100644
--- a/src/modeling/costs.jl
+++ b/src/modeling/costs.jl
@@ -1,7 +1,6 @@
 
 lv(x) = GlobalRef(LoopVectorization, x)
 
-
 """
     Instruction
 
@@ -14,13 +13,11 @@ struct Instruction
 end
 # lower(instr::Instruction) = Expr(:(.), instr.mod, QuoteNode(instr.instr))
 # Base.convert(::Type{Expr}, instr::Instruction) = Expr(:(.), instr.mod, QuoteNode(instr.instr))
-function callexpr(instr::Instruction)
-  if instr.mod === :LoopVectorization
+callexpr(instr::Instruction) = if instr.mod === :LoopVectorization
     Expr(:call, lv(instr.instr))
   else#if instr.mod === :Main
     Expr(:call, instr.instr)
   end
-end
 function callexpr(instr::Instruction, arg)
   ce = callexpr(instr)
   append!(ce.args, arg)
@@ -93,8 +90,10 @@ end
 const OPAQUE_INSTRUCTION = InstructionCost(-1.0, 20, 20.0, 16)
 
 instruction_cost(instruction::Instruction) =
-  instruction.mod === :LoopVectorization ? COST[instruction.instr] : OPAQUE_INSTRUCTION
-instruction_cost(instruction::Symbol) = get(COST, instruction, OPAQUE_INSTRUCTION)
+  instruction.mod === :LoopVectorization ? COST[instruction.instr] :
+  OPAQUE_INSTRUCTION
+instruction_cost(instruction::Symbol) =
+  get(COST, instruction, OPAQUE_INSTRUCTION)
 scalar_cost(instr::Instruction) = scalar_cost(instruction_cost(instr))
 vector_cost(instr::Instruction, Wshift, sizeof_T) =
   vector_cost(instruction_cost(instr), Wshift, sizeof_T)
@@ -106,10 +105,8 @@ vector_cost(instr::Instruction, Wshift, sizeof_T) =
 #     cost( instruction_cost(instruction), Wshift, sizeof_T )
 # end
 
-
 # Just a semi-reasonable assumption; should not be that sensitive to anything other than loads
 
-
 # Comments on setindex!
 # 1. Not a part of dependency chains, so not really twice as expensive as getindex?
 # 2. getindex loads a register, not setindex!, but we place cost on setindex!
@@ -263,7 +260,7 @@ const COST = Dict{Symbol,InstructionCost}(
   :vmovsldup => InstructionCost(1, 1.0),
   :vmovshdup => InstructionCost(1, 1.0),
   :exponent => InstructionCost(8, 1.0),
-  :significand => InstructionCost(8, 1.0),
+  :significand => InstructionCost(8, 1.0)
 )
 
 for f in EXTRACTFUNS
@@ -288,7 +285,8 @@ Base.convert(::Type{Instruction}, instr::Symbol) = Instruction(instr)
 function instruction(f::Symbol)
   # f === :ifelse && return Instruction(:LoopVectorization, :ifelse)
   # @assert f ∈ keys(COST)
-  f ∈ keys(COST) ? Instruction(:LoopVectorization, f) : Instruction(Symbol(""), f)
+  f ∈ keys(COST) ? Instruction(:LoopVectorization, f) :
+  Instruction(Symbol(""), f)
 end
 # instruction(f::Symbol, m::Symbol) = f ∈ keys(COST) ? Instruction(:LoopVectorization, f) : Instruction(m, f)
 Instruction(instr::Symbol) = instruction(instr)
@@ -319,11 +317,15 @@ end
 @inline (ier::IfElseReducer)(a::VecUnroll) =
   VecUnroll(VectorizationBase.fmap(ier, VectorizationBase.data(a)))
 @inline (ier::IfElseReducer)(a::VecUnroll, b::VecUnroll) = VecUnroll(
-  VectorizationBase.fmap(ier, VectorizationBase.data(a), VectorizationBase.data(b)),
+  VectorizationBase.fmap(
+    ier,
+    VectorizationBase.data(a),
+    VectorizationBase.data(b)
+  )
 )
 
-
-@inline (ier::IfElseReduced)(x::NativeTypes, y::NativeTypes) = ifelse(ier.f(x, y), x, y)
+@inline (ier::IfElseReduced)(x::NativeTypes, y::NativeTypes) =
+  ifelse(ier.f(x, y), x, y)
 @inline (ier::IfElseReduced)(x::AbstractSIMD{W}, y::AbstractSIMD{W}) where {W} =
   ifelse(ier.f(x, y), x, y)
 @inline function (ier::IfElseReduced)(x::AbstractSIMD, y::AbstractSIMD)
@@ -338,17 +340,21 @@ end
   ifelse(f(r, y), r, y)
 end
 
-
 @inline (ier::IfElseReduceTo)(a::NativeTypes, ::NativeTypes) = a
 @inline (ier::IfElseReduceTo)(a::AbstractSIMD, ::NativeTypes) =
   VectorizationBase.ifelse_reduce(ier.f, a)
-@inline (ier::IfElseReduceTo)(a::AbstractSIMD{W}, ::AbstractSIMD{W}) where {W} = a
+@inline (ier::IfElseReduceTo)(a::AbstractSIMD{W}, ::AbstractSIMD{W}) where {W} =
+  a
 @inline function (ier::IfElseReduceTo)(a::AbstractSIMD, b::AbstractSIMD)
   x, y = VectorizationBase.splitvector(a) # halve recursively
   ier(ifelse(ier.f(x, y), x, y), b)
 end
 @inline (ier::IfElseReduceTo)(a::VecUnroll, b::VecUnroll) = VecUnroll(
-  VectorizationBase.fmap(ier, VectorizationBase.data(a), VectorizationBase.data(b)),
+  VectorizationBase.fmap(
+    ier,
+    VectorizationBase.data(a),
+    VectorizationBase.data(b)
+  )
 )
 
 @inline (iec::IfElseCollapser)(a) =
@@ -383,19 +389,21 @@ end
 
 @inline _first_ifelse_reduce_mirror(f::F, a, b) where {F} =
   getfield(VectorizationBase.ifelse_reduce_mirror(f, a, b), 1)
-@inline (ier::IfElseReducerMirror)(a) = _first_ifelse_reduce_mirror(ier.f, a, ier.a)
+@inline (ier::IfElseReducerMirror)(a) =
+  _first_ifelse_reduce_mirror(ier.f, a, ier.a)
 @inline function _ifelse_reduce_mirror(f::F, a, b, c, d) where {F}
   r, rm = VectorizationBase.ifelse_reduce_mirror(f, b, d)
   ifelse(f(c, rm), a, r)
 end
-@inline (ier::IfElseReducerMirror)(a, b) = _ifelse_reduce_mirror(ier.f, a, b, ier.a, ier.b)
+@inline (ier::IfElseReducerMirror)(a, b) =
+  _ifelse_reduce_mirror(ier.f, a, b, ier.a, ier.b)
 @inline (ier::IfElseReducerMirror)(a::VecUnroll) = VecUnroll(
   VectorizationBase.fmap(
     _first_ifelse_reduce_mirror,
     ier.f,
     VectorizationBase.data(a),
-    VectorizationBase.data(ier.a),
-  ),
+    VectorizationBase.data(ier.a)
+  )
 )
 @inline function (ier::IfElseReducerMirror)(a::VecUnroll, b::VecUnroll)
   VecUnroll(
@@ -405,8 +413,8 @@ end
       VectorizationBase.data(a),
       VectorizationBase.data(b),
       VectorizationBase.data(ier.a),
-      VectorizationBase.data(ier.b),
-    ),
+      VectorizationBase.data(ier.b)
+    )
   )
 end
 
@@ -414,8 +422,10 @@ end
   IfElseReducedMirror{F,A,Nothing}(f, a, nothing)
 @inline (ier::IfElseReducedMirror)(x::NativeTypes, y::NativeTypes) =
   ifelse(ier.f(ier.a, ier.b), x, y)
-@inline (ier::IfElseReducedMirror)(x::AbstractSIMD{W}, y::AbstractSIMD{W}) where {W} =
-  ifelse(ier.f(ier.a, ier.b), x, y)
+@inline (ier::IfElseReducedMirror)(
+  x::AbstractSIMD{W},
+  y::AbstractSIMD{W}
+) where {W} = ifelse(ier.f(ier.a, ier.b), x, y)
 @inline function _reduce_mirror(f::F, x, y, a, b) where {F}
   r, _ = IfElseReducedMirror(f, a, b)(x, y)
   ifelse(f(r, y), r, y)
@@ -429,8 +439,8 @@ end
     getfield(x, :data),
     getfield(y, :data),
     getfield(ier.a, :data),
-    getfield(ier.b, :data),
-  ),
+    getfield(ier.b, :data)
+  )
 )
 @inline function (ier::IfElseReducedMirror)(x::AbstractSIMD, y::NativeTypes)
   f = ier.f
@@ -439,11 +449,13 @@ end
   ifelse(f(rm, ier.b), r, y)
 end
 
-
 @inline (ier::IfElseReduceToMirror)(a::NativeTypes, ::NativeTypes) = a
 @inline (ier::IfElseReduceToMirror)(a::AbstractSIMD, ::NativeTypes) =
   VectorizationBase.ifelse_reduce_mirror(ier.f, a, ier.a)
-@inline (ier::IfElseReduceToMirror)(a::AbstractSIMD{W}, ::AbstractSIMD{W}) where {W} = a
+@inline (ier::IfElseReduceToMirror)(
+  a::AbstractSIMD{W},
+  ::AbstractSIMD{W}
+) where {W} = a
 @inline function (ier::IfElseReduceToMirror)(a::AbstractSIMD, b::AbstractSIMD)
   x, y = VectorizationBase.splitvector(a) # halve recursively
   w, z = VectorizationBase.splitvector(ier.a) # halve recursively
@@ -452,7 +464,11 @@ end
   IfElseReduceToMirror(f, ifelse(fwz, w, z))(ifelse(fwz, x, y), b)
 end
 @inline (ier::IfElseReduceToMirror)(a::VecUnroll, b::VecUnroll) = VecUnroll(
-  VectorizationBase.fmap(ier, VectorizationBase.data(a), VectorizationBase.data(b)),
+  VectorizationBase.fmap(
+    ier,
+    VectorizationBase.data(a),
+    VectorizationBase.data(b)
+  )
 )
 
 # @inline (iec::IfElseCollapserMirror)(a) = getfield(VectorizationBase.ifelse_collapse_mirror(iec.f, a, iec.a), 1, false)
@@ -525,10 +541,11 @@ const REDUCTION_CLASS = Dict{Symbol,Float64}(
   :max_fast => MAX,
   :min_fast => MIN,
   :vfmaddsub => ADDITIVE_IN_REDUCTIONS,
-  :vfmsubadd => ADDITIVE_IN_REDUCTIONS,
+  :vfmsubadd => ADDITIVE_IN_REDUCTIONS
 )
 reduction_instruction_class(instr::Symbol) = get(REDUCTION_CLASS, instr, NaN)
-reduction_instruction_class(instr::Instruction) = reduction_instruction_class(instr.instr)
+reduction_instruction_class(instr::Instruction) =
+  reduction_instruction_class(instr.instr)
 function reduction_to_single_vector(x::Float64)
   if x == ADDITIVE_IN_REDUCTIONS
     :collapse_add
@@ -546,8 +563,7 @@ function reduction_to_single_vector(x::Float64)
     throw("Reduction not found.")
   end
 end
-function reduce_to_onevecunroll(x::Float64)
-  if x == ADDITIVE_IN_REDUCTIONS
+reduce_to_onevecunroll(x::Float64) = if x == ADDITIVE_IN_REDUCTIONS
     :+
   elseif x == MULTIPLICATIVE_IN_REDUCTIONS
     :*
@@ -562,9 +578,7 @@ function reduce_to_onevecunroll(x::Float64)
   else
     throw("Reduction not found.")
   end
-end
-function reduce_number_of_vectors(x::Float64)
-  if x == ADDITIVE_IN_REDUCTIONS
+reduce_number_of_vectors(x::Float64) = if x == ADDITIVE_IN_REDUCTIONS
     :contract_add
   elseif x == MULTIPLICATIVE_IN_REDUCTIONS
     :contract_mul
@@ -579,9 +593,7 @@ function reduce_number_of_vectors(x::Float64)
   else
     throw("Reduction not found.")
   end
-end
-function reduction_to_scalar(x::Float64)
-  if x == ADDITIVE_IN_REDUCTIONS
+reduction_to_scalar(x::Float64) = if x == ADDITIVE_IN_REDUCTIONS
     :vsum
   elseif x == MULTIPLICATIVE_IN_REDUCTIONS
     :vprod
@@ -596,7 +608,6 @@ function reduction_to_scalar(x::Float64)
   else
     throw("Reduction not found.")
   end
-end
 function reduction_scalar_combine(x::Float64)
   # x == 1.0 ? :reduced_add : x == 2.0 ? :reduced_prod : x == 3.0 ? :reduced_any : x == 4.0 ? :reduced_all : x == 5.0 ? :reduced_max : x == 6.0 ? :reduced_min : throw("Reduction not found.")
   if x == ADDITIVE_IN_REDUCTIONS
@@ -653,7 +664,6 @@ function reduction_zero_class(x::Symbol)::Float64
 end
 reduction_zero(x) = reduction_zero(reduction_instruction_class(x))
 
-
 function isreductcombineinstr(instr::Symbol)
   instr ∈ (
     :reduced_add,
@@ -663,7 +673,7 @@ function isreductcombineinstr(instr::Symbol)
     :reduced_max,
     :reduced_min,
     :reduce_to_max,
-    :reduce_to_min,
+    :reduce_to_min
   )
 end
 isreductcombineinstr(instr::Instruction) = isreductcombineinstr(instr.instr)
@@ -765,7 +775,7 @@ const FUNCTIONSYMBOLS = IdDict{Type{<:Function},Instruction}(
   typeof(ifelse) => :ifelse,
   typeof(identity) => :identity,
   typeof(conj) => :identity,#conj,
-  typeof(÷) => :vdiv_fast,
+  typeof(÷) => :vdiv_fast
   # typeof(zero) => :zero,
   # typeof(one) => :one,
   # typeof(axes) => :axes,
diff --git a/src/modeling/determinestrategy.jl b/src/modeling/determinestrategy.jl
index ae4576234..5cc04d68c 100644
--- a/src/modeling/determinestrategy.jl
+++ b/src/modeling/determinestrategy.jl
@@ -19,8 +19,8 @@ function check_linear_parents(ls::LoopSet, op::Operation, s::Symbol)
       :add_fast,
       :(-),
       :vsub,
-      :sub_fast,
-    ),
+      :sub_fast
+    )
   ) || return false
   for opp ∈ parents(op)
     check_linear_parents(ls, opp, s) || return false
@@ -61,7 +61,13 @@ function unitstride(ls::LoopSet, op::Operation, s::Symbol)
   end
   true
 end
-function cannot_shuffle(op::Operation, u₁::Symbol, u₂::Symbol, contigind::Symbol, indices) # assumes isvectorized and !unitstride
+function cannot_shuffle(
+  op::Operation,
+  u₁::Symbol,
+  u₂::Symbol,
+  contigind::Symbol,
+  indices
+) # assumes isvectorized and !unitstride
   !((
     !rejectcurly(op) && (
       (
@@ -77,7 +83,7 @@ function cost(
   (u₁, u₂)::Tuple{Symbol,Symbol},
   vloopsym::Symbol,
   Wshift::Int,
-  size_T::Int = op.elementbytes,
+  size_T::Int = op.elementbytes
 )
   isconstant(op) && return 0.0, 0, 1.0#Float64(length(loopdependencies(op)) > 0)
   isloopvalue(op) && return 0.0, 0, 0.0
@@ -92,14 +98,15 @@ function cost(
   elseif iscompute(op) && (
     Base.sym_in(
       instruction(op).instr,
-      (:vadd_nsw, :vsub_nsw, :(+), :(-), :add_fast, :sub_fast),
+      (:vadd_nsw, :vsub_nsw, :(+), :(-), :add_fast, :sub_fast)
     ) && all(opp -> (isloopvalue(opp)), parents(op))
   )# || (reg_count(ls) == 32) && (instruction(op).instr === :ifelse))
     # all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op))
     return 0.0, 0, 0.0
   end
   opisvectorized = isvectorized(op)
-  srt, sl, srp = opisvectorized ? vector_cost(instr, Wshift, size_T) : scalar_cost(instr)
+  srt, sl, srp =
+    opisvectorized ? vector_cost(instr, Wshift, size_T) : scalar_cost(instr)
   if accesses_memory(op)
     # either vbroadcast/reductionstore, vmov(a/u)pd, or gather/scatter
     if opisvectorized
@@ -111,8 +118,10 @@ function cost(
         # cannot shuffle false means reject curly
         # either false means shuffle
         dont_shuffle =
-          (Wshift > 3) ||
-          (rejectinterleave(op) && (cannot_shuffle(op, u₁, u₂, contigind, indices)))
+          (Wshift > 3) || (
+            rejectinterleave(op) &&
+            (cannot_shuffle(op, u₁, u₂, contigind, indices))
+          )
         if dont_shuffle
           # offset = 0.0 # gather/scatter, alignment doesn't matter
           r = 1 << shifter
@@ -146,9 +155,7 @@ end
 
 # Base._return_type()
 
-function biggest_type_size(ls::LoopSet)
-  maximum(elsize, operations(ls))
-end
+biggest_type_size(ls::LoopSet) = maximum(elsize, operations(ls))
 function hasintersection(a, b)
   for aᵢ ∈ a, bᵢ ∈ b
     aᵢ === bᵢ && return true
@@ -182,9 +189,10 @@ function evaluate_cost_unroll(
   order::Vector{Symbol},
   vloopsym::Symbol,
   max_cost::Float64 = typemax(Float64),
-  sld::Vector{Vector{Symbol}} = store_load_deps(operations(ls)),
+  sld::Vector{Vector{Symbol}} = store_load_deps(operations(ls))
 )
-  included_vars = fill!(resize!(ls.included_vars, length(operations(ls))), false)
+  included_vars =
+    fill!(resize!(ls.included_vars, length(operations(ls))), false)
   nested_loop_syms = Symbol[]#Set{Symbol}()
   total_cost = 0.0
   iter = 1.0
@@ -206,10 +214,12 @@ function evaluate_cost_unroll(
       # it must also be a subset of defined symbols
       loopdependencies(op) ⊆ nested_loop_syms || continue
       # hasintersection(reduceddependencies(op), nested_loop_syms) && return Inf
-      (isassigned(sld, id) && any(s -> (s ∉ sld[id]), nested_loop_syms)) && return Inf
+      (isassigned(sld, id) && any(s -> (s ∉ sld[id]), nested_loop_syms)) &&
+        return Inf
       included_vars[id] = true
       # TODO: use actual unrolls here?
-      c = first(cost(ls, op, (Symbol(""), Symbol("")), vloopsym, Wshift, size_T))
+      c =
+        first(cost(ls, op, (Symbol(""), Symbol("")), vloopsym, Wshift, size_T))
       total_cost += iter * c
       0.9total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest
     end
@@ -227,13 +237,14 @@ function depchain_cost!(
   Wshift::Int,
   size_T::Int,
   rt::Float64 = 0.0,
-  sl::Int = 0,
+  sl::Int = 0
 )
   skip[identifier(op)] = true
   # depth first search
   for opp ∈ parents(op)
     skip[identifier(opp)] && continue
-    rt, sl = depchain_cost!(ls, skip, opp, unrolled, vloopsym, Wshift, size_T, rt, sl)
+    rt, sl =
+      depchain_cost!(ls, skip, opp, unrolled, vloopsym, Wshift, size_T, rt, sl)
   end
   # Basically assuming memory and compute don't conflict, but everything else does
   # Ie, ignoring the fact that integer and floating point operations likely don't either
@@ -284,7 +295,8 @@ function unroll_no_reductions(ls, order, vloopsym)
   rpc = 0 # register pressure independent of unroll factor
   for op ∈ operations(ls)
     isu₁unrolled(op) || continue
-    rt, sl, rpop = cost(ls, op, (unrolled, Symbol("")), vloopsym, Wshift, size_T)
+    rt, sl, rpop =
+      cost(ls, op, (unrolled, Symbol("")), vloopsym, Wshift, size_T)
     if iscompute(op)
       compute_rt += rt
       compute_l += sl
@@ -305,7 +317,11 @@ function unroll_no_reductions(ls, order, vloopsym)
     # if compute_rt > 40
     # max(VectorizationBase.nextpow2( min( 4, round(Int, compute_rt / memory_rt) ) ), 1)
     # else
-    clamp(round(Int, compute_l / compute_rt), 1, Core.ifelse(compute_rt > 80, 2, 4))
+    clamp(
+      round(Int, compute_l / compute_rt),
+      1,
+      Core.ifelse(compute_rt > 80, 2, 4)
+    )
     # end
   elseif iszero(load_rt)
     iszero(store_rt) ? 4 : max(1, min(4, round(Int, 2compute_rt / store_rt)))
@@ -335,7 +351,8 @@ function unroll_no_reductions(ls, order, vloopsym)
   else
     reg_constraint = max(1, remaining_reg ÷ max(1, round(Int, rpp)))
   end
-  maybe_demote_unroll(ls, clamp(u, 1, reg_constraint), unrolled, vloopsym), unrolled
+  maybe_demote_unroll(ls, clamp(u, 1, reg_constraint), unrolled, vloopsym),
+  unrolled
   # rt = max(compute_rt, load_rt + store_rt)
   # # (iszero(rt) ? 4 : max(1, roundpow2( min( 4, round(Int, 16 / rt) ) ))), unrolled
   # (iszero(rt) ? 4 : max(1, VectorizationBase.nextpow2( min( 4, round(Int, 8 / rt) ) ))), unrolled
@@ -344,7 +361,7 @@ function determine_unroll_factor(
   ls::LoopSet,
   order::Vector{Symbol},
   unrolled::Symbol,
-  vloopsym::Symbol,
+  vloopsym::Symbol
 )
   cacheunrolled!(ls, unrolled, Symbol(""), vloopsym)
   size_T = biggest_type_size(ls)
@@ -361,7 +378,15 @@ function determine_unroll_factor(
   store_recip_throughput = 0.0
   for op ∈ operations(ls)
     if isreduction(op)
-      rt, sl = depchain_cost!(ls, visited_nodes, op, unrolled, vloopsym, Wshift, size_T)
+      rt, sl = depchain_cost!(
+        ls,
+        visited_nodes,
+        op,
+        unrolled,
+        vloopsym,
+        Wshift,
+        size_T
+      )
       if isouterreduction(ls, op) ≠ -1 || unrolled ∉ reduceddependencies(op)
         latency = max(sl, latency)
       end
@@ -403,12 +428,17 @@ function demote_unroll_factor(ls::LoopSet, UF, loop::Loop)
   UF
 end
 
-function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vloopsym::Symbol)
+function determine_unroll_factor(
+  ls::LoopSet,
+  order::Vector{Symbol},
+  vloopsym::Symbol
+)
   num_reductions = count_reductions(ls)
   # The strategy is to use an unroll factor of 1, unless there appears to be loop carried dependencies (ie, num_reductions > 0)
   # The assumption here is that unrolling provides no real benefit, unless it is needed to enable OOO execution by breaking up these dependency chains
   loopindexesbit = ls.loopindexesbit
-  if iszero(length(loopindexesbit)) || ((!loopindexesbit[getloopid(ls, vloopsym)]))
+  if iszero(length(loopindexesbit)) ||
+     ((!loopindexesbit[getloopid(ls, vloopsym)]))
     if iszero(num_reductions)
       return unroll_no_reductions(ls, order, vloopsym)
     else
@@ -418,7 +448,8 @@ function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vloopsym::S
     return 8 ÷ ls.vector_width, vloopsym
   else # handle `BitArray` loops with reductions
     rttemp, ltemp = determine_unroll_factor(ls, order, vloopsym, vloopsym)
-    UF = min(8, VectorizationBase.nextpow2(max(1, round(Int, ltemp / (rttemp)))))
+    UF =
+      min(8, VectorizationBase.nextpow2(max(1, round(Int, ltemp / (rttemp)))))
     UFfactor = 8 ÷ ls.vector_width
     cld(UF, UFfactor) * UFfactor, vloopsym
     # UF2 = cld(UF, UFfactor)*UFfactor, vloopsym
@@ -431,7 +462,7 @@ function determine_unroll_factor(
   ls::LoopSet,
   order::Vector{Symbol},
   vloopsym::Symbol,
-  num_reductions::Int,
+  num_reductions::Int
 )
   innermost_loop = last(order)
   rt = Inf
@@ -442,7 +473,10 @@ function determine_unroll_factor(
     reject_reorder(ls, unrolled, false) && continue
     rttemp, ltemp = determine_unroll_factor(ls, order, unrolled, vloopsym)
     rtcomptemp =
-      rttemp + (0.01 * ((vloopsym === unrolled) + (unrolled === innermost_loop) - latency))
+      rttemp + (
+        0.01 *
+        ((vloopsym === unrolled) + (unrolled === innermost_loop) - latency)
+      )
     if rtcomptemp < rtcomp
       rt = rttemp
       rtcomp = rtcomptemp
@@ -455,12 +489,18 @@ function determine_unroll_factor(
   if lrtratio ≥ 7.0
     UF = 8
   else
-    UF = VectorizationBase.nextpow2(round(Int, clamp(lrtratio, 1.0, 4.0), RoundUp))
+    UF =
+      VectorizationBase.nextpow2(round(Int, clamp(lrtratio, 1.0, 4.0), RoundUp))
   end
   UF = maybe_demote_unroll(ls, UF, best_unrolled, vloopsym)
   UF, best_unrolled
 end
-function maybe_demote_unroll(ls::LoopSet, UF::Int, unrollsym::Symbol, vloopsym::Symbol)::Int
+function maybe_demote_unroll(
+  ls::LoopSet,
+  UF::Int,
+  unrollsym::Symbol,
+  vloopsym::Symbol
+)::Int
   if unrollsym === vloopsym
     return demote_unroll_factor(ls, UF, vloopsym)
   else
@@ -513,12 +553,13 @@ function solve_unroll_lagrange(
   u₂L,
   u₁step::Int,
   u₂step::Int,
-  atleast31registers::Bool,
+  atleast31registers::Bool
 )
   X₁, X₂, X₃, X₄ = X[1], X[2], X[3], X[4]
   # If we don't have opmask registers, masks probably occupy a vector register (e.g., on CPUs with AVX but not AVX512)
   R₁, R₂, R₃, R₄ = R[1], R[2], R[3], R[4]
-  iszero(R₃) || return solve_unroll_iter(X, R, u₁L, u₂L, u₁step:u₁step:10, u₂step:u₂step:10)
+  iszero(R₃) ||
+    return solve_unroll_iter(X, R, u₁L, u₂L, u₁step:u₁step:10, u₂step:u₂step:10)
   RR = R₄
   a = R₂^2 * X₃ - R₁ * X₄ * R₂ - R₁ * X₂ * RR
   b = R₁ * X₄ * RR - R₁ * X₄ * RR - 2X₃ * RR * R₂
@@ -530,9 +571,18 @@ function solve_unroll_lagrange(
   u₁float_finite = isfinite(u₁float)
   u₂float_finite = isfinite(u₂float)
   if !(u₁float_finite & u₂float_finite) # brute force
-    u₁high = Core.ifelse(iszero(X₃), u₁step, Core.ifelse(atleast31registers, 8, 6))
-    u₂high = Core.ifelse(iszero(X₂), u₂step, Core.ifelse(atleast31registers, 8, 6))
-    return solve_unroll_iter(X, R, u₁L, u₂L, u₁step:u₁step:u₁high, u₂step:u₂step:u₂high)
+    u₁high =
+      Core.ifelse(iszero(X₃), u₁step, Core.ifelse(atleast31registers, 8, 6))
+    u₂high =
+      Core.ifelse(iszero(X₂), u₂step, Core.ifelse(atleast31registers, 8, 6))
+    return solve_unroll_iter(
+      X,
+      R,
+      u₁L,
+      u₂L,
+      u₁step:u₁step:u₁high,
+      u₂step:u₂step:u₂high
+    )
   end
   u₁low = floor(Int, u₁float)
   u₂low = max(u₂step, floor(Int, 0.8u₂float)) # must be at least 1
@@ -555,7 +605,7 @@ function solve_unroll_lagrange(
     u₁L,
     u₂L,
     reverse(u₁low:u₁step:u₁high),
-    reverse(u₂low:u₂step:u₂high),
+    reverse(u₂low:u₂step:u₂high)
   )
 end
 
@@ -585,10 +635,11 @@ function solve_unroll(
   u₂L,
   u₁step,
   u₂step,
-  atleast31registers::Bool,
+  atleast31registers::Bool
 )
   # iszero(first(R)) && return -1,-1,Inf #solve_smalltilesize(X, R, u₁max, u₂max)
-  u₁, u₂, cost = solve_unroll_lagrange(X, R, u₁L, u₂L, u₁step, u₂step, atleast31registers)
+  u₁, u₂, cost =
+    solve_unroll_lagrange(X, R, u₁L, u₂L, u₁step, u₂step, atleast31registers)
   # u₂ -= u₂ & 1
   # u₁ = min(u₁, u₂)
   u₁_too_large = u₁ > u₁max
@@ -608,9 +659,7 @@ function solve_unroll(
   end
   u₁, u₂, cost
 end
-function maybedemotesize(U::Int, N::Int)
-  num_iterations(N, num_iterations(N, U))
-end
+maybedemotesize(U::Int, N::Int) = num_iterations(N, num_iterations(N, U))
 function maybedemotesize(u₂::Int, N::Int, U::Int, Uloop::Loop, maxu₂base::Int)
   u₂ > 1 || return 1
   u₂ == N && return u₂
@@ -631,7 +680,7 @@ function solve_unroll(
   reg_pressure::AbstractVector{Float64},
   W::Int,
   vloopsym::Symbol,
-  rounduᵢ::Int,
+  rounduᵢ::Int
 )
   (u₁step, u₂step) = if rounduᵢ == 1 # max is to safeguard against some weird arch I've never heard of.
     (clamp(cache_lnsze(ls) ÷ reg_size(ls), 1, 4), 1)
@@ -657,7 +706,7 @@ function solve_unroll(
     u₂loop,
     u₁step,
     u₂step,
-    reg_count(ls) ≥ 31,
+    reg_count(ls) ≥ 31
   )
 end
 
@@ -672,7 +721,7 @@ function solve_unroll(
   u₂loop::Loop,
   u₁step::Int,
   u₂step::Int,
-  atleast31registers::Bool,
+  atleast31registers::Bool
 )
   maxu₂base = maxu₁base = atleast31registers ? 10 : 6#8
   maxu₂ = maxu₂base#8
@@ -724,7 +773,7 @@ function solve_unroll(
     u₂Lf,
     u₁step,
     u₂step,
-    atleast31registers,
+    atleast31registers
   )
   # heuristic to more evenly divide small numbers of iterations
   if isstaticloop(u₂loop)
@@ -765,7 +814,12 @@ function loopdepindices(ls::LoopSet, op::Operation)
   end
   loopdepsret
 end
-function stride_penalty(ls::LoopSet, op::Operation, order::Vector{Symbol}, loopfreqs)
+function stride_penalty(
+  ls::LoopSet,
+  op::Operation,
+  order::Vector{Symbol},
+  loopfreqs
+)
   loopdeps = loopdepindices(ls, op)
   opstrides = Vector{Int}(undef, length(loopdeps))
   # very minor stride assumption here, because we don't really want to base optimization decisions on it...
@@ -931,7 +985,11 @@ function maxnegativeoffset(ls::LoopSet, op::Operation, u::Symbol)
   end
   mno, id
 end
-function maxnegativeoffset(ls::LoopSet, op::Operation, unrollsyms::UnrollSymbols)
+function maxnegativeoffset(
+  ls::LoopSet,
+  op::Operation,
+  unrollsyms::UnrollSymbols
+)
   @unpack u₁loopsym, u₂loopsym, vloopsym = unrollsyms
   mno = typemin(Int)
   i = 0
@@ -960,7 +1018,7 @@ function load_elimination_cost_factor!(
   iters,
   unrollsyms::UnrollSymbols,
   Wshift,
-  size_T,
+  size_T
 )
   @unpack u₁loopsym, u₂loopsym, vloopsym = unrollsyms
   if !iszero(first(isoptranslation(ls, op, unrollsyms)))
@@ -1052,7 +1110,7 @@ function add_constant_offset_load_elmination_cost!(
   u₂reduces::Bool,
   Wshift::Int,
   size_T::Int,
-  opisininnerloop::Bool,
+  opisininnerloop::Bool
 )
   @unpack u₁loopsym, u₂loopsym, vloopsym = unrollsyms
   offset, uid = maxnegativeoffset(ls, op, unrollsyms)
@@ -1096,8 +1154,8 @@ function add_constant_offset_load_elmination_cost!(
   end
 end
 
-function update_cost_vec!(costs, cost, u₁reduces, u₂reduces)
-  @inbounds if u₁reduces & u₂reduces
+update_cost_vec!(costs, cost, u₁reduces, u₂reduces) = @inbounds if u₁reduces &
+                                                                   u₂reduces
     costs[4] += cost
   elseif u₂reduces # cost decreased by unrolling u₂loop
     costs[2] += cost
@@ -1106,9 +1164,7 @@ function update_cost_vec!(costs, cost, u₁reduces, u₂reduces)
   else # no cost decrease; cost must be repeated
     costs[1] += cost
   end
-end
-function update_reg_pres!(rp, cost, u₁reduces, u₂reduces)
-  @inbounds if u₁reduces# & u₂reduces
+update_reg_pres!(rp, cost, u₁reduces, u₂reduces) = @inbounds if u₁reduces# & u₂reduces
     rp[4] -= cost
   elseif u₂reduces # cost decreased by unrolling u₂loop
     rp[2] += cost
@@ -1117,7 +1173,6 @@ function update_reg_pres!(rp, cost, u₁reduces, u₂reduces)
   else # no cost decrease; cost must be repeated
     rp[1] += cost
   end
-end
 function child_dependent_u₁u₂(op::Operation)
   u₁ = u₂ = false
   for opc ∈ children(op)
@@ -1130,13 +1185,20 @@ function evaluate_cost_tile(
   ls::LoopSet,
   order::Vector{Symbol},
   unrollsyms::UnrollSymbols,
-  anyisbit::Bool = false,
+  anyisbit::Bool = false
 )
   nops = length(operations(ls))
   iters = Vector{Float64}(undef, nops)
   reduced_by_unrolling = Array{Bool}(undef, 2, 2, nops)
   fill_children!(ls)
-  evaluate_cost_tile!(iters, reduced_by_unrolling, ls, order, unrollsyms, anyisbit)
+  evaluate_cost_tile!(
+    iters,
+    reduced_by_unrolling,
+    ls,
+    order,
+    unrollsyms,
+    anyisbit
+  )
 end
 function evaluate_cost_tile!(
   iters::Vector{Float64},
@@ -1146,7 +1208,7 @@ function evaluate_cost_tile!(
   unrollsyms::UnrollSymbols,
   anyisbit::Bool,
   sld::Vector{Vector{Symbol}} = store_load_deps(operations(ls)),
-  holdopinreg::Vector{Bool} = holdopinregister(ls),
+  holdopinreg::Vector{Bool} = holdopinregister(ls)
 )
   N = length(order)
   @assert N ≥ 2 "Cannot tile merely $N loops!"
@@ -1231,7 +1293,8 @@ function evaluate_cost_tile!(
         return 0, 0, Inf, false
       included_vars[id] = true
       if isconstant(op)
-        depends_on_u₁, depends_on_u₂ = isunrolled_sym(op, u₁loopsym, u₂loopsym, vloopsym)
+        depends_on_u₁, depends_on_u₂ =
+          isunrolled_sym(op, u₁loopsym, u₂loopsym, vloopsym)
         reduced_by_unrolling[1, 1, id] = !depends_on_u₁
         reduced_by_unrolling[2, 1, id] = !depends_on_u₂
       else
@@ -1276,7 +1339,7 @@ function evaluate_cost_tile!(
         u₂reducesrp,
         Wshift,
         size_T,
-        opisininnerloop,
+        opisininnerloop
       )
         continue
       elseif load_elimination_cost_factor!(
@@ -1288,7 +1351,7 @@ function evaluate_cost_tile!(
         iters[id],
         unrollsyms,
         Wshift,
-        size_T,
+        size_T
       )
         continue
       end
@@ -1336,7 +1399,8 @@ function evaluate_cost_tile!(
   # @inbounds ((cost_vec[4] > 0) || ((cost_vec[2] > 0) & (cost_vec[3] > 0))) || return 0,0,Inf,false
   # reg_pres[4] == remaining_registers
   costpenalty =
-    ((reg_pressure[1] + reg_pressure[2] + reg_pressure[3]) > reg_pressure[4]) ? 2 : 1
+    ((reg_pressure[1] + reg_pressure[2] + reg_pressure[3]) > reg_pressure[4]) ?
+    2 : 1
   u₁v = vloopsym === u₁loopsym
   u₂v = vloopsym === u₂loopsym
   visbit = anyisbit && ls.loopindexesbit[getloopid(ls, vloopsym)]
@@ -1360,18 +1424,27 @@ function evaluate_cost_tile!(
       1,
       1,
       length(getloop(ls, u₁loopsym)),
-      length(getloop(ls, u₂loopsym)),
+      length(getloop(ls, u₂loopsym))
     )
   else
-    u₁, u₂, ucost =
-      solve_unroll(ls, u₁loopsym, u₂loopsym, cost_vec, reg_pressure, W, vloopsym, round_uᵢ)
+    u₁, u₂, ucost = solve_unroll(
+      ls,
+      u₁loopsym,
+      u₂loopsym,
+      cost_vec,
+      reg_pressure,
+      W,
+      vloopsym,
+      round_uᵢ
+    )
   end
   outer_reduct_penalty = length(ls.outer_reductions) * (u₁ + isodd(u₁))
   favor_bigger_u₂ = u₁ - u₂
   # favor_smaller_vloopsym = (u₁v ? u₁ : -u₁) + (u₂v ?  u₂ : -u₂)
   favor_smaller_vectorized = (u₁v ⊻ u₂v) ? (u₁v ? u₁ - u₂ : u₂ - u₁) : 0
   favor_u₁_vectorized = -0.2u₁v
-  favoring_heuristics = favor_bigger_u₂ + 0.5favor_smaller_vectorized + favor_u₁_vectorized
+  favoring_heuristics =
+    favor_bigger_u₂ + 0.5favor_smaller_vectorized + favor_u₁_vectorized
   costpenalty =
     costpenalty * ucost +
     stride_penalty(ls, order) +
@@ -1461,7 +1534,11 @@ function LoopOrders(ls::LoopSet)
   else
     reductsyms, nonreductsyms = outer_reduct_loopordersplit(ls)
   end
-  LoopOrders(nonreductsyms, reductsyms, Vector{Symbol}(undef, length(ls.loopsymbols)))
+  LoopOrders(
+    nonreductsyms,
+    reductsyms,
+    Vector{Symbol}(undef, length(ls.loopsymbols))
+  )
 end
 
 nonreductview(lo::LoopOrders) = view(lo.buff, 1:length(lo.syms_nr))
@@ -1503,7 +1580,7 @@ swap!(x::AbstractVector, i::Int, j::Int) = (x[j], x[i]) = (x[i], x[j])
 function swap!(
   dest::AbstractVector{Symbol},
   src::AbstractVector{Symbol},
-  offs::AbstractVector{Int},
+  offs::AbstractVector{Int}
 )
   copyto!(dest, src)
   for i ∈ eachindex(offs)
@@ -1539,7 +1616,7 @@ function choose_unroll_order(
   ls::LoopSet,
   lowest_cost::Float64 = Inf,
   sld::Vector{Vector{Symbol}} = store_load_deps(operations(ls)),
-  v::Int = 0,
+  v::Int = 0
 )
   iszero(length(offsetloadcollection(ls).opidcollectionmap)) &&
     fill_offset_memop_collection!(ls)
@@ -1565,8 +1642,6 @@ function choose_unroll_order(
   end
 end
 
-
-
 """
 This function searches for unrolling combinations that will cause LoopVectorization to generate invalid code.
 
@@ -1577,7 +1652,6 @@ But separate vectors for a reduced loop are not needed. Separate vectors will he
 so you want to unroll at least one of the loops. However, reductions demand combining all the separate vectors,
 and each vector also eats a valuable register, so it's best to avoid excessive numbers these accumulation vectors.
 
-
 If a reduced op depends on both unrolled loops (u1 and u2), it will check over which of these it is reduced. If...
 neither: cannot avoid unrolling it along both
 one of them: don't unroll the reduced loop
@@ -1632,7 +1706,7 @@ inlinedecision(inline::Int, shouldinline::Bool) =
 function choose_tile(
   ls::LoopSet,
   sld::Vector{Vector{Symbol}} = store_load_deps(operations(ls)),
-  v::Int = 0,
+  v::Int = 0
 )
   iszero(length(offsetloadcollection(ls).opidcollectionmap)) &&
     fill_offset_memop_collection!(ls)
@@ -1673,7 +1747,7 @@ function choose_tile(
             UnrollSymbols(newu₁, newu₂, new_vec),
             anyisbit,
             sld,
-            holdopinreg,
+            holdopinreg
           )
           # if cost_temp < lowest_cost # leads to 4 vmovapds
           if cost_temp ≤ lowest_cost # lead to 2 vmovapds
@@ -1724,7 +1798,8 @@ function choose_order_cost(ls::LoopSet, v::Int = 0)
   resize!(ls.loop_order, length(ls.loopsymbols))
   sld = store_load_deps(operations(ls))
   if (num_loops(ls) > 1) && (length(ls.operations) ≤ 100)
-    torder, tunroll, ttile, tvec, tU, tT, tc, shouldinline = choose_tile(ls, sld, v)
+    torder, tunroll, ttile, tvec, tU, tT, tc, shouldinline =
+      choose_tile(ls, sld, v)
   else
     torder = names(ls) # dummy
     tunroll = ttile = tvec = Symbol("##undefined##") # dummy
diff --git a/src/modeling/graphs.jl b/src/modeling/graphs.jl
index 5f38d4a07..8d41cf726 100644
--- a/src/modeling/graphs.jl
+++ b/src/modeling/graphs.jl
@@ -31,9 +31,12 @@ UnPack.unpack(ua::UnrollArgs, ::Val{:u₂loopsym}) =
   getfield(getfield(ua, :u₂loop), :itersymbol)
 UnPack.unpack(ua::UnrollArgs, ::Val{:vloopsym}) =
   getfield(getfield(ua, :vloop), :itersymbol)
-UnPack.unpack(ua::UnrollArgs, ::Val{:u₁step}) = getfield(getfield(ua, :u₁loop), :step)
-UnPack.unpack(ua::UnrollArgs, ::Val{:u₂step}) = getfield(getfield(ua, :u₂loop), :step)
-UnPack.unpack(ua::UnrollArgs, ::Val{:vstep}) = getfield(getfield(ua, :vloop), :step)
+UnPack.unpack(ua::UnrollArgs, ::Val{:u₁step}) =
+  getfield(getfield(ua, :u₁loop), :step)
+UnPack.unpack(ua::UnrollArgs, ::Val{:u₂step}) =
+  getfield(getfield(ua, :u₂loop), :step)
+UnPack.unpack(ua::UnrollArgs, ::Val{:vstep}) =
+  getfield(getfield(ua, :vloop), :step)
 
 struct UnrollSpecification
   u₁loopnum::Int
@@ -52,7 +55,8 @@ end
 #     UnrollSpecification(u₁loopnum, u₂loopnum, vloopnum, u₁, u₂)
 # end
 isunrolled1(us::UnrollSpecification, n::Int) = us.u₁loopnum == n
-isunrolled2(us::UnrollSpecification, n::Int) = !isunrolled1(us, n) && us.u₂loopnum == n
+isunrolled2(us::UnrollSpecification, n::Int) =
+  !isunrolled1(us, n) && us.u₂loopnum == n
 isvectorized(us::UnrollSpecification, n::Int) = us.vloopnum == n
 function unrollfactor(us::UnrollSpecification, n::Int)
   @unpack u₁loopnum, u₂loopnum, u₁, u₂ = us
@@ -67,9 +71,11 @@ function pushexpr!(ex::Expr, mk::MaybeKnown)
   nothing
 end
 pushexpr!(ex::Expr, x::Union{Symbol,Expr}) = (push!(ex.args, x); nothing)
-pushexpr!(ex::Expr, x::Integer) = (push!(ex.args, staticexpr(convert(Int, x))); nothing)
+pushexpr!(ex::Expr, x::Integer) =
+  (push!(ex.args, staticexpr(convert(Int, x))); nothing)
 pushexpr!(ex::Expr, @nospecialize(x::StaticInt)) = (push!(ex.args, x); nothing)
-MaybeKnown(x::Integer) = MaybeKnown(convert(Int, x), Symbol("##UNDEFINED##"), true)
+MaybeKnown(x::Integer) =
+  MaybeKnown(convert(Int, x), Symbol("##UNDEFINED##"), true)
 MaybeKnown(x::Integer, default::Int) = MaybeKnown(x)
 MaybeKnown(x::Symbol, default::Int) = MaybeKnown(default, x, false)
 
@@ -86,7 +92,7 @@ function Loop(
   stop::Union{Int,Symbol},
   step::Union{Int,Symbol},
   rangename::Symbol,
-  lensym::Symbol,
+  lensym::Symbol
 )
   Loop(
     itersymbol,
@@ -94,7 +100,7 @@ function Loop(
     MaybeKnown(stop, 1024),
     MaybeKnown(step, 1),
     rangename,
-    lensym,
+    lensym
   )
 end
 startstopΔ(loop::Loop) = gethint(last(loop)) - gethint(first(loop))
@@ -110,8 +116,6 @@ Base.step(l::Loop) = getfield(l, :step)
 isstaticloop(l::Loop) = isknown(first(l)) & isknown(last(l)) & isknown(step(l))
 unitstep(l::Loop) = isone(step(l))
 
-
-
 function startloop(loop::Loop, itersymbol, staticinit::Bool = false)
   start = first(loop)
   if isknown(start)
@@ -142,7 +146,12 @@ addexpr(a, b) = arithmeticexpr(+, :vadd_nsw, a, b)
 subexpr(a, b) = arithmeticexpr(-, :vsub_nsw, a, b)
 mulexpr(a, b) = arithmeticexpr(*, :vmul_nsw, a, b)
 lazymulexpr(a, b) = arithmeticexpr(*, :lazymul, a, b)
-function arithmeticexpr(op, f, a::Union{Integer,MaybeKnown}, b::Union{Integer,MaybeKnown})
+function arithmeticexpr(
+  op,
+  f,
+  a::Union{Integer,MaybeKnown},
+  b::Union{Integer,MaybeKnown}
+)
   if isknown(a) & isknown(b)
     return staticexpr(op(gethint(a), gethint(b)))
   else
@@ -213,7 +222,8 @@ function addexpr(ex, incr::Integer)
   pushexpr!(expr, convert(Int, incr))
   expr
 end
-staticmulincr(ptr, incr) = Expr(:call, lv(:staticmul), Expr(:call, :eltype, ptr), incr)
+staticmulincr(ptr, incr) =
+  Expr(:call, lv(:staticmul), Expr(:call, :eltype, ptr), incr)
 
 @inline cmpend(i::Int, r::AbstractCloseOpen) = i < getfield(r, :upper)
 @inline cmpend(i::Int, r::AbstractUnitRange) = i ≤ last(r)
@@ -237,14 +247,19 @@ function staticloopexpr(loop::Loop)
   s = step(loop)
   l = last(loop)
   if isone(s)
-    Expr(:call, GlobalRef(Base, :(:)), staticexpr(gethint(f)), staticexpr(gethint(l)))
+    Expr(
+      :call,
+      GlobalRef(Base, :(:)),
+      staticexpr(gethint(f)),
+      staticexpr(gethint(l))
+    )
   else
     Expr(
       :call,
       GlobalRef(Base, :(:)),
       staticexpr(gethint(f)),
       staticexpr(gethint(s)),
-      staticexpr(gethint(l)),
+      staticexpr(gethint(l))
     )
   end
 end
@@ -256,7 +271,12 @@ function vec_looprange(loop::Loop, UF::Int, mangledname)
     vec_looprange(UF, mangledname, loop.rangesym, fast)
   end
 end
-function vec_looprange(UF::Int, mangledname, r::Union{Expr,Symbol}, zerostart::Bool)
+function vec_looprange(
+  UF::Int,
+  mangledname,
+  r::Union{Expr,Symbol},
+  zerostart::Bool
+)
   cmp = zerostart ? lv(:vcmpendzs) : lv(:vcmpend)
   if isone(UF)
     Expr(:call, cmp, mangledname, r, VECTORWIDTHSYMBOL)
@@ -285,7 +305,7 @@ function terminatecondition(
   n::Int,
   mangledname::Symbol,
   inclmask::Bool,
-  UF::Int = unrollfactor(us, n),
+  UF::Int = unrollfactor(us, n)
 )
   if !isvectorized(us, n)
     looprange(loop, UF, mangledname)
@@ -301,7 +321,7 @@ function incrementloopcounter(
   n::Int,
   mangledname::Symbol,
   UF::Int,
-  l::Loop,
+  l::Loop
 )
   incr = step(l)
   if isknown(incr)
@@ -310,7 +330,12 @@ function incrementloopcounter(
     incrementloopcounter(us, n, mangledname, UF, getsym(incr))
   end
 end
-function incrementloopcounter(us::UnrollSpecification, n::Int, mangledname::Symbol, UF::Int)
+function incrementloopcounter(
+  us::UnrollSpecification,
+  n::Int,
+  mangledname::Symbol,
+  UF::Int
+)
   if isvectorized(us, n)
     if isone(UF)
       Expr(:(=), mangledname, addexpr(VECTORWIDTHSYMBOL, mangledname))
@@ -318,7 +343,7 @@ function incrementloopcounter(us::UnrollSpecification, n::Int, mangledname::Symb
       Expr(
         :(=),
         mangledname,
-        addexpr(mulexpr(VECTORWIDTHSYMBOL, staticexpr(UF)), mangledname),
+        addexpr(mulexpr(VECTORWIDTHSYMBOL, staticexpr(UF)), mangledname)
       )
     end
   else
@@ -330,16 +355,23 @@ function incrementloopcounter(
   n::Int,
   mangledname::Symbol,
   UF::Int,
-  incr::Symbol,
+  incr::Symbol
 )
   if isvectorized(us, n)
     if isone(UF)
-      Expr(:(=), mangledname, addexpr(mulexpr(VECTORWIDTHSYMBOL, incr), mangledname))
+      Expr(
+        :(=),
+        mangledname,
+        addexpr(mulexpr(VECTORWIDTHSYMBOL, incr), mangledname)
+      )
     else
       Expr(
         :(=),
         mangledname,
-        addexpr(mulexpr(mulexpr(VECTORWIDTHSYMBOL, staticexpr(UF)), incr), mangledname),
+        addexpr(
+          mulexpr(mulexpr(VECTORWIDTHSYMBOL, staticexpr(UF)), incr),
+          mangledname
+        )
       )
     end
   else
@@ -347,7 +379,13 @@ function incrementloopcounter(
   end
 end
 
-function incrementloopcounter!(q, us::UnrollSpecification, n::Int, UF::Int, l::Loop)
+function incrementloopcounter!(
+  q,
+  us::UnrollSpecification,
+  n::Int,
+  UF::Int,
+  l::Loop
+)
   incr = step(l)
   if isknown(incr)
     incrementloopcounter!(q, us, n, UF * gethint(incr))
@@ -366,7 +404,13 @@ function incrementloopcounter!(q, us::UnrollSpecification, n::Int, UF::Int)
     push!(q.args, staticexpr(UF))
   end
 end
-function incrementloopcounter!(q, us::UnrollSpecification, n::Int, UF::Int, incr::Symbol)
+function incrementloopcounter!(
+  q,
+  us::UnrollSpecification,
+  n::Int,
+  UF::Int,
+  incr::Symbol
+)
   if isvectorized(us, n)
     if isone(UF)
       push!(q.args, mulexpr(VECTORWIDTHSYMBOL, incr))
@@ -404,12 +448,13 @@ function Base.resize!(lo::LoopOrder, N::Int)
 end
 Base.size(lo::LoopOrder) = (2, 2, 2, length(lo.loopnames))
 Base.@propagate_inbounds Base.getindex(lo::LoopOrder, i::Int) = lo.oporder[i]
-Base.@propagate_inbounds Base.getindex(lo::LoopOrder, i::Vararg{Int,K}) where {K} =
-  lo.oporder[LinearIndices(size(lo))[i...]]
+Base.@propagate_inbounds Base.getindex(
+  lo::LoopOrder,
+  i::Vararg{Int,K}
+) where {K} = lo.oporder[LinearIndices(size(lo))[i...]]
 
 @enum NumberType::Int8 HardInt HardFloat IntOrFloat INVALID
 
-
 struct LoopStartStopManager
   terminators::Vector{Int}
   incrementedptrs::Vector{Vector{ArrayReferenceMeta}}
@@ -466,16 +511,16 @@ function UnrollArgs(
   u₁::Int,
   unrollsyms::UnrollSymbols,
   u₂max::Int,
-  suffix::Int,
+  suffix::Int
 )
   @unpack u₁loopsym, u₂loopsym, vloopsym = unrollsyms
   u₁loop = getloop(ls, u₁loopsym)
-  u₂loop = u₂loopsym === Symbol("##undefined##") ? u₁loop : getloop(ls, u₂loopsym)
+  u₂loop =
+    u₂loopsym === Symbol("##undefined##") ? u₁loop : getloop(ls, u₂loopsym)
   vloop = getloop(ls, vloopsym)
   UnrollArgs(u₁loop, u₂loop, vloop, u₁, u₂max, suffix)
 end
 
-
 function cost_vec_buf(ls::LoopSet)
   cv = @view(ls.cost_vec[:, 2])
   @inbounds for i ∈ 1:4
@@ -509,7 +554,12 @@ end
 available_registers() =
   ifelse(has_opmask_registers(), register_count(), register_count() - One())
 function set_hw!(ls::LoopSet)
-  set_hw!(ls, Int(register_size()), Int(available_registers()), Int(cache_linesize()))
+  set_hw!(
+    ls,
+    Int(register_size()),
+    Int(available_registers()),
+    Int(cache_linesize())
+  )
 end
 reg_size(ls::LoopSet) = ls.register_size
 reg_count(ls::LoopSet) = ls.register_count
@@ -609,27 +659,25 @@ function LoopSet(mod::Symbol)
 end
 
 """
-  Used internally to create symbols unique for this loopset.
-  This is used so that identical loops will create identical `_turbo_!` calls in the macroexpansions, hopefully reducing recompilation.
-  """
+Used internally to create symbols unique for this loopset.
+This is used so that identical loops will create identical `_turbo_!` calls in the macroexpansions, hopefully reducing recompilation.
+"""
 gensym!(ls::LoopSet, s) = Symbol("###$(s)###$(ls.symcounter += 1)###")
 
-function fill_children!(ls::LoopSet)
-  for op ∈ operations(ls)
+fill_children!(ls::LoopSet) = for op ∈ operations(ls)
     empty!(children(op))
     for opp ∈ parents(op)
       @assert children(opp) !== NOPARENTS
       push!(children(opp), op)
     end
   end
-end
 function rejectinterleave!(
   ls::LoopSet,
   op::Operation,
   u₁loop::Symbol,
   u₂loop::Symbol,
   vloopsym::Symbol,
-  vloop::Loop,
+  vloop::Loop
 )
   setunrolled!(ls, op, u₁loop, u₂loop, vloopsym)
   if accesses_memory(op)
@@ -647,7 +695,12 @@ function rejectinterleave!(
     end
   end
 end
-function cacheunrolled!(ls::LoopSet, u₁loop::Symbol, u₂loop::Symbol, vloopsym::Symbol)
+function cacheunrolled!(
+  ls::LoopSet,
+  u₁loop::Symbol,
+  u₂loop::Symbol,
+  vloopsym::Symbol
+)
   vloop = getloop(ls, vloopsym)
   for op ∈ operations(ls)
     rejectinterleave!(ls, op, u₁loop, u₂loop, vloopsym, vloop)
@@ -658,7 +711,7 @@ function setunrolled!(
   op::Operation,
   u₁loopsym::Symbol,
   u₂loopsym::Symbol,
-  vectorized::Symbol,
+  vectorized::Symbol
 )
   u₁::Bool = u₂::Bool = v::Bool = false
   for ld ∈ loopdependencies(op)
@@ -735,9 +788,10 @@ end
 # looprangesym(ls::LoopSet, s::Symbol) = getloop(ls, s).rangesym
 
 """
-  getop only works while construction a LoopSet object. You cannot use it while lowering.
-  """
-getop(ls::LoopSet, var::Number, elementbytes) = add_constant!(ls, var, elementbytes)
+getop only works while constructing a LoopSet object. You cannot use it while lowering.
+"""
+getop(ls::LoopSet, var::Number, elementbytes) =
+  add_constant!(ls, var, elementbytes)
 function getop(ls::LoopSet, var::Symbol, elementbytes::Int)
   get!(ls.opdict, var) do
     add_constant!(ls, var, elementbytes)
@@ -777,7 +831,7 @@ function Operation(
   dependencies,
   reduced_deps,
   parents,
-  ref = NOTAREFERENCE,
+  ref = NOTAREFERENCE
 )
   Operation(
     length(operations(ls)),
@@ -788,7 +842,7 @@ function Operation(
     dependencies,
     reduced_deps,
     parents,
-    ref,
+    ref
   )
 end
 function Operation(
@@ -797,14 +851,24 @@ function Operation(
   elementbytes,
   instr,
   optype,
-  mpref::ArrayReferenceMetaPosition,
+  mpref::ArrayReferenceMetaPosition
 )
-  Operation(length(operations(ls)), variable, elementbytes, instr, optype, mpref)
+  Operation(
+    length(operations(ls)),
+    variable,
+    elementbytes,
+    instr,
+    optype,
+    mpref
+  )
 end
 
 operations(ls::LoopSet) = ls.operations
 
-function getconstvalues(ls::LoopSet, opparents::Vector{Operation})::Tuple{Bool,Vector{Any}}
+function getconstvalues(
+  ls::LoopSet,
+  opparents::Vector{Operation}
+)::Tuple{Bool,Vector{Any}}
   vals = sizehint!(Any[], length(opparents))
   for i ∈ eachindex(opparents)
     pushconstvalue!(vals, ls, opparents[i]) && return true, vals
@@ -812,7 +876,11 @@ function getconstvalues(ls::LoopSet, opparents::Vector{Operation})::Tuple{Bool,V
   false, vals
 end
 
-function add_constant_compute!(ls::LoopSet, op::Operation, var::Symbol)::Operation
+function add_constant_compute!(
+  ls::LoopSet,
+  op::Operation,
+  var::Symbol
+)::Operation
   op.node_type = constant
   instr = instruction(op)
   opparents = parents(op)
@@ -827,8 +895,8 @@ function add_constant_compute!(ls::LoopSet, op::Operation, var::Symbol)::Operati
       :vfmadd_fast,
       :vfnmadd_fast,
       :vfmsub_fast,
-      :vfnmsub_fast,
-    ),
+      :vfnmsub_fast
+    )
   )
     getconstfailed, vals = getconstvalues(ls, opparents)
     if !getconstfailed
@@ -855,25 +923,25 @@ function add_constant_compute!(ls::LoopSet, op::Operation, var::Symbol)::Operati
           return add_constant!(
             ls,
             T((big(vals[1]) * big(vals[2]) + big(vals[3]))),
-            8,
+            8
           )::Operation
         elseif f === :vfnmadd_fast
           return add_constant!(
             ls,
             T(big(vals[3]) - big(vals[1]) * big(vals[2])),
-            8,
+            8
           )::Operation
         elseif f === :vfmsub_fast
           return add_constant!(
             ls,
             T((big(vals[1]) * big(vals[2]) - big(vals[3]))),
-            8,
+            8
           )::Operation
         elseif f === :vfnmsub_fast
           return add_constant!(
             ls,
             T(-(big(vals[1]) * big(vals[2]) + big(vals[3]))),
-            8,
+            8
           )::Operation
         end
       end
@@ -942,20 +1010,21 @@ add_loop_bound!(
   itersym::Symbol,
   bound::Union{Integer,Symbol},
   upper::Bool,
-  step::Bool,
+  step::Bool
 )::MaybeKnown = MaybeKnown(bound, upper ? 1024 : 1)
 function add_loop_bound!(
   ls::LoopSet,
   itersym::Symbol,
   bound::Expr,
   upper::Bool,
-  step::Bool,
+  step::Bool
 )::MaybeKnown
   makestatic!(bound)
   N = gensym!(
     ls,
-    string(itersym) *
-    (upper ? "_loop_upper_bound" : (step ? "_loop_step" : "_loop_lower_bound")),
+    string(itersym) * (
+      upper ? "_loop_upper_bound" : (step ? "_loop_step" : "_loop_lower_bound")
+    )
   )
   pushprepreamble!(ls, Expr(:(=), N, bound))
   MaybeKnown(N, upper ? 1024 : 1)
@@ -965,7 +1034,7 @@ function range_loop!(
   itersym::Symbol,
   l::MaybeKnown,
   u::MaybeKnown,
-  s::MaybeKnown,
+  s::MaybeKnown
 )
   rangename = gensym!(ls, "range")
   lenname = gensym!(ls, "length")
@@ -976,7 +1045,11 @@ function range_loop!(
   pushprepreamble!(ls, Expr(:(=), rangename, range))
   pushprepreamble!(
     ls,
-    Expr(:(=), lenname, Expr(:call, GlobalRef(ArrayInterface, :static_length), rangename)),
+    Expr(
+      :(=),
+      lenname,
+      Expr(:call, GlobalRef(ArrayInterface, :static_length), rangename)
+    )
   )
   Loop(itersym, l, u, s, rangename, lenname)
 end
@@ -1008,7 +1081,10 @@ function oneto_loop!(ls::LoopSet, r::Expr, itersym::Symbol)::Loop
     lensym = N = gensym!(ls, "loop" * string(itersym))
     rangename = gensym!(ls, "range")
     pushprepreamble!(ls, Expr(:(=), N, otN))
-    pushprepreamble!(ls, Expr(:(=), rangename, Expr(:call, :(:), staticexpr(1), N)))
+    pushprepreamble!(
+      ls,
+      Expr(:(=), rangename, Expr(:call, :(:), staticexpr(1), N))
+    )
     MaybeKnown(N, 1024)
   end
   Loop(itersym, l, u, s, rangename, lensym)
@@ -1016,19 +1092,26 @@ end
 
 @inline _reverse(r) = maybestaticlast(r):-static_step(r):maybestaticfirst(r)
 @inline canonicalize_range(r::OptionallyStaticUnitRange) = r
-@inline function canonicalize_range(r::OptionallyStaticRange, ::StaticInt{S}) where {S}
+@inline function canonicalize_range(
+  r::OptionallyStaticRange,
+  ::StaticInt{S}
+) where {S}
   ifelse(ArrayInterface.gt(StaticInt{S}(), Zero()), r, _reverse(r))
 end
-@inline canonicalize_range(r::OptionallyStaticRange, s::Integer) = s > 0 ? r : _reverse(r)
+@inline canonicalize_range(r::OptionallyStaticRange, s::Integer) =
+  s > 0 ? r : _reverse(r)
 @inline canonicalize_range(r::AbstractCloseOpen) = r
-@inline canonicalize_range(r::AbstractUnitRange) = maybestaticfirst(r):maybestaticlast(r)
-@inline canonicalize_range(r::OptionallyStaticRange) = canonicalize_range(r, static_step(r))
+@inline canonicalize_range(r::AbstractUnitRange) =
+  maybestaticfirst(r):maybestaticlast(r)
+@inline canonicalize_range(r::OptionallyStaticRange) =
+  canonicalize_range(r, static_step(r))
 @inline canonicalize_range(r::AbstractRange) =
   canonicalize_range(maybestaticfirst(r):static_step(r):maybestaticlast(r))
 @inline canonicalize_range(r::StepRange{T,T}) where {T<:Base.BitInteger} = r
 @inline canonicalize_range(r::CartesianIndices) =
   CartesianIndices(map(canonicalize_range, r.indices))
-@inline canonicalize_range(r::Base.OneTo{U}) where {U<:Unsigned} = One():(last(r)%Int)
+@inline canonicalize_range(r::Base.OneTo{U}) where {U<:Unsigned} =
+  One():(last(r)%Int)
 
 function canonicalize_range(x)
   throw(
@@ -1049,8 +1132,8 @@ function canonicalize_range(x)
       ...
     end
   ```
-""",
-    ),
+"""
+    )
   )
 end
 
@@ -1058,7 +1141,7 @@ function misc_loop!(
   ls::LoopSet,
   r::Union{Expr,Symbol},
   itersym::Symbol,
-  staticstepone::Bool,
+  staticstepone::Bool
 )::Loop
   rangename = gensym!(ls, "looprange" * string(itersym))
   lenname = gensym!(ls, "looplen" * string(itersym))
@@ -1067,26 +1150,41 @@ function misc_loop!(
     Expr(
       :(=),
       rangename,
-      Expr(:call, lv(:canonicalize_range), :(@inbounds $(makestatic!(r)))),
-    ),
+      Expr(:call, lv(:canonicalize_range), :(@inbounds $(makestatic!(r))))
+    )
   )
   pushprepreamble!(
     ls,
-    Expr(:(=), lenname, Expr(:call, GlobalRef(ArrayInterface, :static_length), rangename)),
+    Expr(
+      :(=),
+      lenname,
+      Expr(:call, GlobalRef(ArrayInterface, :static_length), rangename)
+    )
   )
   L = add_loop_bound!(
     ls,
     itersym,
     Expr(:call, lv(:maybestaticfirst), rangename),
     false,
-    false,
+    false
+  )
+  U = add_loop_bound!(
+    ls,
+    itersym,
+    Expr(:call, lv(:maybestaticlast), rangename),
+    true,
+    false
   )
-  U =
-    add_loop_bound!(ls, itersym, Expr(:call, lv(:maybestaticlast), rangename), true, false)
   if staticstepone
     Loop(itersym, L, U, MaybeKnown(1), rangename, lenname)
   else
-    S = add_loop_bound!(ls, itersym, Expr(:call, lv(:static_step), rangename), false, true)
+    S = add_loop_bound!(
+      ls,
+      itersym,
+      Expr(:call, lv(:static_step), rangename),
+      false,
+      true
+    )
     Loop(itersym, L, U, S, rangename, lenname)
   end
 end
@@ -1114,8 +1212,13 @@ function indices_loop!(ls::LoopSet, r::Expr, itersym::Symbol)::Loop
             Expr(
               :(=),
               axsym,
-              Expr(:call, GlobalRef(ArrayInterface, :axes), a_s, staticexpr(dims::Int)),
-            ),
+              Expr(
+                :call,
+                GlobalRef(ArrayInterface, :axes),
+                a_s,
+                staticexpr(dims::Int)
+              )
+            )
           )
           if n > 1
             axsym_prev = axessyms[n-1]
@@ -1128,9 +1231,13 @@ function indices_loop!(ls::LoopSet, r::Expr, itersym::Symbol)::Loop
                   :call,
                   GlobalRef(Base, :(==)),
                   Expr(:call, GlobalRef(ArrayInterface, :static_first), axsym),
-                  Expr(:call, GlobalRef(ArrayInterface, :static_first), axsym_prev),
-                ),
-              ),
+                  Expr(
+                    :call,
+                    GlobalRef(ArrayInterface, :static_first),
+                    axsym_prev
+                  )
+                )
+              )
             )
             pushprepreamble!(
               ls,
@@ -1141,9 +1248,13 @@ function indices_loop!(ls::LoopSet, r::Expr, itersym::Symbol)::Loop
                   :call,
                   GlobalRef(Base, :(==)),
                   Expr(:call, GlobalRef(ArrayInterface, :static_last), axsym),
-                  Expr(:call, GlobalRef(ArrayInterface, :static_last), axsym_prev),
-                ),
-              ),
+                  Expr(
+                    :call,
+                    GlobalRef(ArrayInterface, :static_last),
+                    axsym_prev
+                  )
+                )
+              )
             )
           end
         end
@@ -1166,8 +1277,13 @@ function indices_loop!(ls::LoopSet, r::Expr, itersym::Symbol)::Loop
             Expr(
               :(=),
               axsym,
-              Expr(:call, GlobalRef(ArrayInterface, :axes), a_s, staticexpr(mdim)),
-            ),
+              Expr(
+                :call,
+                GlobalRef(ArrayInterface, :axes),
+                a_s,
+                staticexpr(mdim)
+              )
+            )
           )
           if n > 1
             axsym_prev = axessyms[n-1]
@@ -1180,9 +1296,13 @@ function indices_loop!(ls::LoopSet, r::Expr, itersym::Symbol)::Loop
                   :call,
                   GlobalRef(Base, :(==)),
                   Expr(:call, GlobalRef(ArrayInterface, :static_first), axsym),
-                  Expr(:call, GlobalRef(ArrayInterface, :static_first), axsym_prev),
-                ),
-              ),
+                  Expr(
+                    :call,
+                    GlobalRef(ArrayInterface, :static_first),
+                    axsym_prev
+                  )
+                )
+              )
             )
             pushprepreamble!(
               ls,
@@ -1193,9 +1313,13 @@ function indices_loop!(ls::LoopSet, r::Expr, itersym::Symbol)::Loop
                   :call,
                   GlobalRef(Base, :(==)),
                   Expr(:call, GlobalRef(ArrayInterface, :static_last), axsym),
-                  Expr(:call, GlobalRef(ArrayInterface, :static_last), axsym_prev),
-                ),
-              ),
+                  Expr(
+                    :call,
+                    GlobalRef(ArrayInterface, :static_last),
+                    axsym_prev
+                  )
+                )
+              )
             )
           end
         end
@@ -1207,8 +1331,8 @@ function indices_loop!(ls::LoopSet, r::Expr, itersym::Symbol)::Loop
 end
 
 """
-  This function creates a loop, while switching from 1 to 0 based indices
-  """
+This function creates a loop, while switching from 1 to 0 based indices
+"""
 function register_single_loop!(ls::LoopSet, looprange::Expr)
   itersym = (looprange.args[1])::Symbol
   r = looprange.args[2]
@@ -1277,7 +1401,9 @@ function instruction!(ls::LoopSet, x::Expr)
   end
   # if x.head ≢ :(->)
   instr = last(x.args).value
-  isa(instr, Symbol) && instr ∈ keys(COST) && return Instruction(:LoopVectorization, instr)
+  isa(instr, Symbol) &&
+    instr ∈ keys(COST) &&
+    return Instruction(:LoopVectorization, instr)
   # end
   instr = gensym!(ls, "f")
   pushprepreamble!(ls, Expr(:(=), instr, x))
@@ -1292,13 +1418,12 @@ function instruction!(ls::LoopSet, f::F) where {F<:Function}
   end
 end
 
-
 function maybe_const_compute!(
   ls::LoopSet,
   LHS::Symbol,
   op::Operation,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
   # return op
   if iscompute(op) && iszero(length(loopdependencies(op)))
@@ -1308,21 +1433,22 @@ function maybe_const_compute!(
       ls.loopsymbols[1:position],
       gensym!(ls, instruction(op).instr),
       elementbytes,
-      :numericconstant,
+      :numericconstant
     )
   else
     # op.dependencies = ls.loopsymbols[1:position]
     op
   end
 end
-strip_op_linenumber_nodes(q::Expr) = only(filter(x -> !isa(x, LineNumberNode), q.args))
+strip_op_linenumber_nodes(q::Expr) =
+  only(filter(x -> !isa(x, LineNumberNode), q.args))
 
 function add_operation!(
   ls::LoopSet,
   LHS::Symbol,
   RHS::Symbol,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
   add_constant!(ls, RHS, ls.loopsymbols[1:position], LHS, elementbytes)
 end
@@ -1331,7 +1457,7 @@ function add_comparison!(
   LHS::Symbol,
   RHS::Expr,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
   Nargs = length(RHS.args)
   @assert (Nargs ≥ 5) & isodd(Nargs)
@@ -1340,34 +1466,41 @@ function add_comparison!(
     gensym!(ls, "leftcmp"),
     RHS.args[1],
     elementbytes,
-    position,
+    position
   )::Operation
   p2 = add_assignment!(
     ls,
     gensym!(ls, "middlecmp"),
     RHS.args[3],
     elementbytes,
-    position,
+    position
   )::Operation
   cmpname = Nargs == 3 ? LHS : gensym!(ls, "cmp")
-  cmp = add_compute!(ls, cmpname, RHS.args[2], Operation[p1, p2], elementbytes)::Operation
+  cmp = add_compute!(
+    ls,
+    cmpname,
+    RHS.args[2],
+    Operation[p1, p2],
+    elementbytes
+  )::Operation
   for i ∈ 5:2:Nargs
     pnew = add_assignment!(
       ls,
       gensym!(ls, "rightcmp"),
       RHS.args[i],
       elementbytes,
-      position,
+      position
     )::Operation
     cmpchain = add_compute!(
       ls,
       gensym!(ls, "cmpchain"),
       RHS.args[i-1],
       Operation[p2, pnew],
-      elementbytes,
+      elementbytes
     )::Operation
     cmpname = Nargs == i ? LHS : gensym!(ls, "cmp")
-    cmp = add_compute!(ls, cmpname, :&, [cmp, cmpchain], elementbytes)::Operation
+    cmp =
+      add_compute!(ls, cmpname, :&, [cmp, cmpchain], elementbytes)::Operation
     p2 = pnew
   end
   return cmp
@@ -1377,7 +1510,7 @@ function add_operation!(
   LHS::Symbol,
   RHS::Expr,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
   if RHS.head === :ref
     add_load_ref!(ls, LHS, RHS, elementbytes)
@@ -1393,12 +1526,15 @@ function add_operation!(
         ls.loopsymbols[1:position],
         LHS,
         elementbytes,
-        :numericconstant,
+        :numericconstant
       )
       if f === :zero
         push!(ls.preamble_zeros, (identifier(op), IntOrFloat))
       else
-        push!(ls.preamble_funcofeltypes, (identifier(op), reduction_zero_class(f)))
+        push!(
+          ls.preamble_funcofeltypes,
+          (identifier(op), reduction_zero_class(f))
+        )
       end
       op
     else
@@ -1408,7 +1544,13 @@ function add_operation!(
   elseif RHS.head === :if
     add_if!(ls, LHS, RHS, elementbytes, position)
   elseif RHS.head === :block
-    add_operation!(ls, LHS, strip_op_linenumber_nodes(RHS), elementbytes, position)
+    add_operation!(
+      ls,
+      LHS,
+      strip_op_linenumber_nodes(RHS),
+      elementbytes,
+      position
+    )
   elseif RHS.head === :(.)
     c = gensym!(ls, "getproperty")
     pushprepreamble!(ls, Expr(:(=), c, RHS))
@@ -1430,12 +1572,17 @@ function add_operation!(
   RHS::Expr,
   LHS_ref::ArrayReferenceMetaPosition,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
   if RHS.head === :ref# || (RHS.head === :call && first(RHS.args) === :getindex)
     array, rawindices = ref_from_expr!(ls, RHS)
-    RHS_ref =
-      array_reference_meta!(ls, array, rawindices, elementbytes, gensym!(ls, LHS_sym))
+    RHS_ref = array_reference_meta!(
+      ls,
+      array,
+      rawindices,
+      elementbytes,
+      gensym!(ls, LHS_sym)
+    )
     op = add_load!(ls, RHS_ref, elementbytes)
     add_compute!(ls, LHS_sym, :identity, [op], elementbytes)
     # pushfirst!(LHS_ref.parents, iop)
@@ -1451,13 +1598,16 @@ function add_operation!(
         ls.loopsymbols[1:position],
         LHS_sym,
         elementbytes,
-        :numericconstant,
+        :numericconstant
       )
       # op = add_constant!(ls, c, Symbol[], LHS_sym, elementbytes, :numericconstant)
       if f === :zero
         push!(ls.preamble_zeros, (identifier(op), IntOrFloat))
       else
-        push!(ls.preamble_funcofeltypes, (identifier(op), reduction_zero_class(f)))
+        push!(
+          ls.preamble_funcofeltypes,
+          (identifier(op), reduction_zero_class(f))
+        )
       end
       op
     else
@@ -1466,7 +1616,13 @@ function add_operation!(
   elseif RHS.head === :if
     add_if!(ls, LHS_sym, RHS, elementbytes, position, LHS_ref)
   elseif RHS.head === :block
-    add_operation!(ls, LHS_sym, strip_op_linenumber_nodes(RHS), elementbytes, position)
+    add_operation!(
+      ls,
+      LHS_sym,
+      strip_op_linenumber_nodes(RHS),
+      elementbytes,
+      position
+    )
   elseif RHS.head === :(.)
     c = gensym!(ls, "getproperty")
     pushpreamble!(ls, Expr(:(=), c, RHS))
@@ -1487,7 +1643,7 @@ function prepare_rhs_for_storage!(
   array,
   rawindices,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )::Operation
   RHS isa Symbol && return add_store!(ls, RHS, array, rawindices, elementbytes)
   mpref = array_reference_meta!(ls, array, rawindices, elementbytes)
@@ -1498,12 +1654,19 @@ function prepare_rhs_for_storage!(
   mpref.parents = cachedparents
   op = add_store!(ls, mpref, elementbytes)
   if lrhs ∈ keys(ls.opdict)
-    ls.syms_aliasing_refs[findfirst(==(mpref.mref), ls.refs_aliasing_syms)] = lrhs
+    ls.syms_aliasing_refs[findfirst(==(mpref.mref), ls.refs_aliasing_syms)] =
+      lrhs
   end
   return op
 end
 
-function unpack_tuple!(ls::LoopSet, LHS::Expr, RHS, elementbytes::Int, position::Int)
+function unpack_tuple!(
+  ls::LoopSet,
+  LHS::Expr,
+  RHS,
+  elementbytes::Int,
+  position::Int
+)
   if Meta.isexpr(RHS, :tuple)
     for i ∈ eachindex(LHS.args)
       add_assignment!(ls, LHS.args[i], RHS.args[i], elementbytes, position)
@@ -1517,7 +1680,7 @@ function unpack_tuple!(ls::LoopSet, LHS::Expr, RHS, elementbytes::Int, position:
     lhstemp,
     add_operation!(ls, lhstemp, RHS, elementbytes, position),
     elementbytes,
-    position,
+    position
   )]
   unpack_tuple!(ls, LHS, vparents, elementbytes, position)
 end
@@ -1527,7 +1690,7 @@ function unpack_tuple!(
   LHS::Expr,
   vparents::Vector{Operation},
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
   for i ∈ eachindex(LHS.args)
     f = EXTRACTFUNS[i]
@@ -1551,14 +1714,20 @@ function unpack_tuple!(
     throw(
       LoopError(
         "Unpacking the above expression in the left hand side was not understood/supported.",
-        lhsi,
-      ),
+        lhsi
+      )
     )
   end
   first(vparents)
 end
 
-function add_assignment!(ls::LoopSet, LHS, RHS, elementbytes::Int, position::Int)
+function add_assignment!(
+  ls::LoopSet,
+  LHS,
+  RHS,
+  elementbytes::Int,
+  position::Int
+)
   if LHS isa Symbol
     if RHS isa Expr
       maybe_const_compute!(
@@ -1566,7 +1735,7 @@ function add_assignment!(ls::LoopSet, LHS, RHS, elementbytes::Int, position::Int
         LHS,
         add_operation!(ls, LHS, RHS, elementbytes, position),
         elementbytes,
-        position,
+        position
       )
     else
       add_constant!(ls, RHS, ls.loopsymbols[1:position], LHS, elementbytes)
@@ -1579,7 +1748,14 @@ function add_assignment!(ls::LoopSet, LHS, RHS, elementbytes::Int, position::Int
         # need to check if LHS appears in RHS
         # assign RHS to lrhs
         array, rawindices = ref_from_expr!(ls, LHS)
-        prepare_rhs_for_storage!(ls, RHS, array, rawindices, elementbytes, position)
+        prepare_rhs_for_storage!(
+          ls,
+          RHS,
+          array,
+          rawindices,
+          elementbytes,
+          position
+        )
       else
         add_store_ref!(ls, RHS, LHS, elementbytes)  # is this necessary? (Extension API?)
       end
@@ -1589,8 +1765,8 @@ function add_assignment!(ls::LoopSet, LHS, RHS, elementbytes::Int, position::Int
       throw(
         LoopError(
           "LHS not understood; only `:ref`s and `:tuple`s are currently supported.",
-          LHS,
-        ),
+          LHS
+        )
       )
     end
   else
@@ -1603,7 +1779,7 @@ function push_op!(
   ex::Expr,
   elementbytes::Int,
   position::Int,
-  mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing,
+  mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing
 )::Operation
   if ex.head === :call
     finex = first(ex.args)::Symbol
@@ -1615,7 +1791,7 @@ function push_op!(
         array,
         rawindices,
         elementbytes,
-        position,
+        position
       )
     else
       throw(LoopError("Don't know how to handle expression.", finex))
@@ -1655,7 +1831,7 @@ function Base.push!(
   ex::Expr,
   elementbytes::Int,
   position::Int,
-  mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing,
+  mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing
 )
   if ex.head === :block
     add_block!(ls, ex, elementbytes, position)
@@ -1672,7 +1848,7 @@ function UnrollSpecification(
   u₂loop::Symbol,
   vloopsym::Symbol,
   u₁,
-  u₂,
+  u₂
 )
   order = names(ls)
   nu₁ = findfirst(Base.Fix2(===, u₁loop), order)::Int
@@ -1682,10 +1858,10 @@ function UnrollSpecification(
 end
 
 """
-    looplengthprod(ls::LoopSet)
+looplengthprod(ls::LoopSet)
 
-  Convert to `Float64` for the sake of non-64 bit platforms.
-  """
+Convert to `Float64` for the sake of non-64 bit platforms.
+"""
 function looplengthprod(ls::LoopSet)
   l = 1.0
   for loop ∈ ls.loops
@@ -1695,7 +1871,6 @@ function looplengthprod(ls::LoopSet)
 end
 # prod(Float64 ∘ length, ls.loops)
 
-
 function looplength(ls::LoopSet, s::Symbol)
   # search_tree(parents(operations(ls)[i]), name(op)) && return true
   id = getloopid_or_nothing(ls, s)
@@ -1722,9 +1897,13 @@ function looplength(ls::LoopSet, s::Symbol)
   end
 end
 
-function accept_reorder_according_to_tracked_reductions(ls::LoopSet, reordered::Symbol)
+function accept_reorder_according_to_tracked_reductions(
+  ls::LoopSet,
+  reordered::Symbol
+)
   for op ∈ operations(ls)
-    if (reordered ∈ loopdependencies(op)) && !(iscompute(op) & iszero(length(children(op))))
+    if (reordered ∈ loopdependencies(op)) &&
+       !(iscompute(op) & iszero(length(children(op))))
       for opp ∈ parents(op)
         (iscompute(opp) && isanouterreduction(ls, opp)) && return 0x00
       end
@@ -1762,7 +1941,7 @@ function check_valid_reorder_dims!(ls::LoopSet)
           firstoff = opiref.offsets[l]
           maxdiff = max(
             checkmismatch(ops, opidsᵢ, l, firstoff, 2:length(opidsᵢ)),
-            checkmismatch(ops, opidsⱼ, l, firstoff, 1:length(opidsⱼ)),
+            checkmismatch(ops, opidsⱼ, l, firstoff, 1:length(opidsⱼ))
           )
           if maxdiff ≥ (isknown(step(loopk)) ? abs(gethint(step(loopk))) : 1)
             validreorder[k] = 0x00#0x01
@@ -1782,7 +1961,7 @@ function checkmismatch(
   opids::Vector{Int},
   l::Int,
   firstoff::Int8,
-  checkrange::UnitRange{Int},
+  checkrange::UnitRange{Int}
 )
   maxabsdiff = 0
   for m ∈ checkrange
@@ -1798,7 +1977,8 @@ function fill_offset_memop_collection!(ls::LoopSet)
   omop = offsetloadcollection(ls)
   ops = operations(ls)
   num_ops = length(ops)
-  @unpack opids, opidcollectionmap, batchedcollections, batchedcollectionmap = omop
+  @unpack opids, opidcollectionmap, batchedcollections, batchedcollectionmap =
+    omop
   length(opidcollectionmap) == 0 || return
   resize!(opidcollectionmap, num_ops)
   fill!(opidcollectionmap, (0, 0))
@@ -1871,7 +2051,7 @@ function fill_offset_memop_collection!(ls::LoopSet)
       collen = length(collectionⱼ)
       collen ≤ 1 && continue
       # we have multiple, easiest to process if we sort them
-      sort!(collectionⱼ, by = last)
+      sort!(collectionⱼ; by = last)
       istart = 1
       ostart = last(first(collectionⱼ))
       oprev = ostart
@@ -1890,7 +2070,7 @@ function fill_offset_memop_collection!(ls::LoopSet)
             ops,
             collectionⱼ,
             istart,
-            i - 1,
+            i - 1
           )
         end
         # restart istart and ostart
@@ -1906,7 +2086,7 @@ function fill_offset_memop_collection!(ls::LoopSet)
           ops,
           collectionⱼ,
           istart,
-          collen,
+          collen
         )
       end
     end
@@ -1921,7 +2101,7 @@ function pushbatchedcollection!(
   ops,
   collectionⱼ,
   istart,
-  istop,
+  istop
 )
   colview = view(collectionⱼ, istart:istop)
   push!(batchedcollections, colview)
@@ -1933,14 +2113,15 @@ function pushbatchedcollection!(
 end
 
 """
-  Returns `0` if the op is the declaration of the constant outerreduction variable.
-  Returns `n`, where `n` is the constant declarations's index among parents(op), if op is an outter reduction.
-  Returns `-1` if not an outerreduction.
-  """
+Returns `0` if the op is the declaration of the constant outerreduction variable.
+Returns `n`, where `n` is the constant declarations's index among parents(op), if op is an outter reduction.
+Returns `-1` if not an outerreduction.
+"""
 function isouterreduction(ls::LoopSet, op::Operation)
   if isconstant(op) # equivalent to checking if length(loopdependencies(op)) == 0
     instr = op.instruction
-    instr == LOOPCONSTANT && return Core.ifelse(length(loopdependencies(op)) == 0, 0, -1)
+    instr == LOOPCONSTANT &&
+      return Core.ifelse(length(loopdependencies(op)) == 0, 0, -1)
     instr.mod === GLOBALCONSTANT && return -1
     ops = operations(ls)
     for or ∈ ls.outer_reductions
diff --git a/src/modeling/operations.jl b/src/modeling/operations.jl
index 36b6c8ed0..51b5b9509 100644
--- a/src/modeling/operations.jl
+++ b/src/modeling/operations.jl
@@ -1,10 +1,9 @@
 const DISCONTIGUOUS = Symbol("##DISCONTIGUOUSSUBARRAY##")
 const CONSTANTZEROINDEX = Symbol("##CONSTANTZEROINDEX##")
-const LOOPCONSTANT = Instruction(:LoopVectorization, Symbol("LOOPCONSTANTINSTRUCTION"))
+const LOOPCONSTANT =
+  Instruction(:LoopVectorization, Symbol("LOOPCONSTANTINSTRUCTION"))
 const GLOBALCONSTANT = Symbol("##GLOBAL##CONSTANT##")
 
-
-
 """
     ArrayReference
 
@@ -75,15 +74,27 @@ struct OffsetLoadCollection
   # offsets::Vector{Vector{Vector{Int8}}}
   opidcollectionmap::Vector{Tuple{Int,Int}}
   batchedcollections::Vector{
-    SubArray{Tuple{Int,Int},1,Vector{Tuple{Int,Int}},Tuple{UnitRange{Int}},true},
+    SubArray{
+      Tuple{Int,Int},
+      1,
+      Vector{Tuple{Int,Int}},
+      Tuple{UnitRange{Int}},
+      true
+    }
   }
   batchedcollectionmap::Vector{Tuple{Int,Int}}
   function OffsetLoadCollection()
     new(
       Vector{Int}[],
       Tuple{Int,Int}[],
-      SubArray{Tuple{Int,Int},1,Vector{Tuple{Int,Int}},Tuple{UnitRange{Int}},true}[],
-      Tuple{Int,Int}[],
+      SubArray{
+        Tuple{Int,Int},
+        1,
+        Vector{Tuple{Int,Int}},
+        Tuple{UnitRange{Int}},
+        true
+      }[],
+      Tuple{Int,Int}[]
     )
   end
 end
@@ -126,13 +137,21 @@ abstract type AbstractLoopOperation end
   memstore
   loopvalue
 end
-"An operation setting a variable to a constant value (e.g., `a = 0.0`)"
+"""
+An operation setting a variable to a constant value (e.g., `a = 0.0`)
+"""
 constant
-"An operation setting a variable from a memory location (e.g., `a = A[i,j]`)"
+"""
+An operation setting a variable from a memory location (e.g., `a = A[i,j]`)
+"""
 memload
-"An operation computing a new value from one or more variables (e.g., `a = b + c`)"
+"""
+An operation computing a new value from one or more variables (e.g., `a = b + c`)
+"""
 compute
-"An operation storing a value to a memory location (e.g., `A[i,j] = a`)"
+"""
+An operation storing a value to a memory location (e.g., `A[i,j] = a`)
+"""
 memstore
 """
 `loopvalue` indicates an loop variable (`i` in `for i in ...`). These are the "parents" of `compute`
@@ -223,7 +242,7 @@ mutable struct Operation <: AbstractLoopOperation
     reduced_deps::Vector{Symbol} = Symbol[],
     parents::Vector{Operation} = Operation[],
     ref::ArrayReferenceMeta = NOTAREFERENCE,
-    reduced_children::Vector{Symbol} = Symbol[],
+    reduced_children::Vector{Symbol} = Symbol[]
   )
     new(
       identifier,
@@ -237,7 +256,7 @@ mutable struct Operation <: AbstractLoopOperation
       Operation[],
       ref,
       Symbol("##", variable, :_),
-      reduced_children,
+      reduced_children
     )
   end
 end
@@ -310,7 +329,10 @@ function Base.show(io::IO, op::Operation)
   elseif isload(op)
     print(io, Expr(:(=), op.variable, ref_for_print(op)))
   elseif iscompute(op)
-    print(io, Expr(:(=), op.variable, callexpr(op.instruction, map(name, parents(op)))))
+    print(
+      io,
+      Expr(:(=), op.variable, callexpr(op.instruction, map(name, parents(op))))
+    )
   elseif isstore(op)
     print(io, Expr(:(=), ref_for_print(op), name(first(parents(op)))))
   elseif isloopvalue(op)
@@ -349,7 +371,8 @@ name(x::ArrayReference) = x.array
 name(x::ArrayReferenceMeta) = x.ref.array
 name(op::Operation) = op.variable
 instruction(op::Operation) = op.instruction
-isreductcombineinstr(op::Operation) = iscompute(op) && isreductcombineinstr(instruction(op))
+isreductcombineinstr(op::Operation) =
+  iscompute(op) && isreductcombineinstr(instruction(op))
 """
     mvar = mangledvar(op)
 
@@ -357,12 +380,12 @@ Returns the mangled variable name, for use in the produced expressions.
 These names will be further processed if op is tiled and/or unrolled.
 
 ```julia
-    if tiled ∈ loopdependencies(op) # `suffix` is tilenumber
-        mvar = Symbol(op, suffix, :_)
-    end
-    if unrolled ∈ loopdependencies(op) # `u` is unroll number
-        mvar = Symbol(op, u)
-    end
+if tiled ∈ loopdependencies(op) # `suffix` is tilenumber
+  mvar = Symbol(op, suffix, :_)
+end
+if unrolled ∈ loopdependencies(op) # `u` is unroll number
+  mvar = Symbol(op, u)
+end
 ```
 """
 mangledvar(op::Operation) = op.mangledvariable
@@ -383,7 +406,7 @@ function Operation(
   elementbytes::Int,
   instr,
   optype::OperationType,
-  mpref::ArrayReferenceMetaPosition,
+  mpref::ArrayReferenceMetaPosition
 )
   Operation(
     id,
@@ -394,16 +417,22 @@ function Operation(
     mpref.loopdependencies,
     mpref.reduceddeps,
     mpref.parents,
-    mpref.mref,
+    mpref.mref
   )
 end
-Base.:(==)(x::ArrayReferenceMetaPosition, y::ArrayReferenceMetaPosition) = x.mref == y.mref
+Base.:(==)(x::ArrayReferenceMetaPosition, y::ArrayReferenceMetaPosition) =
+  x.mref == y.mref
 parents(op::ArrayReferenceMetaPosition) = op.parents
 # Avoid memory allocations by using this for ops that aren't references
 const NOTAREFERENCE =
   ArrayReferenceMeta(ArrayReference(Symbol(""), Symbol[]), Bool[], Symbol(""))
-const NOTAREFERENCEMP =
-  ArrayReferenceMetaPosition(NOTAREFERENCE, NOPARENTS, Symbol[], Symbol[], Symbol(""))
+const NOTAREFERENCEMP = ArrayReferenceMetaPosition(
+  NOTAREFERENCE,
+  NOPARENTS,
+  Symbol[],
+  Symbol[],
+  Symbol("")
+)
 varname(::Nothing) = nothing
 varname(mpref::ArrayReferenceMetaPosition) = mpref.varname
 name(mpref::ArrayReferenceMetaPosition) = name(mpref.mref.ref)
@@ -480,7 +509,8 @@ function ifelse_reduce_fun_expr(f::Symbol, op::Operation)
   lvcmp_instr = lv(instruction(cmp).instr)
   if success
     lvf = lv(f)
-    return not ? Expr(:call, lvf, :($(!) ∘ $lvcmp_instr)) : Expr(:call, lvf, lvcmp_instr)
+    return not ? Expr(:call, lvf, :($(!) ∘ $lvcmp_instr)) :
+           Expr(:call, lvf, lvcmp_instr)
   end
   options = children(cmp)
   for oop ∈ options
@@ -488,7 +518,9 @@ function ifelse_reduce_fun_expr(f::Symbol, op::Operation)
     _cmp, _cmpa, _cmpb, _not, _success = find_cmp_args_from_ifelse(oop)
     _success || continue
     lvf = lv(Symbol(f, :Mirror))
-    expr = not ? Expr(:call, lvf, :($(!) ∘ $lvcmp_instr)) : Expr(:call, lvf, lvcmp_instr)
+    expr =
+      not ? Expr(:call, lvf, :($(!) ∘ $lvcmp_instr)) :
+      Expr(:call, lvf, lvcmp_instr)
     push!(expr.args, name(_cmpa), name(_cmpb))
     return expr
   end
@@ -500,7 +532,8 @@ function ifelse_reduction(f::F, rsym::Symbol, op::Operation) where {F}
   lvcmp_instr = lv(instruction(cmp).instr)
   if success
     lvf = lv(rsym)
-    return not ? Expr(:call, lvf, :($(!) ∘ $lvcmp_instr)) : Expr(:call, lvf, lvcmp_instr)
+    return not ? Expr(:call, lvf, :($(!) ∘ $lvcmp_instr)) :
+           Expr(:call, lvf, lvcmp_instr)
   end
   options = children(cmp)
   for oop ∈ options
@@ -523,17 +556,22 @@ end
 #   end
 # end
 # No `@eval` to make the language server happy
-reduction_scalar_combine(x) = reduction_scalar_combine(reduction_instruction_class(x))
+reduction_scalar_combine(x) =
+  reduction_scalar_combine(reduction_instruction_class(x))
 reduction_scalar_combine(op::Operation)::GlobalRef =
   lv(reduction_scalar_combine(instruction(op)))
 reduction_to_scalar(x) = reduction_to_scalar(reduction_instruction_class(x))
-reduction_to_scalar(op::Operation)::GlobalRef = lv(reduction_to_scalar(instruction(op)))
-reduce_number_of_vectors(x) = reduce_number_of_vectors(reduction_instruction_class(x))
+reduction_to_scalar(op::Operation)::GlobalRef =
+  lv(reduction_to_scalar(instruction(op)))
+reduce_number_of_vectors(x) =
+  reduce_number_of_vectors(reduction_instruction_class(x))
 reduce_number_of_vectors(op::Operation)::GlobalRef =
   lv(reduce_number_of_vectors(instruction(op)))
-reduce_to_onevecunroll(x) = reduce_to_onevecunroll(reduction_instruction_class(x))
+reduce_to_onevecunroll(x) =
+  reduce_to_onevecunroll(reduction_instruction_class(x))
 reduce_to_onevecunroll(op::Operation)::GlobalRef =
   lv(reduce_to_onevecunroll(instruction(op)))
-reduction_to_single_vector(x) = reduction_to_single_vector(reduction_instruction_class(x))
+reduction_to_single_vector(x) =
+  reduction_to_single_vector(reduction_instruction_class(x))
 reduction_to_single_vector(op::Operation)::GlobalRef =
   lv(reduction_to_single_vector(instruction(op)))
diff --git a/src/parse/add_compute.jl b/src/parse/add_compute.jl
index 2de835be8..ee1c7651e 100644
--- a/src/parse/add_compute.jl
+++ b/src/parse/add_compute.jl
@@ -14,7 +14,7 @@ end
 function mergesetdiffv!(
   s1::AbstractVector{T},
   s2::AbstractVector{T},
-  s3::AbstractVector{T},
+  s3::AbstractVector{T}
 ) where {T}
   for s ∈ s2
     s ∉ s3 && addsetv!(s1, s)
@@ -25,7 +25,7 @@ end
 function setdiffv!(
   s3::AbstractVector{T},
   s1::AbstractVector{T},
-  s2::AbstractVector{T},
+  s2::AbstractVector{T}
 ) where {T}
   for s ∈ s1
     (s ∈ s2) || (s ∉ s3 && push!(s3, s))
@@ -35,13 +35,17 @@ function setdiffv!(
   s4::AbstractVector{T},
   s3::AbstractVector{T},
   s1::AbstractVector{T},
-  s2::AbstractVector{T},
+  s2::AbstractVector{T}
 ) where {T}
   for s ∈ s1
     (s ∈ s2) ? (s ∉ s4 && push!(s4, s)) : (s ∉ s3 && push!(s3, s))
   end
 end
-function update_deps!(deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, parent::Operation)
+function update_deps!(
+  deps::Vector{Symbol},
+  reduceddeps::Vector{Symbol},
+  parent::Operation
+)
   mergesetv!(deps, loopdependencies(parent))#, reduceddependencies(parent))
   if !(isload(parent) || isconstant(parent)) #&& !isreductcombineinstr(parent)
     mergesetv!(reduceddeps, reduceddependencies(parent))
@@ -53,7 +57,7 @@ function pushparent!(
   parents::Vector{Operation},
   deps::Vector{Symbol},
   reduceddeps::Vector{Symbol},
-  parent::Operation,
+  parent::Operation
 )
   @assert parents !== NOPARENTS
   push!(parents, parent)
@@ -69,7 +73,7 @@ function add_parent!(
   ls::LoopSet,
   var,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
   parent = if var isa Symbol
     # if var === :kern_1_1
@@ -112,13 +116,17 @@ function search_tree(opv::Vector{Operation}, var::Symbol) # relies on cycles bei
   false
 end
 
-search_tree_for_ref(ls::LoopSet, opv::Vector{Operation}, ::Nothing, var::Symbol) =
-  var, false
+search_tree_for_ref(
+  ls::LoopSet,
+  opv::Vector{Operation},
+  ::Nothing,
+  var::Symbol
+) = var, false
 function search_tree_for_ref(
   ls::LoopSet,
   opv::Vector{Operation},
   mpref::ArrayReferenceMetaPosition,
-  var::Symbol,
+  var::Symbol
 ) # relies on cycles being forbidden
   for opp ∈ opv
     if opp.ref == mpref.mref
@@ -143,7 +151,7 @@ end
 function update_reduction_status!(
   parentvec::Vector{Operation},
   deps::Vector{Symbol},
-  parent::Symbol,
+  parent::Symbol
 )
   for opp ∈ parentvec
     if name(opp) === parent
@@ -193,7 +201,7 @@ function substitute_op_in_parents!(
   replacer::Operation,
   replacee::Operation,
   reduceddeps::Vector{Symbol},
-  reductsym::Symbol,
+  reductsym::Symbol
 )
   found = false
   for i ∈ eachindex(vparents)
@@ -202,8 +210,13 @@ function substitute_op_in_parents!(
       vparents[i] = replacer
       found = true
     else
-      fopp =
-        substitute_op_in_parents!(parents(opp), replacer, replacee, reduceddeps, reductsym)
+      fopp = substitute_op_in_parents!(
+        parents(opp),
+        replacer,
+        replacee,
+        reduceddeps,
+        reductsym
+      )
       if fopp
         add_reduced_deps!(opp, reduceddeps)
         # FIXME: https://github.com/JuliaSIMD/LoopVectorization.jl/issues/259
@@ -217,7 +230,6 @@ function substitute_op_in_parents!(
   found
 end
 
-
 function add_reduction_update_parent!(
   vparents::Vector{Operation},
   deps::Vector{Symbol},
@@ -226,7 +238,7 @@ function add_reduction_update_parent!(
   parent::Operation,
   instr::Instruction,
   reduction_ind::Int,
-  elementbytes::Int,
+  elementbytes::Int
 )
   var = name(parent)
   # isouterreduction = iszero(length(loopdependencies(parent))) && (parent.instruction === LOOPCONSTANT)
@@ -257,7 +269,7 @@ function add_reduction_update_parent!(
       loopdependencies(parent),
       reductsym,
       elementbytes,
-      :numericconstant,
+      :numericconstant
     )
     if reduct_zero === :zero
       push!(ls.preamble_zeros, (identifier(reductinit), IntOrFloat))
@@ -278,7 +290,13 @@ function add_reduction_update_parent!(
       update_deps!(deps, reduceddeps, reductinit)#parent) # deps and reduced deps will not be disjoint
     end
   elseif !isouterreduction && reductinit !== parent
-    substitute_op_in_parents!(vparents, reductinit, parent, reduceddeps, reductsym)
+    substitute_op_in_parents!(
+      vparents,
+      reductinit,
+      parent,
+      reduceddeps,
+      reductsym
+    )
   end
   update_reduction_status!(vparents, reduceddeps, name(reductinit))
   # this is the op added by add_compute
@@ -290,7 +308,7 @@ function add_reduction_update_parent!(
     compute,
     deps,
     reduceddeps,
-    vparents,
+    vparents
   )
   isouterreduction && push!(ls.outer_reductions, identifier(op))
   opout = pushop!(ls, op, var) # note this overwrites the entry in the operations dict, but not the vector
@@ -314,7 +332,7 @@ function add_reduction_update_parent!(
     compute,
     childdeps,
     childrdeps,
-    childparents,
+    childparents
   )
   # child = Operation(
   #     length(operations(ls)), name(parent), elementbytes, Instruction(reductcombine,:identity), compute, childdeps, childrdeps, childparents
@@ -332,7 +350,13 @@ function substitute!(ex::Expr, d::Dict{Symbol,Symbol})
     end
   end
 end
-function argsymbol(ls::LoopSet, arg, mpref, elementbytes::Int, position::Int)::Symbol
+function argsymbol(
+  ls::LoopSet,
+  arg,
+  mpref,
+  elementbytes::Int,
+  position::Int
+)::Symbol
   argsym = gensym!(ls, "anonarg")
   if mpref === nothing
     add_operation!(ls, argsym, arg, elementbytes, position)
@@ -348,7 +372,7 @@ function add_anon_func!(
   ex::Expr,
   position::Int,
   mpref::Union{Nothing,ArrayReferenceMetaPosition},
-  elementbytes::Int,
+  elementbytes::Int
 )::Operation
   d = Dict{Symbol,Symbol}()
   anonargs = f.args[1]
@@ -387,7 +411,7 @@ function add_anon_func!(
       LHS,
       instruction(:identity),
       Operation[getop(ls, lastline)],
-      elementbytes,
+      elementbytes
     )
   elseif Meta.isexpr(lastline, :call)
     add_compute!(ls, LHS, lastline, elementbytes, position, mpref)
@@ -414,7 +438,7 @@ function maybe_fix_reduced_deps!(
   reduceddeps::Vector{Symbol},
   parent::Operation,
   mpref::ArrayReferenceMetaPosition,
-  position::Int,
+  position::Int
 )
   loopdeps_parent = loopdependencies(parent)
   reduceddeps_parent = reduceddependencies(parent)
@@ -453,7 +477,7 @@ function add_compute!(
   ex::Expr,
   elementbytes::Int,
   position::Int,
-  mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing,
+  mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing
 )::Operation
   @assert ex.head === :call
   fexpr = first(ex.args)
@@ -466,7 +490,14 @@ function add_compute!(
     arg1 = args[1]
     arg2 = args[2]
     if arg1 isa Number && convert(Float64, arg1) === -1.0
-      return add_compute!(ls, var, :(2iseven($arg2) - 1), elementbytes, position, mpref)
+      return add_compute!(
+        ls,
+        var,
+        :(2iseven($arg2) - 1),
+        elementbytes,
+        position,
+        mpref
+      )
     end
     if arg2 isa Number
       return add_pow!(ls, var, args[1], arg2, elementbytes, position)
@@ -491,18 +522,40 @@ function add_compute!(
         if mpref == argref
           if varname(mpref) === var
             id = findfirst(==(mpref.mref), ls.refs_aliasing_syms)
-            mpref.varname = var = id === nothing ? var : ls.syms_aliasing_refs[id]
+            mpref.varname =
+              var = id === nothing ? var : ls.syms_aliasing_refs[id]
             reduction_ind = ind
-            mergesetv!(deps, loopdependencies(add_load!(ls, argref, elementbytes)))
+            mergesetv!(
+              deps,
+              loopdependencies(add_load!(ls, argref, elementbytes))
+            )
           else
-            pushparent!(vparents, deps, reduceddeps, add_load!(ls, argref, elementbytes))
+            pushparent!(
+              vparents,
+              deps,
+              reduceddeps,
+              add_load!(ls, argref, elementbytes)
+            )
           end
         else
           argref.varname = gensym!(ls, "tempload")
-          pushparent!(vparents, deps, reduceddeps, add_load!(ls, argref, elementbytes))
+          pushparent!(
+            vparents,
+            deps,
+            reduceddeps,
+            add_load!(ls, argref, elementbytes)
+          )
         end
       else
-        add_parent!(vparents, deps, reduceddeps, ls, arg, elementbytes, position)
+        add_parent!(
+          vparents,
+          deps,
+          reduceddeps,
+          ls,
+          arg,
+          elementbytes,
+          position
+        )
       end
     elseif arg ∈ ls.loopsymbols
       loopsymop = add_loopvalue!(ls, arg, elementbytes)
@@ -533,8 +586,14 @@ function add_compute!(
      ) &&
      isone(length(vparents)) &&
      (position == length(loopdependencies(only(vparents))))
-    deps, reduceddeps =
-      maybe_fix_reduced_deps!(ls, deps, reduceddeps, only(vparents), mpref, position)
+    deps, reduceddeps = maybe_fix_reduced_deps!(
+      ls,
+      deps,
+      reduceddeps,
+      only(vparents),
+      mpref,
+      position
+    )
   end
   # @show reduction, search_tree(vparents, var) ex var vparents mpref get(ls.opdict, var, nothing) search_tree_for_ref(ls, vparents, mpref, var) # relies on cycles being forbidden
   if reduction || search_tree(vparents, var)
@@ -546,7 +605,7 @@ function add_compute!(
       vparents,
       reduction_ind,
       elementbytes,
-      instr,
+      instr
     )
   else
     if mpref ≢ nothing && (
@@ -562,7 +621,7 @@ function add_compute!(
         vparents,
         reduction_ind,
         elementbytes,
-        instr,
+        instr
       )
     end
     op = Operation(
@@ -573,7 +632,7 @@ function add_compute!(
       compute,
       deps,
       reduceddeps,
-      vparents,
+      vparents
     )
     return pushop!(ls, op, var)
   end
@@ -587,7 +646,7 @@ function add_reduction!(
   vparents,
   reduction_ind,
   elementbytes,
-  instr,
+  instr
 )
   parent = ls.opdict[var]
   setdiffv!(reduceddeps, deps, loopdependencies(parent))
@@ -606,7 +665,7 @@ function add_reduction!(
       compute,
       deps,
       reduceddeps,
-      vparents,
+      vparents
     )
     pushop!(ls, op, var)
   else
@@ -618,7 +677,7 @@ function add_reduction!(
       parent,
       instr,
       reduction_ind,
-      elementbytes,
+      elementbytes
     )
   end
 end
@@ -628,7 +687,7 @@ function add_compute!(
   LHS::Symbol,
   instr,
   vparents::Vector{Operation},
-  elementbytes::Int,
+  elementbytes::Int
 )
   deps = Symbol[]
   reduceddeps = Symbol[]
@@ -643,7 +702,7 @@ function add_compute!(
     compute,
     deps,
     reduceddeps,
-    vparents,
+    vparents
   )
   pushop!(ls, op, LHS)
 end
@@ -654,7 +713,7 @@ function add_compute_ifelse!(
   cond::Operation,
   iftrue::Operation,
   iffalse::Operation,
-  elementbytes::Int,
+  elementbytes::Int
 )
   deps = Symbol[]
   reduceddeps = Symbol[]
@@ -676,7 +735,7 @@ function add_compute_ifelse!(
         iftrue,
         Instruction(:LoopVectorization, :ifelse),
         2,
-        elementbytes,
+        elementbytes
       )
     end
   elseif name(iffalse) === LHS
@@ -691,7 +750,7 @@ function add_compute_ifelse!(
         iffalse,
         Instruction(:LoopVectorization, :ifelse),
         3,
-        elementbytes,
+        elementbytes
       )
     end
   end
@@ -704,10 +763,9 @@ function add_compute_ifelse!(
     compute,
     deps,
     reduceddeps,
-    vparents,
+    vparents
   )
   pushop!(ls, op, LHS)
-
 end
 
 # adds x ^ (p::Real)
@@ -717,7 +775,7 @@ function add_pow!(
   @nospecialize(x),
   p::Real,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
   xop::Operation = if x isa Expr
     add_operation!(
@@ -725,7 +783,7 @@ function add_pow!(
       Symbol("###xpow###$(length(operations(ls)))###"),
       x,
       elementbytes,
-      position,
+      position
     )
   elseif x isa Symbol
     if x ∈ ls.loopsymbols
@@ -796,9 +854,12 @@ function add_pow!(
       constant,
       NODEPENDENCY,
       Symbol[],
-      NOPARENTS,
+      NOPARENTS
+    )
+    push!(
+      ls.preamble_funcofeltypes,
+      (identifier(op), MULTIPLICATIVE_IN_REDUCTIONS)
     )
-    push!(ls.preamble_funcofeltypes, (identifier(op), MULTIPLICATIVE_IN_REDUCTIONS))
     return pushop!(ls, op)
   elseif pint == 1#requires `pden ≠ 1`.
     return add_compute!(ls, var, :identity, [xop], elementbytes)
@@ -817,14 +878,15 @@ function add_pow!(
     t = trailing_zeros(pint) + 1
     pint >>= t
     while (t -= 1) >= 0
-      xop = add_compute!(ls, gensym!(ls, "pbs"), :abs2_fast, [xop], elementbytes)
+      xop =
+        add_compute!(ls, gensym!(ls, "pbs"), :abs2_fast, [xop], elementbytes)
     end
     yop = add_compute!(
       ls,
       iszero(pint) ? var : gensym!(ls, "pbs"),
       :mul_fast,
       [xop, yop],
-      elementbytes,
+      elementbytes
     )
   end
   yop
diff --git a/src/parse/add_constants.jl b/src/parse/add_constants.jl
index 3fafaf693..f818536ea 100644
--- a/src/parse/add_constants.jl
+++ b/src/parse/add_constants.jl
@@ -9,7 +9,7 @@ const CONSTANT_SYMBOLS = (
   :Int32,
   :UInt32,
   :Int64,
-  :UInt64,
+  :UInt64
 )
 function add_constant!(ls::LoopSet, var::Symbol, elementbytes::Int)
   var ∈ ls.loopsymbols && return add_loopvalue!(ls, var, elementbytes)
@@ -23,7 +23,7 @@ function add_constant!(ls::LoopSet, var::Symbol, elementbytes::Int)
     constant,
     NODEPENDENCY,
     Symbol[],
-    NOPARENTS,
+    NOPARENTS
   )
   rop = pushop!(ls, op, var)
   (!globalconst && (rop === op)) && pushpreamble!(ls, op, var)
@@ -38,7 +38,7 @@ function add_constant!(
   ls::LoopSet,
   var::Number,
   elementbytes::Int = 8,
-  varname = gensym!(ls, "loopconstnumber"),
+  varname = gensym!(ls, "loopconstnumber")
 )
   op = Operation(
     length(operations(ls)),
@@ -48,7 +48,7 @@ function add_constant!(
     constant,
     NODEPENDENCY,
     Symbol[],
-    NOPARENTS,
+    NOPARENTS
   )
   ops = operations(ls)
   typ = var isa Integer ? HardInt : HardFloat
@@ -109,7 +109,6 @@ function ensure_constant_lowered!(ls::LoopSet, op::Operation)
         pushpreamble!(ls, Expr(:(=), name(op), floatval))
         return
       end
-
     end
     for (id, typ) ∈ ls.preamble_zeros
       if id == opid
@@ -119,7 +118,10 @@ function ensure_constant_lowered!(ls::LoopSet, op::Operation)
     end
     for (id, f) ∈ ls.preamble_funcofeltypes
       if id == opid
-        pushpreamble!(ls, Expr(:(=), name(op), Expr(:call, reduction_zero(f), Float64)))
+        pushpreamble!(
+          ls,
+          Expr(:(=), name(op), Expr(:call, reduction_zero(f), Float64))
+        )
         return
       end
     end
@@ -128,7 +130,7 @@ end
 function ensure_constant_lowered!(
   ls::LoopSet,
   mpref::ArrayReferenceMetaPosition,
-  ind::Symbol,
+  ind::Symbol
 )
   length(loopdependencies(mpref)) == 0 && return
   for (id, opp) ∈ enumerate(parents(mpref))
@@ -142,7 +144,7 @@ function add_constant_vload!(
   ls::LoopSet,
   op::Operation,
   mpref::ArrayReferenceMetaPosition,
-  elementbytes::Int,
+  elementbytes::Int
 )
   temp = gensym!(ls, "intermediateconstref")
   use_getindex = vptr(name(mpref)) === mpref.mref.ptr
@@ -165,8 +167,8 @@ function add_constant_vload!(
           fill(false, nindices),
           true,
           ls,
-          false,
-        ).args,
+          false
+        ).args
       )
     else
       push!(
@@ -177,8 +179,8 @@ function add_constant_vload!(
           fill(false, nindices),
           true,
           ls,
-          false,
-        ),
+          false
+        )
       )
     end
   end
@@ -192,7 +194,11 @@ function add_constant_vload!(
   pushpreamble!(ls, op, temp)
   return temp
 end
-function add_constant!(ls::LoopSet, mpref::ArrayReferenceMetaPosition, elementbytes::Int)
+function add_constant!(
+  ls::LoopSet,
+  mpref::ArrayReferenceMetaPosition,
+  elementbytes::Int
+)
   op = Operation(
     length(operations(ls)),
     varname(mpref),
@@ -202,7 +208,7 @@ function add_constant!(ls::LoopSet, mpref::ArrayReferenceMetaPosition, elementby
     NODEPENDENCY,
     Symbol[],
     NOPARENTS,
-    mpref.mref,
+    mpref.mref
   )
   add_vptr!(ls, op)
   temp = add_constant_vload!(ls, op, mpref, elementbytes)
@@ -217,7 +223,7 @@ function add_constant!(
   deps::Vector{Symbol},
   assignedsym::Symbol,
   elementbytes::Int,
-  f::Symbol = Symbol(""),
+  f::Symbol = Symbol("")
 )
   value ∈ ls.loopsymbols && return add_loopvalue!(ls, value, elementbytes)
   retop = get(ls.opdict, value, nothing)
@@ -230,7 +236,7 @@ function add_constant!(
       constant,
       deps,
       NODEPENDENCY,
-      NOPARENTS,
+      NOPARENTS
     )
   else
     op = Operation(
@@ -241,7 +247,7 @@ function add_constant!(
       compute,
       deps,
       reduceddependencies(retop),
-      [retop],
+      [retop]
     )
   end
   pushop!(ls, op, assignedsym)
@@ -258,7 +264,7 @@ function add_constant!(
   value::Number,
   deps::Vector{Symbol},
   assignedsym::Symbol,
-  elementbytes::Int,
+  elementbytes::Int
 )
   op = add_constant!(
     ls,
@@ -266,7 +272,7 @@ function add_constant!(
     deps,
     assignedsym,
     elementbytes,
-    :numericconstant,
+    :numericconstant
   )
   pushpreamble!(ls, op, value)
   op
diff --git a/src/parse/add_ifelse.jl b/src/parse/add_ifelse.jl
index 611d87ed2..cc3e0d106 100644
--- a/src/parse/add_ifelse.jl
+++ b/src/parse/add_ifelse.jl
@@ -11,7 +11,7 @@ function add_if!(
   RHS::Expr,
   elementbytes::Int,
   position::Int,
-  mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing,
+  mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing
 )
   # for now, just simple 1-liners
   @assert length(RHS.args) == 3 "if statements without an else cannot be assigned to a variable."
@@ -21,11 +21,19 @@ function add_if!(
   elseif mpref === nothing
     add_operation!(ls, gensym!(ls, "mask"), condition, elementbytes, position)
   else
-    add_operation!(ls, gensym!(ls, "mask"), condition, mpref, elementbytes, position)
+    add_operation!(
+      ls,
+      gensym!(ls, "mask"),
+      condition,
+      mpref,
+      elementbytes,
+      position
+    )
   end
   iftrue = RHS.args[2]
   if iftrue isa Expr
-    trueop = add_operation!(ls, gensym!(ls, "iftrue"), iftrue, elementbytes, position)
+    trueop =
+      add_operation!(ls, gensym!(ls, "iftrue"), iftrue, elementbytes, position)
     if iftrue.head === :ref &&
        all(ld -> ld ∈ loopdependencies(trueop), loopdependencies(condop)) &&
        !search_tree(parents(condop), trueop)
@@ -49,7 +57,13 @@ function add_if!(
     end
   end
   if iffalse isa Expr
-    falseop = add_operation!(ls, gensym!(ls, "iffalse"), iffalse, elementbytes, position)
+    falseop = add_operation!(
+      ls,
+      gensym!(ls, "iffalse"),
+      iffalse,
+      elementbytes,
+      position
+    )
     if iffalse.head === :ref &&
        all(ld -> ld ∈ loopdependencies(falseop), loopdependencies(condop)) &&
        !search_tree(parents(condop), falseop)
@@ -78,7 +92,7 @@ function add_andblock!(
   LHS,
   rhsop::Operation,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
   if LHS isa Symbol
     altop = getop(ls, LHS, elementbytes)
@@ -95,9 +109,10 @@ function add_andblock!(
   LHS,
   RHS::Expr,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
-  rhsop = add_compute!(ls, gensym!(ls, "iftruerhs"), RHS, elementbytes, position)
+  rhsop =
+    add_compute!(ls, gensym!(ls, "iftruerhs"), RHS, elementbytes, position)
   add_andblock!(ls, condop, LHS, rhsop, elementbytes, position)
 end
 function add_andblock!(
@@ -106,7 +121,7 @@ function add_andblock!(
   LHS,
   RHS,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
   rhsop = getop(ls, RHS, elementbytes)
   add_andblock!(ls, condop, LHS, rhsop, elementbytes, position)
@@ -116,9 +131,10 @@ function add_andblock!(
   condexpr::Expr,
   condeval::Expr,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
-  condop = add_operation!(ls, gensym!(ls, "mask"), condexpr, elementbytes, position)
+  condop =
+    add_operation!(ls, gensym!(ls, "mask"), condexpr, elementbytes, position)
   add_andblock!(ls, condop, condeval, elementbytes, position)
 end
 function add_andblock!(
@@ -126,14 +142,21 @@ function add_andblock!(
   condop::Operation,
   condeval::Expr,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
   if condeval.head === :call
     @assert first(condeval.args) === :setindex!
     array, raw_indices = ref_from_setindex!(ls, condeval)
     ref = Expr(:ref, array)
     append!(ref.args, raw_indices)
-    return add_andblock!(ls, condop, ref, condeval.args[3], elementbytes, position)
+    return add_andblock!(
+      ls,
+      condop,
+      ref,
+      condeval.args[3],
+      elementbytes,
+      position
+    )
   end
   @assert condeval.head === :(=)
   @assert length(condeval.args) == 2
@@ -151,7 +174,7 @@ function add_andblock!(ls::LoopSet, ex::Expr, elementbytes::Int, position::Int)
       getop(ls, condexpr, elementbytes),
       last(ex.args)::Expr,
       elementbytes,
-      position,
+      position
     )
   end
 end
@@ -162,7 +185,7 @@ function add_orblock!(
   LHS,
   rhsop::Operation,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
   negatedcondop = negateop!(ls, condop, elementbytes)
   if LHS isa Symbol
@@ -170,7 +193,14 @@ function add_orblock!(
     # return add_compute!(ls, LHS, :ifelse, [condop, altop, rhsop], elementbytes)
     # Placing altop second seems to let LLVM fuse operations; but as of LLVM 9.0.1 it will not if altop is first
     # therefore, we negate the condition and switch order so that the altop is second.
-    return add_compute_ifelse!(ls, LHS, negatedcondop, rhsop, altop, elementbytes)
+    return add_compute_ifelse!(
+      ls,
+      LHS,
+      negatedcondop,
+      rhsop,
+      altop,
+      elementbytes
+    )
   elseif LHS isa Expr && LHS.head === :ref
     # negatedcondop = add_compute!(ls, gensym(:negated_mask), :~, [condop], elementbytes)
     return add_conditional_store!(ls, LHS, negatedcondop, rhsop, elementbytes)
@@ -184,9 +214,10 @@ function add_orblock!(
   LHS,
   RHS::Expr,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
-  rhsop = add_compute!(ls, gensym!(ls, "iffalserhs"), RHS, elementbytes, position)
+  rhsop =
+    add_compute!(ls, gensym!(ls, "iffalserhs"), RHS, elementbytes, position)
   add_orblock!(ls, condop, LHS, rhsop, elementbytes, position)
 end
 function add_orblock!(
@@ -195,7 +226,7 @@ function add_orblock!(
   LHS,
   RHS,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
   rhsop = getop(ls, RHS, elementbytes)
   add_orblock!(ls, condop, LHS, rhsop, elementbytes, position)
@@ -205,15 +236,23 @@ function add_orblock!(
   condexpr::Expr,
   condeval::Expr,
   elementbytes::Int,
-  position::Int,
+  position::Int
 )
-  condop = add_operation!(ls, gensym!(ls, "mask"), condexpr, elementbytes, position)
+  condop =
+    add_operation!(ls, gensym!(ls, "mask"), condexpr, elementbytes, position)
   if condeval.head === :call
     @assert first(condeval.args) === :setindex!
     array, raw_indices = ref_from_setindex!(ls, condeval)
     ref = Expr(:ref, array)
     append!(ref.args, raw_indices)
-    return add_orblock!(ls, condop, ref, condeval.args[3], elementbytes, position)
+    return add_orblock!(
+      ls,
+      condop,
+      ref,
+      condeval.args[3],
+      elementbytes,
+      position
+    )
   end
   @assert condeval.head === :(=)
   @assert length(condeval.args) == 2
@@ -222,5 +261,11 @@ function add_orblock!(
   add_orblock!(ls, condop, LHS, RHS, elementbytes, position)
 end
 function add_orblock!(ls::LoopSet, ex::Expr, elementbytes::Int, position::Int)
-  add_orblock!(ls, first(ex.args)::Expr, last(ex.args)::Expr, elementbytes, position)
+  add_orblock!(
+    ls,
+    first(ex.args)::Expr,
+    last(ex.args)::Expr,
+    elementbytes,
+    position
+  )
 end
diff --git a/src/parse/add_loads.jl b/src/parse/add_loads.jl
index be167176d..bf8537c7f 100644
--- a/src/parse/add_loads.jl
+++ b/src/parse/add_loads.jl
@@ -42,7 +42,13 @@ function add_load!(ls::LoopSet, op::Operation, actualarray::Bool = true)
   pushop!(ls, op, name(op))
 end
 
-function add_load!(ls::LoopSet, var::Symbol, array::Symbol, rawindices, elementbytes::Int)
+function add_load!(
+  ls::LoopSet,
+  var::Symbol,
+  array::Symbol,
+  rawindices,
+  elementbytes::Int
+)
   mpref = array_reference_meta!(ls, array, rawindices, elementbytes, var)
   add_load!(ls, mpref, elementbytes)
 end
@@ -57,7 +63,11 @@ function load_is_constant(mpref::ArrayReferenceMetaPosition)
   end
   true
 end
-function add_load!(ls::LoopSet, mpref::ArrayReferenceMetaPosition, elementbytes::Int)
+function add_load!(
+  ls::LoopSet,
+  mpref::ArrayReferenceMetaPosition,
+  elementbytes::Int
+)
   if length(mpref.loopdependencies) == 0 || load_is_constant(mpref)
     return add_constant!(ls, mpref, elementbytes)
   end
@@ -71,7 +81,7 @@ function add_simple_load!(
   var::Symbol,
   ref::ArrayReference,
   elementbytes::Int,
-  actualarray::Bool = true,
+  actualarray::Bool = true
 )
   loopdeps = copy(getindicesonly(ref))
   mref = ArrayReferenceMeta(ref, fill(true, length(loopdeps)))
@@ -83,7 +93,7 @@ function add_simple_load!(
   mref::ArrayReferenceMeta,
   loopdeps::Vector{Symbol},
   elementbytes::Int,
-  actualarray::Bool = true,
+  actualarray::Bool = true
 )
   op = Operation(
     length(operations(ls)),
@@ -94,7 +104,7 @@ function add_simple_load!(
     loopdeps,
     NODEPENDENCY,
     NOPARENTS,
-    mref,
+    mref
   )
   add_vptr!(ls, op.ref.ref.array, vptr(op), actualarray)
   pushop!(ls, op, var)
@@ -103,7 +113,12 @@ function add_load_ref!(ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int)
   array, rawindices = ref_from_ref!(ls, ex)
   add_load!(ls, var, array, rawindices, elementbytes)
 end
-function add_load_getindex!(ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int)
+function add_load_getindex!(
+  ls::LoopSet,
+  var::Symbol,
+  ex::Expr,
+  elementbytes::Int
+)
   array, rawindices = ref_from_getindex!(ls, ex)
   add_load!(ls, var, array, rawindices, elementbytes)
 end
@@ -114,6 +129,13 @@ function add_loopvalue!(ls::LoopSet, arg::Symbol, elementbytes::Int)
   for op ∈ operations(ls)#check to CSE
     (op.variable === arg && instr == instruction(op)) && return op
   end
-  op = Operation(length(operations(ls)), arg, elementbytes, instr, loopvalue, [arg])
+  op = Operation(
+    length(operations(ls)),
+    arg,
+    elementbytes,
+    instr,
+    loopvalue,
+    [arg]
+  )
   pushop!(ls, op, arg)
 end
diff --git a/src/parse/add_stores.jl b/src/parse/add_stores.jl
index c462a7365..e0139588a 100644
--- a/src/parse/add_stores.jl
+++ b/src/parse/add_stores.jl
@@ -11,7 +11,7 @@ end
 function add_store!(
   ls::LoopSet,
   op::Operation,
-  add_pvar::Bool = !any(r -> r == op.ref, ls.refs_aliasing_syms),
+  add_pvar::Bool = !any(r -> r == op.ref, ls.refs_aliasing_syms)
 )
   @assert isstore(op)
   if add_pvar
@@ -25,19 +25,19 @@ function add_copystore!(
   ls::LoopSet,
   parent::Operation,
   mpref::ArrayReferenceMetaPosition,
-  elementbytes::Int,
+  elementbytes::Int
 )
-  op = add_compute!(ls, gensym!(ls, "identity"), :identity, [parent], elementbytes)
+  op =
+    add_compute!(ls, gensym!(ls, "identity"), :identity, [parent], elementbytes)
   # pushfirst!(mpref.parents, parent)
   add_store!(ls, mpref, elementbytes, op)
 end
 
-
 function add_store!(
   ls::LoopSet,
   mpref::ArrayReferenceMetaPosition,
   elementbytes::Int,
-  parent = getop(ls, varname(mpref), mpref.loopdependencies, elementbytes),
+  parent = getop(ls, varname(mpref), mpref.loopdependencies, elementbytes)
 )
   isload(parent) && return add_copystore!(ls, parent, mpref, elementbytes)
   vparents = mpref.parents
@@ -62,7 +62,13 @@ function add_store!(
   add_store!(ls, op, add_pvar)
 end
 
-function add_store!(ls::LoopSet, var::Symbol, array::Symbol, rawindices, elementbytes::Int)
+function add_store!(
+  ls::LoopSet,
+  var::Symbol,
+  array::Symbol,
+  rawindices,
+  elementbytes::Int
+)
   mpref = array_reference_meta!(ls, array, rawindices, elementbytes, var)
   add_store!(ls, mpref, elementbytes)
 end
@@ -70,7 +76,7 @@ function add_simple_store!(
   ls::LoopSet,
   parent::Operation,
   mref::ArrayReferenceMeta,
-  elementbytes::Int,
+  elementbytes::Int
 )
   op = Operation(
     ls,
@@ -81,7 +87,7 @@ function add_simple_store!(
     getindices(mref.ref),
     NODEPENDENCY,
     [parent],
-    mref,
+    mref
   )
   add_unique_store!(ls, op)
 end
@@ -89,7 +95,7 @@ function add_simple_store!(
   ls::LoopSet,
   var::Union{Symbol,Operation},
   ref::Union{ArrayReference,ArrayReferenceMeta},
-  elementbytes::Int,
+  elementbytes::Int
 )
   parent = isa(var, Symbol) ? getop(ls, var, elementbytes) : var
   mref =
@@ -104,7 +110,13 @@ end
 function add_store_ref!(ls::LoopSet, var, ex::Expr, elementbytes::Int)
   array, raw_indices = ref_from_ref!(ls, ex)
   mpref = array_reference_meta!(ls, array, raw_indices, elementbytes)
-  c = add_constant!(ls, var, loopdependencies(mpref), gensym(:storeconst), elementbytes)
+  c = add_constant!(
+    ls,
+    var,
+    loopdependencies(mpref),
+    gensym(:storeconst),
+    elementbytes
+  )
   add_store!(ls, mpref, elementbytes, c)
 end
 
@@ -115,7 +127,7 @@ function add_conditional_store!(
   LHS,
   condop::Operation,
   storeop::Operation,
-  elementbytes::Int,
+  elementbytes::Int
 )
   array, rawindices = ref_from_ref!(ls, LHS)
   mpref = array_reference_meta!(ls, array, rawindices, elementbytes)
@@ -169,7 +181,7 @@ function add_conditional_store!(
     ldref,
     reduceddependencies(storeop),
     storeparents,
-    mref,
+    mref
   )
   add_unique_store!(ls, op)
 end
diff --git a/src/parse/memory_ops_common.jl b/src/parse/memory_ops_common.jl
index fe7be5538..21a30b108 100644
--- a/src/parse/memory_ops_common.jl
+++ b/src/parse/memory_ops_common.jl
@@ -6,7 +6,11 @@ function dottosym(x::Expr)::Symbol
   xa2 isa QuoteNode ? Symbol(s1, "###extractarray###", xa2.value) :
   Symbol(s1, "###extractarray###", xa2)
 end
-function extract_array_symbol_from_ref!(ls::LoopSet, ex::Expr, offset1::Int)::Symbol
+function extract_array_symbol_from_ref!(
+  ls::LoopSet,
+  ex::Expr,
+  offset1::Int
+)::Symbol
   ar = ex.args[1+offset1]
   if isa(ar, Symbol)
     return ar
@@ -20,7 +24,6 @@ function extract_array_symbol_from_ref!(ls::LoopSet, ex::Expr, offset1::Int)::Sy
   end
 end
 
-
 function ref_from_expr!(ls, ex, offset1::Int, offset2::Int)
   ar = extract_array_symbol_from_ref!(ls, ex, offset1)
   ar, @view(ex.args[2+offset2:end])
@@ -38,10 +41,16 @@ function ref_from_expr!(ls::LoopSet, ex::Expr)
 end
 
 add_vptr!(ls::LoopSet, op::Operation) = add_vptr!(ls, op.ref)
-add_vptr!(ls::LoopSet, mref::ArrayReferenceMeta) = add_vptr!(ls, mref.ref.array, vptr(mref))
+add_vptr!(ls::LoopSet, mref::ArrayReferenceMeta) =
+  add_vptr!(ls, mref.ref.array, vptr(mref))
 # using VectorizationBase: noaliasstridedpointer
 presbufsym(array) = Symbol('#', array, "#preserve#buffer#")
-function add_vptr!(ls::LoopSet, array::Symbol, vptrarray::Symbol, actualarray::Bool = true)
+function add_vptr!(
+  ls::LoopSet,
+  array::Symbol,
+  vptrarray::Symbol,
+  actualarray::Bool = true
+)
   if !includesarray(ls, array)
     push!(ls.includedarrays, array)
     actualarray && push!(ls.includedactualarrays, array)
@@ -50,8 +59,8 @@ function add_vptr!(ls::LoopSet, array::Symbol, vptrarray::Symbol, actualarray::B
       Expr(
         :(=),
         Expr(:tuple, vptrarray, presbufsym(array)),
-        Expr(:call, lv(:stridedpointer_preserve), array),
-      ),
+        Expr(:call, lv(:stridedpointer_preserve), array)
+      )
     )
   end
   nothing
@@ -72,7 +81,7 @@ function subset_vptr!(
   ind,
   previndices,
   loopindex,
-  D::Int,
+  D::Int
 )
   subset = D == 0
   str_typ = subset ? "subset" : "index"
@@ -94,7 +103,8 @@ function subset_vptr!(
         # A[I + J, constindex], I and J may be CartesianIndices. This requires they all be of same number of dims
         loopdep = first(loopdependencies(ls.opdict[previndices[i+offset]]))
       end
-      constoffset = append_loop_staticdims!(valcall, getloop(ls, loopdep), constoffset)
+      constoffset =
+        append_loop_staticdims!(valcall, getloop(ls, loopdep), constoffset)
     end
   end
   # indm1 = ind isa Integer ? ind - 1 : Expr(:call, :-, ind, 1)
@@ -118,7 +128,7 @@ function gesp_const_offset!(
   loopedindex::Vector{Bool},
   mlt::Integer,
   sym,
-  D::Int,
+  D::Int
 )
   if isone(mlt)
     subset_vptr!(ls, vptrarray, ninds, sym, indices, loopedindex, D)
@@ -137,16 +147,24 @@ function gesp_const_offsets!(
   indices::Vector{Symbol},
   loopedindex::Vector{Bool},
   mltsyms::Vector{Tuple{Int,Symbol}},
-  D::Int,
+  D::Int
 )
-  length(mltsyms) > 1 && sort!(mltsyms, by = last) # if multiple have same combination of syms, make sure they match even if order is different
+  length(mltsyms) > 1 && sort!(mltsyms; by = last) # if multiple have same combination of syms, make sure they match even if order is different
   for (mlt, sym) ∈ mltsyms
-    vptrarray = gesp_const_offset!(ls, vptrarray, ninds, indices, loopedindex, mlt, sym, D)
+    vptrarray = gesp_const_offset!(
+      ls,
+      vptrarray,
+      ninds,
+      indices,
+      loopedindex,
+      mlt,
+      sym,
+      D
+    )
   end
   vptrarray
 end
 
-
 byterepresentable(x)::Bool = false
 byterepresentable(x::Integer)::Bool = typemin(Int8) ≤ x ≤ typemax(Int8)
 function _addoffset!(
@@ -157,7 +175,7 @@ function _addoffset!(
   loopdependencies,
   ind,
   offset,
-  stride,
+  stride
 )
   push!(indices, ind)
   push!(offsets, offset % Int8)
@@ -186,7 +204,7 @@ function addopindex!(
   loopedindex::Vector{Bool},
   indop::Operation,
   stride = one(Int8),
-  offset = zero(Int8),
+  offset = zero(Int8)
 )
   pushparent!(parents, loopdependencies, reduceddeps, indop)
   push!(indices, name(indop))
@@ -205,7 +223,7 @@ function add_affine_index_expr!(
   mult_syms::Vector{Tuple{Int,Symbol}},
   constant::Base.RefValue{Int},
   stride::Int,
-  expr::Symbol,
+  expr::Symbol
 )
   push!(mult_syms, (stride, expr))
   return nothing
@@ -215,7 +233,7 @@ function add_affine_index_expr!(
   mult_syms::Vector{Tuple{Int,Symbol}},
   constant::Base.RefValue{Int},
   stride::Int,
-  expr::Integer,
+  expr::Integer
 )
   constant[] += stride * expr
   return nothing
@@ -225,10 +243,15 @@ function add_affine_op!(
   mult_syms::Vector{Tuple{Int,Symbol}},
   constant::Base.RefValue{Int},
   stride::Int,
-  expr::Expr,
+  expr::Expr
 )
-  parent =
-    add_operation!(ls, gensym!(ls, "indexpr"), expr, sizeof(Int), length(ls.loopsymbols))
+  parent = add_operation!(
+    ls,
+    gensym!(ls, "indexpr"),
+    expr,
+    sizeof(Int),
+    length(ls.loopsymbols)
+  )
   add_affine_index_expr!(ls, mult_syms, constant, stride, name(parent))
   return nothing
 end
@@ -239,7 +262,7 @@ function add_mul!(
   stride::Int,
   arg1,
   arg2,
-  expr,
+  expr
 )
   if arg1 isa Integer
     add_affine_index_expr!(ls, mult_syms, constant, stride * arg1, arg2)
@@ -255,9 +278,10 @@ function add_affine_index_expr!(
   mult_syms::Vector{Tuple{Int,Symbol}},
   constant::Base.RefValue{Int},
   stride::Int,
-  expr::Expr,
+  expr::Expr
 )
-  expr.head === :call || return add_affine_op!(ls, mult_syms, constant, stride, expr)
+  expr.head === :call ||
+    return add_affine_op!(ls, mult_syms, constant, stride, expr)
   f = expr.args[1]
   if f === :(*)
     @assert length(expr.args) == 3
@@ -278,7 +302,10 @@ function add_affine_index_expr!(
   end
   return nothing
 end
-function affine_index_expression(ls::LoopSet, expr)::Tuple{Int,Vector{Tuple{Int,Symbol}}}
+function affine_index_expression(
+  ls::LoopSet,
+  expr
+)::Tuple{Int,Vector{Tuple{Int,Symbol}}}
   mult_syms = Tuple{Int,Symbol}[]
   constant = Ref(0)
   add_affine_index_expr!(ls, mult_syms, constant, 1, expr)
@@ -296,7 +323,7 @@ function muladd_index!(
   loopedindex,
   mlt::Int,
   sym::Symbol,
-  offset::Int,
+  offset::Int
 )
   muladd_index!(
     ls,
@@ -309,7 +336,7 @@ function muladd_index!(
     loopedindex,
     mlt,
     getop(ls, sym, sizeof(Int)),
-    offset,
+    offset
   )
 end
 function muladd_op!(ls::LoopSet, mlt::Int, sym::Symbol, offset::Int)
@@ -350,7 +377,7 @@ function muladd_index!(
   loopedindex,
   mlt::Int,
   symop::Operation,
-  offset::Int,
+  offset::Int
 )
   if byterepresentable(offset) & byterepresentable(mlt)
     addopindex!(
@@ -363,7 +390,7 @@ function muladd_index!(
       loopedindex,
       symop,
       mlt,
-      offset,
+      offset
     )
   else
     indop = muladd_op!(ls, mlt, symop, offset)
@@ -375,7 +402,7 @@ function muladd_index!(
       offsets,
       strides,
       loopedindex,
-      indop,
+      indop
     )
   end
 end
@@ -403,7 +430,7 @@ function add_additive_index!(
   reduceddeps,
   offset,
   mlt,
-  D,
+  D
 )
   factor = Core.ifelse((instruction(mop).instr === :sub_fast), -1, 1)
   if length(parents(mop)) == 2
@@ -420,7 +447,7 @@ function add_additive_index!(
           loopdependencies,
           name(sub1),
           offset + literalval,
-          mlt,
+          mlt
         )
       else
         vptrarray = gesp_const_offset!(
@@ -431,7 +458,7 @@ function add_additive_index!(
           loopedindex,
           factor * mlt,
           name(sub2),
-          D,
+          D
         )
         _addoffset!(
           indices,
@@ -441,7 +468,7 @@ function add_additive_index!(
           loopdependencies,
           name(sub1),
           offset,
-          mlt,
+          mlt
         )
       end
     elseif isloopvalue(sub2) & isconstant(sub1)
@@ -455,11 +482,19 @@ function add_additive_index!(
           loopdependencies,
           name(sub2),
           offset + literalval,
-          factor * mlt,
+          factor * mlt
         )
       else
-        vptrarray =
-          gesp_const_offset!(ls, vptrarray, ninds, indices, loopedindex, mlt, name(sub1), D)
+        vptrarray = gesp_const_offset!(
+          ls,
+          vptrarray,
+          ninds,
+          indices,
+          loopedindex,
+          mlt,
+          name(sub1),
+          D
+        )
         _addoffset!(
           indices,
           offsets,
@@ -468,7 +503,7 @@ function add_additive_index!(
           loopdependencies,
           name(sub2),
           offset,
-          factor * mlt,
+          factor * mlt
         )
       end
     else
@@ -483,7 +518,7 @@ function add_additive_index!(
         loopedindex,
         mlt,
         sym,
-        offset,
+        offset
       )
     end
   else
@@ -498,7 +533,7 @@ function add_additive_index!(
       loopedindex,
       mlt,
       sym,
-      offset,
+      offset
     )
   end
   vptrarray
@@ -516,9 +551,8 @@ function checkforoffset!(
   loopdependencies::Vector{Symbol},
   reduceddeps::Vector{Symbol},
   ind::Expr,
-  D::Int,
+  D::Int
 )::Symbol
-
   offset, mult_syms = affine_index_expression(ls, ind)
   let deleted = 0, N = length(mult_syms)
     for n ∈ 1:N
@@ -563,14 +597,22 @@ function checkforoffset!(
           loopedindex,
           mlt,
           sym,
-          offset,
+          offset
         )
         return vptrarray
       end
     end
     r = copysign(abs(offset) & 127, offset)
-    vptrarray =
-      gesp_const_offset!(ls, vptrarray, ninds, indices, loopedindex, 1, offset - r, D)
+    vptrarray = gesp_const_offset!(
+      ls,
+      vptrarray,
+      ninds,
+      indices,
+      loopedindex,
+      1,
+      offset - r,
+      D
+    )
     offset = r
   end
   # (success && byterepresentable(offset)) || return false, vptrarray
@@ -589,7 +631,7 @@ function checkforoffset!(
           loopdependencies,
           sym,
           offset,
-          mlt,
+          mlt
         )
       else
         muladd_index!(
@@ -603,7 +645,7 @@ function checkforoffset!(
           loopedindex,
           mlt,
           sym,
-          offset,
+          offset
         )
       end
     elseif !byterepresentable(mlt)
@@ -618,17 +660,33 @@ function checkforoffset!(
         loopedindex,
         mlt,
         sym,
-        offset,
+        offset
       )
     else
       mop = get(ls.opdict, sym, nothing)
       if mop === nothing
-        vptrarray =
-          gesp_const_offset!(ls, vptrarray, ninds, indices, loopedindex, mlt, sym, D)
+        vptrarray = gesp_const_offset!(
+          ls,
+          vptrarray,
+          ninds,
+          indices,
+          loopedindex,
+          mlt,
+          sym,
+          D
+        )
         addconstindex!(indices, offsets, strides, loopedindex, offset)
       elseif isconstant(mop)
-        vptrarray =
-          gesp_const_offset!(ls, vptrarray, ninds, indices, loopedindex, mlt, name(mop), D)
+        vptrarray = gesp_const_offset!(
+          ls,
+          vptrarray,
+          ninds,
+          indices,
+          loopedindex,
+          mlt,
+          name(mop),
+          D
+        )
         addconstindex!(indices, offsets, strides, loopedindex, offset)
       elseif (instruction(mop).instr === :add_fast) ||
              (instruction(mop).instr === :sub_fast)
@@ -647,7 +705,7 @@ function checkforoffset!(
           reduceddeps,
           offset,
           mlt,
-          D,
+          D
         )
       else
         muladd_index!(
@@ -661,7 +719,7 @@ function checkforoffset!(
           loopedindex,
           mlt,
           sym,
-          offset,
+          offset
         )
       end
     end
@@ -685,11 +743,28 @@ function checkforoffset!(
       sop = get(ls.opdict, s, nothing)
       if sop === nothing
         push!(deleteat_inds, i)
-        vptrarray = gesp_const_offset!(ls, vptrarray, ninds, indices, loopedindex, m, s, D)
+        vptrarray = gesp_const_offset!(
+          ls,
+          vptrarray,
+          ninds,
+          indices,
+          loopedindex,
+          m,
+          s,
+          D
+        )
       elseif isconstant(sop)
         push!(deleteat_inds, i)
-        vptrarray =
-          gesp_const_offset!(ls, vptrarray, ninds, indices, loopedindex, m, name(sop), D)
+        vptrarray = gesp_const_offset!(
+          ls,
+          vptrarray,
+          ninds,
+          indices,
+          loopedindex,
+          m,
+          name(sop),
+          D
+        )
       else
         # @show sop
         # if instruction(sop).instr === :sub_fast
@@ -729,18 +804,38 @@ function checkforoffset!(
         loopdependencies,
         sym,
         offset,
-        mlt,
+        mlt
       )
     end
     deleteat!(mult_syms, deleteat_inds)
-    return gesp_const_offsets!(ls, vptrarray, ninds, indices, loopedindex, mult_syms, D)
+    return gesp_const_offsets!(
+      ls,
+      vptrarray,
+      ninds,
+      indices,
+      loopedindex,
+      mult_syms,
+      D
+    )
   end
   deleteat!(mult_syms, deleteat_inds)
-  vptrarray = gesp_const_offsets!(ls, vptrarray, ninds, indices, loopedindex, mult_syms, D)
+  vptrarray = gesp_const_offsets!(
+    ls,
+    vptrarray,
+    ninds,
+    indices,
+    loopedindex,
+    mult_syms,
+    D
+  )
   if length(operations) == 1
     _mlt = only(operation_mults)
-    indop =
-      muladd_op!(ls, Core.ifelse(byterepresentable(_mlt), 1, _mlt), only(operations), 0)
+    indop = muladd_op!(
+      ls,
+      Core.ifelse(byterepresentable(_mlt), 1, _mlt),
+      only(operations),
+      0
+    )
     addopindex!(
       opparents,
       loopdependencies,
@@ -751,7 +846,7 @@ function checkforoffset!(
       loopedindex,
       indop,
       Core.ifelse(byterepresentable(_mlt), _mlt % Int8, one(Int8)),
-      offset % Int8,
+      offset % Int8
     )
   else
     mlt1ind = findfirst(isone, operation_mults)
@@ -770,7 +865,7 @@ function checkforoffset!(
           gensym!(ls, "indexaccum"),
           instruction(:(-)),
           [opbase, _op],
-          sizeof(Int),
+          sizeof(Int)
         )
       elseif _mlt == 1
         add_compute!(
@@ -778,7 +873,7 @@ function checkforoffset!(
           gensym!(ls, "indexaccum"),
           instruction(:(+)),
           [opbase, _op],
-          sizeof(Int),
+          sizeof(Int)
         )
       else
         add_compute!(
@@ -786,7 +881,7 @@ function checkforoffset!(
           gensym!(ls, "indexaccum"),
           instruction(:muladd),
           [add_constant!(ls, _mlt, sizeof(Int)), _op, opbase],
-          sizeof(Int),
+          sizeof(Int)
         )
       end
     end
@@ -800,7 +895,7 @@ function checkforoffset!(
       loopedindex,
       opbase,
       one(Int8),
-      offset % Int8,
+      offset % Int8
     )
   end
   return vptrarray
@@ -819,14 +914,15 @@ function repeated_index!(
   indices::Vector{Symbol},
   vptr::Symbol,
   indnum::Int,
-  firstind::Int,
+  firstind::Int
 )
   # Move ind to last position
-  vptrrepremoved = Symbol(vptr, "##ind##", firstind, "##repeated##", indnum, "##")
+  vptrrepremoved =
+    Symbol(vptr, "##ind##", firstind, "##repeated##", indnum, "##")
   f = Expr(
     :(.),
     Expr(:(.), :LoopVectorization, QuoteNode(:VectorizationBase)),
-    QuoteNode(:double_index),
+    QuoteNode(:double_index)
   )
   fiv = Expr(:call, Expr(:curly, :Val, firstind - 1))
   siv = Expr(:call, Expr(:curly, :Val, indnum - 1))
@@ -839,7 +935,7 @@ function array_reference_meta!(
   array::Symbol,
   rawindices,
   elementbytes::Int,
-  var::Union{Nothing,Symbol} = nothing,
+  var::Union{Nothing,Symbol} = nothing
 )
   vptrarray = vptr(array)
   add_vptr!(ls, array, vptrarray) # now, subset
@@ -860,7 +956,15 @@ function array_reference_meta!(
         ninds += 1
       else
         # convert ind to reduce invalidations
-        vptrarray = subset_vptr!(ls, vptrarray, ninds, convert(Int, ind), indices, loopedindex, 0)
+        vptrarray = subset_vptr!(
+          ls,
+          vptrarray,
+          ninds,
+          convert(Int, ind),
+          indices,
+          loopedindex,
+          0
+        )
         length(indices) == 0 && push!(indices, DISCONTIGUOUS)
       end
     elseif ind isa Expr
@@ -877,7 +981,7 @@ function array_reference_meta!(
         loopdependencies,
         reduceddeps,
         ind,
-        D,
+        D
       )
       ninds += 1
     elseif ind isa Symbol
@@ -900,7 +1004,7 @@ function array_reference_meta!(
           else
             move_to_last!(
               loopdependencies,
-              findfirst(Base.Fix2(===, ind), loopdependencies)::Int,
+              findfirst(Base.Fix2(===, ind), loopdependencies)::Int
             )
           end
           vptrarray = repeated_index!(
@@ -908,7 +1012,7 @@ function array_reference_meta!(
             indices,
             vptrarray,
             ninds,
-            ind_prev_index + (first(indices) === DISCONTIGUOUS),
+            ind_prev_index + (first(indices) === DISCONTIGUOUS)
           )
           makediscontiguous!(indices)
         end
@@ -932,7 +1036,7 @@ function array_reference_meta!(
               reduceddeps,
               0,
               1,
-              D,
+              D
             )
             ninds += 1
           else
@@ -944,7 +1048,8 @@ function array_reference_meta!(
             push!(loopedindex, false)
           end
         else
-          vptrarray = subset_vptr!(ls, vptrarray, ninds, ind, indices, loopedindex, 0)
+          vptrarray =
+            subset_vptr!(ls, vptrarray, ninds, ind, indices, loopedindex, 0)
           length(indices) == 0 && push!(indices, DISCONTIGUOUS)
         end
       end
@@ -955,21 +1060,21 @@ function array_reference_meta!(
   mref = ArrayReferenceMeta(
     ArrayReference(array, indices, offsets, strides),
     loopedindex,
-    vptrarray,
+    vptrarray
   )
   ArrayReferenceMetaPosition(
     mref,
     parents,
     loopdependencies,
     reduceddeps,
-    var === nothing ? Symbol("") : var,
+    var === nothing ? Symbol("") : var
   )
 end
 function tryrefconvert(
   ls::LoopSet,
   ex::Expr,
   elementbytes::Int,
-  var::Union{Nothing,Symbol} = nothing,
+  var::Union{Nothing,Symbol} = nothing
 )::Tuple{Bool,ArrayReferenceMetaPosition}
   ya, yinds = if ex.head === :ref
     ref_from_ref!(ls, ex)
diff --git a/src/predicates.jl b/src/predicates.jl
index 192d367c2..48291974d 100644
--- a/src/predicates.jl
+++ b/src/predicates.jl
@@ -23,4 +23,5 @@ hasscope(modex, modpath::Tuple{Vararg{Symbol}}) =
 
 Return true if `g` is equal to `GlobalRef(mod, name)`.
 """
-isglobalref(g, mod, name) = isa(g, GlobalRef) && g.mod === mod && g.name === name
+isglobalref(g, mod, name) =
+  isa(g, GlobalRef) && g.mod === mod && g.name === name
diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl
index b3eef51d6..e704100d5 100644
--- a/src/reconstruct_loopset.jl
+++ b/src/reconstruct_loopset.jl
@@ -2,39 +2,47 @@ const NOpsType = Int#Union{Int,Vector{Int}}
 
 struct UpperBoundedInteger{N,T<:Base.BitInteger} <: Integer
   i::T
-  @inline UpperBoundedInteger{N}(i::T) where {N,T<:Base.BitInteger} = new{N,T}(i)
+  @inline UpperBoundedInteger{N}(i::T) where {N,T<:Base.BitInteger} =
+    new{N,T}(i)
 end
 
 @inline UpperBoundedInteger(i::T, ::StaticInt{N}) where {N,T<:Base.BitInteger} =
   UpperBoundedInteger{N}(i)
-@inline UpperBoundedInteger(::StaticInt{M}, ::StaticInt{N}) where {N,M} = StaticInt{M}()
+@inline UpperBoundedInteger(::StaticInt{M}, ::StaticInt{N}) where {N,M} =
+  StaticInt{M}()
 @inline UpperBoundedInteger{N}(::StaticInt{M}) where {N,M} = StaticInt{M}()
-@inline Base.:(%)(a::UpperBoundedInteger, ::Type{T}) where {T<:Base.BitInteger} = a.i % T
+@inline Base.:(%)(
+  a::UpperBoundedInteger,
+  ::Type{T}
+) where {T<:Base.BitInteger} = a.i % T
 Base.promote_rule(
   ::Type{T},
-  ::Type{UpperBoundedInteger{N,S}},
+  ::Type{UpperBoundedInteger{N,S}}
 ) where {N,T<:Base.BitInteger,S} = promote_rule(T, S)
 Base.promote_rule(
   ::Type{UpperBoundedInteger{N,S}},
-  ::Type{T},
+  ::Type{T}
 ) where {N,T<:Base.BitInteger,S} = promote_rule(S, T)
 Base.promote_rule(
   ::Type{UpperBoundedInteger{N,T}},
-  ::Type{T},
+  ::Type{T}
 ) where {N,T<:Base.BitInteger} = T
-Base.convert(::Type{T}, i::UpperBoundedInteger) where {T<:Number} = convert(T, i.i)
+Base.convert(::Type{T}, i::UpperBoundedInteger) where {T<:Number} =
+  convert(T, i.i)
 Base.convert(
   ::Type{UpperBoundedInteger{N,T}},
-  i::UpperBoundedInteger{N,T},
+  i::UpperBoundedInteger{N,T}
 ) where {N,T<:Base.BitInteger} = i
 upper_bound(_) = typemax(Int)
-upper_bound(::Type{CO}) where {T,N,S,CO<:AbstractCloseOpen{T,UpperBoundedInteger{N,S}}} =
-  N - 1
+upper_bound(
+  ::Type{CO}
+) where {T,N,S,CO<:AbstractCloseOpen{T,UpperBoundedInteger{N,S}}} = N - 1
 
 @inline Base.last(r::AbstractCloseOpen{<:Integer,<:UpperBoundedInteger}) =
   getfield(getfield(r, :upper), :i) - One()
-@inline ArrayInterface.static_last(r::CloseOpen{<:Integer,<:UpperBoundedInteger}) =
-  getfield(getfield(r, :upper), :i) - One()
+@inline ArrayInterface.static_last(
+  r::CloseOpen{<:Integer,<:UpperBoundedInteger}
+) = getfield(getfield(r, :upper), :i) - One()
 @inline Base.length(r::AbstractCloseOpen{<:Integer,<:UpperBoundedInteger}) =
   getfield(getfield(r, :upper), :i) - getfield(r, :start)
 @inline Base.length(r::AbstractCloseOpen{Zero,<:UpperBoundedInteger}) =
@@ -50,13 +58,17 @@ function Loop(ls::LoopSet, ex::Expr, sym::Symbol, f, s, l, ub::Int)
   pushpreamble!(ls, Expr(:(=), rangesym, ex))
   pushpreamble!(
     ls,
-    Expr(:(=), lensym, Expr(:call, GlobalRef(ArrayInterface, :static_length), rangesym)),
+    Expr(
+      :(=),
+      lensym,
+      Expr(:call, GlobalRef(ArrayInterface, :static_length), rangesym)
+    )
   )
   F = if f === nothing
     start = gensym(ssym * "_loopstart")
     pushpreamble!(
       ls,
-      Expr(:(=), start, Expr(:call, %, Expr(:call, lv(:first), rangesym), Int)),
+      Expr(:(=), start, Expr(:call, %, Expr(:call, lv(:first), rangesym), Int))
     )
     MaybeKnown(start, 1)
   else
@@ -66,7 +78,7 @@ function Loop(ls::LoopSet, ex::Expr, sym::Symbol, f, s, l, ub::Int)
     step = gensym(ssym * "_loopstep")
     pushpreamble!(
       ls,
-      Expr(:(=), step, Expr(:call, %, Expr(:call, lv(:step), rangesym), Int)),
+      Expr(:(=), step, Expr(:call, %, Expr(:call, lv(:step), rangesym), Int))
     )
     MaybeKnown(step, 1)
   else
@@ -76,7 +88,7 @@ function Loop(ls::LoopSet, ex::Expr, sym::Symbol, f, s, l, ub::Int)
     stop = gensym(ssym * "_loopstop")
     pushpreamble!(
       ls,
-      Expr(:(=), stop, Expr(:call, %, Expr(:call, lv(:last), rangesym), Int)),
+      Expr(:(=), stop, Expr(:call, %, Expr(:call, lv(:last), rangesym), Int))
     )
     MaybeKnown(stop, min(ub, 1024))
   else
@@ -84,7 +96,12 @@ function Loop(ls::LoopSet, ex::Expr, sym::Symbol, f, s, l, ub::Int)
   end
   loopiteratesatleastonce!(ls, Loop(sym, F, L, S, rangesym, lensym))
 end
-function Loop(ls::LoopSet, ex::Expr, sym::Symbol, ::Type{R}) where {R<:AbstractRange}
+function Loop(
+  ls::LoopSet,
+  ex::Expr,
+  sym::Symbol,
+  ::Type{R}
+) where {R<:AbstractRange}
   f = ArrayInterface.known_first(R)
   s = ArrayInterface.known_step(R)
   l = ArrayInterface.known_last(R)
@@ -93,13 +110,20 @@ function Loop(ls::LoopSet, ex::Expr, sym::Symbol, ::Type{R}) where {R<:AbstractR
 end
 
 function static_loop(sym::Symbol, L::Int, S::Int, U::Int)
-  Loop(sym, MaybeKnown(L, 0), MaybeKnown(U, 0), MaybeKnown(S, 0), Symbol(""), Symbol(""))
+  Loop(
+    sym,
+    MaybeKnown(L, 0),
+    MaybeKnown(U, 0),
+    MaybeKnown(S, 0),
+    Symbol(""),
+    Symbol("")
+  )
 end
 function Loop(
   ::LoopSet,
   ::Expr,
   sym::Symbol,
-  ::Type{OptionallyStaticUnitRange{StaticInt{L},StaticInt{U}}},
+  ::Type{OptionallyStaticUnitRange{StaticInt{L},StaticInt{U}}}
 ) where {L,U}
   static_loop(sym, L, 1, U)
 end
@@ -107,7 +131,13 @@ function Loop(
   ::LoopSet,
   ::Expr,
   sym::Symbol,
-  ::Type{ArrayInterface.OptionallyStaticStepRange{StaticInt{L},StaticInt{S},StaticInt{U}}},
+  ::Type{
+    ArrayInterface.OptionallyStaticStepRange{
+      StaticInt{L},
+      StaticInt{S},
+      StaticInt{U}
+    }
+  }
 ) where {L,S,U}
   static_loop(sym, L, S, U)
 end
@@ -115,12 +145,11 @@ function Loop(
   ::LoopSet,
   ::Expr,
   sym::Symbol,
-  ::Type{CO},
+  ::Type{CO}
 ) where {L,U,CO<:AbstractCloseOpen{StaticInt{L},StaticInt{U}}}
   static_loop(sym, L, 1, U - 1)
 end
 
-
 extract_loop(l) = Expr(:call, getfield, Symbol("#loop#bounds#"), l)
 
 function add_loops!(ls::LoopSet, LPSYM, LB)
@@ -148,7 +177,12 @@ function add_loops!(
       :($getfield($getfield($getfield(var"#loop#bounds#", $i), :indices), $k))
     add_loop!(
       ls,
-      Loop(ls, axisexpr, Symbol(ssym * '#' * string(k) * '#'), T.parameters[k])::Loop,
+      Loop(
+        ls,
+        axisexpr,
+        Symbol(ssym * '#' * string(k) * '#'),
+        T.parameters[k]
+      )::Loop
     )
   end
   push!(ls.loopsymbol_offsets, ls.loopsymbol_offsets[end] + N)
@@ -160,7 +194,7 @@ function ArrayReferenceMeta(
   arraysymbolinds::Vector{Symbol},
   opsymbols::Vector{Symbol},
   nopsv::Vector{NOpsType},
-  expandedv::Vector{Bool},
+  expandedv::Vector{Bool}
 )
   # unpack the `ArrayRefStruct`
   # we don't want to specialize on it, as it is typed on symbols.
@@ -182,7 +216,7 @@ function ArrayReferenceMeta(
     arraysymbolinds,
     opsymbols,
     nopsv,
-    expandedv,
+    expandedv
   )
 end
 function ArrayReferenceMeta(
@@ -196,7 +230,7 @@ function ArrayReferenceMeta(
   arraysymbolinds::Vector{Symbol},
   opsymbols::Vector{Symbol},
   nopsv::Vector{NOpsType},
-  expandedv::Vector{Bool},
+  expandedv::Vector{Bool}
 )
   ni = filled_8byte_chunks(index_types)
   index_vec = Symbol[]
@@ -253,11 +287,10 @@ function ArrayReferenceMeta(
   ArrayReferenceMeta(
     ArrayReference(arrayar, index_vec, offset_vec, stride_vec),
     loopedindex,
-    ptrar,
+    ptrar
   )
 end
 
-
 extract_varg(i) = :($getfield(var"#vargs#", $i))
 # _extract(::Type{StaticInt{N}}) where {N} = N
 extract_gsp!(sptrs::Expr, name::Symbol) = (push!(sptrs.args, name); nothing)
@@ -275,7 +308,9 @@ function loop_indexes_bit!(ls::LoopSet, ar::ArrayReferenceMeta)
   ind = first(getindices(ar))
   ind === DISCONTIGUOUS && return
   first(li) || throw(
-    LoopError("The contiguous index of a `BitArray` shouldn't be a complex function.")ind,
+    LoopError(
+      "The contiguous index of a `BitArray` shouldn't be a complex function."
+    )ind
   )
   ls.loopindexesbit[getloopid(ls, ind)] = true
   nothing
@@ -288,13 +323,14 @@ function add_mref!(
   C::Int,
   B::Int,
   sp::Vector{Int},
-  name::Symbol,
+  name::Symbol
 ) where {T}
   @assert B ≤ 0 "Batched arrays not supported yet."
   _add_mref!(sptrs, ls, ar, typetosym(T), C, B, sp, name)
   sizeof(T)
 end
-typetosym(::Type{T}) where {T<:NativeTypes} = (VectorizationBase.JULIA_TYPES[T])::Symbol
+typetosym(::Type{T}) where {T<:NativeTypes} =
+  (VectorizationBase.JULIA_TYPES[T])::Symbol
 typetosym(T) = T
 function _add_mref!(
   sptrs::Expr,
@@ -304,7 +340,7 @@ function _add_mref!(
   C::Int,
   B::Int,
   sp::Vector{Int},
-  name::Symbol,
+  name::Symbol
 )
   # maybe no change needed? -- optimize common case
   li = ar.loopedindex
@@ -333,14 +369,15 @@ function _add_mref!(
   for n ∈ eachindex(sp)
     push!(column_major.args, n)
   end
-  sitype = Expr(:curly, lv(:StrideIndex), length(sp), column_major, (C == -1 ? -1 : 1))
+  sitype =
+    Expr(:curly, lv(:StrideIndex), length(sp), column_major, (C == -1 ? -1 : 1))
   siexpr = Expr(:call, sitype, strd_tup, offsets_tup)
   sptr = Expr(
     :call,
     lv(:stridedpointer),
     Expr(:call, lv(:pointer), tmpsp),
     siexpr,
-    staticexpr(B),
+    staticexpr(B)
   )
 
   pushpreamble!(ls, Expr(:(=), name, sptr))
@@ -373,7 +410,7 @@ function add_mref!(
   ::Int,
   ::Int,
   sp::Vector{Int},
-  name::Symbol,
+  name::Symbol
 ) where {T,F,S,O}
   extract_gsp!(sptrs, name)
   sizeof(T)
@@ -385,12 +422,12 @@ function create_mrefs!(
   os::Vector{Symbol},
   nopsv::Vector{NOpsType},
   expanded::Vector{Bool},
-  ::Type{Tuple{}},
+  ::Type{Tuple{}}
 )
   length(arf) == 0 || throw(
     ArgumentError(
-      "Length of array ref vector should be 0 if there are no stridedpointers.",
-    ),
+      "Length of array ref vector should be 0 if there are no stridedpointers."
+    )
   )
   Vector{ArrayReferenceMeta}(undef, length(arf)), Int[]
 end
@@ -404,7 +441,10 @@ function stabilize_grouped_stridedpointer_type(C, B, R)
     Bv[n] = B[n]
     Rₙ = R[n]
     let L::Int = length(Rₙ)
-      Rv[n] = (ntuple(i -> i > L ? typemax(Int) : (Rₙ[i])::Int, Val(8))::NTuple{8,Int}, L)
+      Rv[n] = (
+        ntuple(i -> i > L ? typemax(Int) : (Rₙ[i])::Int, Val(8))::NTuple{8,Int},
+        L
+      )
     end
   end
   Cv, Bv, Rv
@@ -418,7 +458,6 @@ function create_mrefs!(
   expanded::Vector{Bool},
   @nospecialize(_::Type{GroupedStridedPointers{P,C,B,R,I,X,O}})
 ) where {P,C,B,R,I,X,O}
-
   Cv, Bv, Rv = stabilize_grouped_stridedpointer_type(C, B, R)
   _create_mrefs!(ls, arf, as, os, nopsv, expanded, P.parameters, Cv, Bv, Rv)
 end
@@ -432,9 +471,10 @@ function _create_mrefs!(
   P::Core.SimpleVector,
   C::Vector{Int},
   B::Vector{Int},
-  R::Vector{Tuple{NTuple{8,Int},Int}},
+  R::Vector{Tuple{NTuple{8,Int},Int}}
 )
-  mrefs::Vector{ArrayReferenceMeta} = Vector{ArrayReferenceMeta}(undef, length(arf))
+  mrefs::Vector{ArrayReferenceMeta} =
+    Vector{ArrayReferenceMeta}(undef, length(arf))
   elementbytes::Vector{Int} = Vector{Int}(undef, length(arf))
   sptrs = Expr(:tuple)
   # pushpreamble!(ls, Expr(:(=), sptrs, :(VectorizationBase.stridedpointers(getfield(vargs, 1, false)))))
@@ -443,8 +483,8 @@ function _create_mrefs!(
     Expr(
       :(=),
       sptrs,
-      :(VectorizationBase.stridedpointers(getfield(var"#vargs#", 1, false))),
-    ),
+      :(VectorizationBase.stridedpointers(getfield(var"#vargs#", 1, false)))
+    )
   )
   j = 0
   rank_to_sps = Vector{Tuple{Int,Vector{Int}}}(undef, length(arf))
@@ -504,7 +544,7 @@ function expandbyoffset!(
   indexpand::Vector{T},
   inds,
   offsets::Vector{Int},
-  expand::Bool = true,
+  expand::Bool = true
 ) where {T<:Union{Int,Tuple{Int,<:Any}}}
   for _ind ∈ inds
     ind = T === Int ? _ind : first(_ind)
@@ -524,7 +564,12 @@ function expandbyoffset!(
 end
 expandbyoffset(inds::Vector{Int}, offsets::Vector{Int}, expand::Bool) =
   expandbyoffset!(Int[], inds, offsets, expand)
-function loopindex!(idxs::Vector{Int}, ls::LoopSet, u::Unsigned, shift::Unsigned)
+function loopindex!(
+  idxs::Vector{Int},
+  ls::LoopSet,
+  u::Unsigned,
+  shift::Unsigned
+)
   mask = (one(shift) << shift) - one(shift) # mask to zero out all but shift-bits
   while u != zero(u)
     pushfirst!(idxs, (u % typeof(shift)) & mask)
@@ -534,7 +579,12 @@ function loopindex!(idxs::Vector{Int}, ls::LoopSet, u::Unsigned, shift::Unsigned
 end
 loopindex(ls::LoopSet, u::Unsigned, shift::Unsigned) =
   reverse!(loopindex!(Int[], ls, u, shift))
-function loopindexoffset(ls::LoopSet, u::Unsigned, li::Bool, expand::Bool = false)
+function loopindexoffset(
+  ls::LoopSet,
+  u::Unsigned,
+  li::Bool,
+  expand::Bool = false
+)
   if li
     shift = 0x04
     offsets = ls.loopsymbol_offsets
@@ -551,13 +601,27 @@ function parents_symvec(ls::LoopSet, u::Unsigned, expand, offset)
 end
 loopdependencies(ls::LoopSet, os::OperationStruct, expand = false, offset = 0) =
   parents_symvec(ls, os.loopdeps, expand, offset)
-reduceddependencies(ls::LoopSet, os::OperationStruct, expand = false, offset = 0) =
-  parents_symvec(ls, os.reduceddeps, expand, offset)
-childdependencies(ls::LoopSet, os::OperationStruct, expand = false, offset = 0) =
-  parents_symvec(ls, os.childdeps, expand, offset)
+reduceddependencies(
+  ls::LoopSet,
+  os::OperationStruct,
+  expand = false,
+  offset = 0
+) = parents_symvec(ls, os.reduceddeps, expand, offset)
+childdependencies(
+  ls::LoopSet,
+  os::OperationStruct,
+  expand = false,
+  offset = 0
+) = parents_symvec(ls, os.childdeps, expand, offset)
 
 # parents(ls::LoopSet, u::UInt128) = loopindexoffset(ls, u, false)
-function parents(ls::LoopSet, u₀::UInt128, u₁::UInt128, u₂::UInt128, u₃::UInt128)
+function parents(
+  ls::LoopSet,
+  u₀::UInt128,
+  u₁::UInt128,
+  u₂::UInt128,
+  u₃::UInt128
+)
   idxs = Int[]
   u₃ == zero(u₃) || loopindex!(idxs, ls, u₃, 0x0010)
   u₂ == zero(u₂) || loopindex!(idxs, ls, u₂, 0x0010)
@@ -584,7 +648,7 @@ function isexpanded(
   ls::LoopSet,
   ops::Vector{OperationStruct},
   nopsv::Vector{NOpsType},
-  i::Int,
+  i::Int
 )
   nops = nopsv[i]
   # nops isa Vector{Int} only if accesses_memory(os), which means isexpanded must be false
@@ -602,7 +666,7 @@ end
 function mref_elbytes(
   os::OperationStruct,
   mrefs::Vector{ArrayReferenceMeta},
-  elementbytes::Vector{Int},
+  elementbytes::Vector{Int}
 )
   if isload(os) | isstore(os)
     mrefs[os.array], elementbytes[os.array]
@@ -619,7 +683,7 @@ function add_op!(
   i::Int,
   mrefs::Vector{ArrayReferenceMeta},
   opsymbol,
-  elementbytes::Vector{Int},
+  elementbytes::Vector{Int}
 )
   os = ops[i]
   mref, elbytes = mref_elbytes(os, mrefs, elementbytes)
@@ -640,7 +704,7 @@ function add_op!(
       reduceddependencies(ls, os, true),
       Operation[],
       mref,
-      childdependencies(ls, os, true),
+      childdependencies(ls, os, true)
     )
     push!(ls.operations, op)
     push!(opoffsets, opoffsets[end] + 1)
@@ -660,7 +724,7 @@ function add_op!(
       reduceddependencies(ls, os, false, offset),
       Operation[],
       mref,
-      childdependencies(ls, os, false, offset),
+      childdependencies(ls, os, false, offset)
     )
     push!(ls.operations, op)
   end
@@ -675,7 +739,7 @@ function add_parents_to_op!(
   up₂::UInt128,
   up₃::UInt128,
   k::Int,
-  Δ::Int,
+  Δ::Int
 )
   vparents = parents(op)
   ops = operations(ls)
@@ -701,7 +765,11 @@ function add_parents_to_op!(
     end
   end
 end
-function add_parents_to_ops!(ls::LoopSet, ops::Vector{OperationStruct}, constoffset)
+function add_parents_to_ops!(
+  ls::LoopSet,
+  ops::Vector{OperationStruct},
+  constoffset
+)
   offsets = ls.operation_offsets
   for i = 1:length(offsets)-1
     pos = offsets[i]
@@ -723,7 +791,7 @@ function add_parents_to_ops!(ls::LoopSet, ops::Vector{OperationStruct}, constoff
           ops[i].parents₂,
           ops[i].parents₃,
           k,
-          Δ,
+          Δ
         )
       end
     end
@@ -739,13 +807,23 @@ function add_ops!(
   opsymbols::Vector{Symbol},
   constoffset::Int,
   nopsv::Vector{NOpsType},
-  expandedv::Vector{Bool},
+  expandedv::Vector{Bool}
 )
   # @show ls.loopsymbols ls.loopsymbol_offsets
   for i ∈ eachindex(ops)
     os = ops[i]
     opsymbol = opsymbols[os.symid]
-    add_op!(ls, instr[i], ops, nopsv, expandedv, i, mrefs, opsymbol, elementbytes)
+    add_op!(
+      ls,
+      instr[i],
+      ops,
+      nopsv,
+      expandedv,
+      i,
+      mrefs,
+      opsymbol,
+      elementbytes
+    )
   end
   add_parents_to_ops!(ls, ops, constoffset)
   # for op ∈ operations(ls)
@@ -766,7 +844,11 @@ typeeltype(::Type{VectorizationBase.FastRange{T,F,S,O}}) where {T,F,S,O} = T
 typeeltype(::Type{T}) where {T<:Real} = T
 # typeeltype(::Any) = Int8
 
-function add_array_symbols!(ls::LoopSet, arraysymbolinds::Vector{Symbol}, offset::Int)
+function add_array_symbols!(
+  ls::LoopSet,
+  arraysymbolinds::Vector{Symbol},
+  offset::Int
+)
   for as ∈ arraysymbolinds
     pushpreamble!(ls, Expr(:(=), as, extract_varg((offset += 1))))
   end
@@ -780,8 +862,13 @@ function extract_external_functions!(ls::LoopSet, offset::Int, vargs)
         offset += 1
         instr_new = get(FUNCTIONSYMBOLS, vargs[offset], instr)
         if instr_new === instr
-          extractf =
-            Expr(:call, GlobalRef(Core, :getfield), Symbol("#vargs#"), offset, false)
+          extractf = Expr(
+            :call,
+            GlobalRef(Core, :getfield),
+            Symbol("#vargs#"),
+            offset,
+            false
+          )
           pushpreamble!(ls, Expr(:(=), instr.instr, extractf))
         else
           op.instruction = instr_new
@@ -791,12 +878,18 @@ function extract_external_functions!(ls::LoopSet, offset::Int, vargs)
   end
   offset
 end
-outer_reduct_init_typename(op::Operation) = Symbol(mangledvar(op), "#or#init#type#")
+outer_reduct_init_typename(op::Operation) =
+  Symbol(mangledvar(op), "#or#init#type#")
 function extract_outerreduct_types!(ls::LoopSet, offset::Int, vargs)
   # for op
   for or ∈ ls.outer_reductions
-    extractt =
-      Expr(:call, GlobalRef(Core, :getfield), Symbol("#vargs#"), (offset += 1), false)
+    extractt = Expr(
+      :call,
+      GlobalRef(Core, :getfield),
+      Symbol("#vargs#"),
+      (offset += 1),
+      false
+    )
     op = operations(ls)[or]
     if instruction(op).instr ≢ :ifelse
       pushpreamble!(ls, Expr(:(=), outer_reduct_init_typename(op), extractt))
@@ -805,7 +898,11 @@ function extract_outerreduct_types!(ls::LoopSet, offset::Int, vargs)
       pushpreamble!(ls, Expr(:(=), opextractbase, extractt))
       pushpreamble!(
         ls,
-        Expr(:(=), outer_reduct_init_typename(op), Expr(:call, lv(:typeof), opextractbase)),
+        Expr(
+          :(=),
+          outer_reduct_init_typename(op),
+          Expr(:call, lv(:typeof), opextractbase)
+        )
       )
     end
   end
@@ -819,10 +916,13 @@ function sizeofeltypes(v)::Int
   T = typeeltype(v[1])
   sz =
     if (
-      VectorizationBase.simd_integer_register_size() != VectorizationBase.register_size()
+      VectorizationBase.simd_integer_register_size() !=
+      VectorizationBase.register_size()
     ) && T <: Integer # hack
-      (VectorizationBase.register_size() ÷ VectorizationBase.simd_integer_register_size()) *
-      sizeof(T)
+      (
+        VectorizationBase.register_size() ÷
+        VectorizationBase.simd_integer_register_size()
+      ) * sizeof(T)
     else
       sz = sizeof(T)
     end
@@ -830,7 +930,8 @@ function sizeofeltypes(v)::Int
     Ttemp = typeeltype(v[i])
     szᵢ =
       if (
-        VectorizationBase.simd_integer_register_size() != VectorizationBase.register_size()
+        VectorizationBase.simd_integer_register_size() !=
+        VectorizationBase.register_size()
       ) && T <: Integer # hack
         (
           VectorizationBase.register_size() ÷
@@ -857,9 +958,12 @@ function avx_loopset!(
   AM::Vector{Any},
   LPSYM::Vector{Any},
   LB::Core.SimpleVector,
-  vargs::Core.SimpleVector,
+  vargs::Core.SimpleVector
 )
-  pushpreamble!(ls, :((var"#loop#bounds#", var"#vargs#") = var"#lv#tuple#args#"))
+  pushpreamble!(
+    ls,
+    :((var"#loop#bounds#", var"#vargs#") = var"#lv#tuple#args#")
+  )
   add_loops!(ls, LPSYM, LB)
   resize!(ls.loop_order, ls.loopsymbol_offsets[end])
   arraysymbolinds = gen_array_syminds(AM)
@@ -869,13 +973,30 @@ function avx_loopset!(
 
   resize!(ls.loopindexesbit, length(ls.loops))
   fill!(ls.loopindexesbit, false)
-  mrefs, elementbytes =
-    create_mrefs!(ls, arf, arraysymbolinds, opsymbols, nopsv, expandedv, vargs[1])
+  mrefs, elementbytes = create_mrefs!(
+    ls,
+    arf,
+    arraysymbolinds,
+    opsymbols,
+    nopsv,
+    expandedv,
+    vargs[1]
+  )
   for mref ∈ mrefs
     push!(ls.includedactualarrays, vptr(mref))
   end
   # extra args extraction
-  extractind = add_ops!(ls, instr, ops, mrefs, elementbytes, opsymbols, 1, nopsv, expandedv)
+  extractind = add_ops!(
+    ls,
+    instr,
+    ops,
+    mrefs,
+    elementbytes,
+    opsymbols,
+    1,
+    nopsv,
+    expandedv
+  )
   extractind = process_metadata!(ls, AM, extractind)
   extractind = add_array_symbols!(ls, arraysymbolinds, extractind)
   extractind = extract_external_functions!(ls, extractind, vargs)
@@ -884,7 +1005,7 @@ function avx_loopset!(
 end
 function avx_body(
   ls::LoopSet,
-  UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt,Int,Bool},
+  UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt,Int,Bool}
 )
   inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt = UNROLL
   q =
@@ -902,10 +1023,18 @@ function _turbo_loopset_debug(
   ::Val{ARF},
   ::Val{AM},
   ::Val{LPSYM},
-  _vargs::Tuple{LB,V},
+  _vargs::Tuple{LB,V}
 ) where {UNROLL,OPS,ARF,AM,LPSYM,LB,V}
   # @show OPS ARF AM LPSYM _vargs
-  _turbo_loopset(OPS, ARF, AM, LPSYM, _vargs[1].parameters, V.parameters, UNROLL)
+  _turbo_loopset(
+    OPS,
+    ARF,
+    AM,
+    LPSYM,
+    _vargs[1].parameters,
+    V.parameters,
+    UNROLL
+  )
 end
 function tovector(@nospecialize(t))
   v = Vector{Any}(undef, length(t))
@@ -926,7 +1055,7 @@ function _turbo_loopset(
   @nospecialize(LPSYMsv),
   LBsv::Core.SimpleVector,
   vargs::Core.SimpleVector,
-  UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt,Int,Bool},
+  UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt,Int,Bool}
 )
   nops = length(OPSsv) ÷ 3
   instr = Instruction[Instruction(OPSsv[3i+1], OPSsv[3i+2]) for i ∈ 0:nops-1]
@@ -940,27 +1069,37 @@ function _turbo_loopset(
   for i ∈ eachindex(arsv)
     arsv[i] = ARFsv[i]
   end
-  avx_loopset!(ls, instr, ops, arsv, tovector(AMsv), tovector(LPSYMsv), LBsv, vargs)
+  avx_loopset!(
+    ls,
+    instr,
+    ops,
+    arsv,
+    tovector(AMsv),
+    tovector(LPSYMsv),
+    LBsv,
+    vargs
+  )
 end
 
 """
     _turbo_!(unroll, ops, arf, am, lpsym, lb, vargs...)
 
 Execute an `@turbo` block. The block's code is represented via the arguments:
-- `unroll` is `Val((u₁,u₂))` and specifies the loop unrolling factor(s).
-  These values may be supplied manually via the `unroll` keyword
-  of [`@turbo`](@ref).
-- `ops` is `Tuple{mod1, sym1, op1, mod2, sym2, op2...}` encoding the operations of the loop.
-  `mod` and `sym` encode the module and symbol of the called function; `op` is an [`OperationStruct`](@ref)
-  encoding the details of the operation.
-- `arf` is `Tuple{arf1, arf2...}`, where each `arfi` is an [`ArrayRefStruct`](@ref) encoding
-  an array reference.
-- `am` contains miscellaneous data about the LoopSet (see `process_metadata!`)
-- `lpsym` is `Tuple{:i,:j,...}`, a Tuple of the "loop symbols", i.e. the item variable `i` in `for i ∈ iter`
-- `lb` is `Tuple{RngTypei,RngTypej,...}`, a Tuple encoding syntactically-knowable information about
-  the iterators corresponding to `lpsym`. For example, in `for i ∈ 1:n`, the `1:n` would be encoded with
-  `StaticLowerUnitRange(1)` because the lower bound of the iterator can be determined to be 1.
-- `vargs...` holds the encoded pointers of all the arrays (see `VectorizationBase`'s various pointer types).
+
+  - `unroll` is `Val((u₁,u₂))` and specifies the loop unrolling factor(s).
+    These values may be supplied manually via the `unroll` keyword
+    of [`@turbo`](@ref).
+  - `ops` is `Tuple{mod1, sym1, op1, mod2, sym2, op2...}` encoding the operations of the loop.
+    `mod` and `sym` encode the module and symbol of the called function; `op` is an [`OperationStruct`](@ref)
+    encoding the details of the operation.
+  - `arf` is `Tuple{arf1, arf2...}`, where each `arfi` is an [`ArrayRefStruct`](@ref) encoding
+    an array reference.
+  - `am` contains miscellaneous data about the LoopSet (see `process_metadata!`)
+  - `lpsym` is `Tuple{:i,:j,...}`, a Tuple of the "loop symbols", i.e. the item variable `i` in `for i ∈ iter`
+  - `lb` is `Tuple{RngTypei,RngTypej,...}`, a Tuple encoding syntactically-knowable information about
+    the iterators corresponding to `lpsym`. For example, in `for i ∈ 1:n`, the `1:n` would be encoded with
+    `StaticLowerUnitRange(1)` because the lower bound of the iterator can be determined to be 1.
+  - `vargs...` holds the encoded pointers of all the arrays (see `VectorizationBase`'s various pointer types).
 """
 @generated function _turbo_!(
   ::Val{var"#UNROLL#"},
@@ -969,7 +1108,7 @@ Execute an `@turbo` block. The block's code is represented via the arguments:
   ::Val{var"#AM#"},
   ::Val{var"#LPSYM#"},
   ::Val{Tuple{var"#LB#",var"#V#"}},
-  var"#flattened#var#arguments#"::Vararg{Any,var"#num#vargs#"},
+  var"#flattened#var#arguments#"::Vararg{Any,var"#num#vargs#"}
 ) where {
   var"#UNROLL#",
   var"#OPS#",
@@ -978,7 +1117,7 @@ Execute an `@turbo` block. The block's code is represented via the arguments:
   var"#LPSYM#",
   var"#LB#",
   var"#V#",
-  var"#num#vargs#",
+  var"#num#vargs#"
 }
   # 1 + 1 # Irrelevant line you can comment out/in to force recompilation...
   ls = _turbo_loopset(
@@ -988,19 +1127,22 @@ Execute an `@turbo` block. The block's code is represented via the arguments:
     var"#LPSYM#",
     var"#LB#".parameters,
     var"#V#".parameters,
-    var"#UNROLL#",
+    var"#UNROLL#"
   )
   pushfirst!(
     ls.preamble.args,
     :(
-      var"#lv#tuple#args#" =
-        reassemble_tuple(Tuple{var"#LB#",var"#V#"}, var"#flattened#var#arguments#")
-    ),
+      var"#lv#tuple#args#" = reassemble_tuple(
+        Tuple{var"#LB#",var"#V#"},
+        var"#flattened#var#arguments#"
+      )
+    )
   )
   post = hoist_constant_memory_accesses!(ls)
   # q = @show(avx_body(ls, var"#UNROLL#")); post === ls.preamble ? q : Expr(:block, q, post)
   q = if (var"#UNROLL#"[10] > 1) && length(var"#LPSYM#") == length(ls.loops)
-    inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt, wca, safe = var"#UNROLL#"
+    inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt, wca, safe =
+      var"#UNROLL#"
     # wrap in `var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#"` in `Expr` to homogenize types
     avx_threads_expr(
       ls,
@@ -1009,7 +1151,7 @@ Execute an `@turbo` block. The block's code is represented via the arguments:
       :(Val{$(var"#OPS#")}()),
       :(Val{$(var"#ARF#")}()),
       :(Val{$(var"#AM#")}()),
-      :(Val{$(var"#LPSYM#")}()),
+      :(Val{$(var"#LPSYM#")}())
     )
   else
     # Main.BODY[] = avx_body(ls, var"#UNROLL#")
@@ -1026,7 +1168,7 @@ end
   ::Val{var"#AM#"},
   ::Val{var"#LPSYM#"},
   ::Val{Tuple{var"#LB#",var"#V#"}},
-  var"#flattened#var#arguments#"::Tuple{Vararg{Any,var"#num#vargs#"}},
+  var"#flattened#var#arguments#"::Tuple{Vararg{Any,var"#num#vargs#"}}
 ) where {
   var"#UNROLL#",
   var"#OPS#",
@@ -1035,7 +1177,7 @@ end
   var"#LPSYM#",
   var"#LB#",
   var"#V#",
-  var"#num#vargs#",
+  var"#num#vargs#"
 }
   1 + 1 # Irrelevant line you can comment out/in to force recompilation...
   ls = _turbo_loopset(
@@ -1045,19 +1187,22 @@ end
     var"#LPSYM#",
     var"#LB#".parameters,
     var"#V#".parameters,
-    var"#UNROLL#",
+    var"#UNROLL#"
   )
   pushfirst!(
     ls.preamble.args,
     :(
-      var"#lv#tuple#args#" =
-        reassemble_tuple(Tuple{var"#LB#",var"#V#"}, var"#flattened#var#arguments#")
-    ),
+      var"#lv#tuple#args#" = reassemble_tuple(
+        Tuple{var"#LB#",var"#V#"},
+        var"#flattened#var#arguments#"
+      )
+    )
   )
   post = hoist_constant_memory_accesses!(ls)
   # q = @show(avx_body(ls, var"#UNROLL#")); post === ls.preamble ? q : Expr(:block, q, post)
   q = if (var"#UNROLL#"[10] > 1) && length(var"#LPSYM#") == length(ls.loops)
-    inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt, wca, safe = var"#UNROLL#"
+    inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt, wca, safe =
+      var"#UNROLL#"
     # wrap in `var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#"` in `Expr` to homogenize types
     avx_threads_expr(
       ls,
@@ -1066,7 +1211,7 @@ end
       :(Val{$(var"#OPS#")}()),
       :(Val{$(var"#ARF#")}()),
       :(Val{$(var"#AM#")}()),
-      :(Val{$(var"#LPSYM#")}()),
+      :(Val{$(var"#LPSYM#")}())
     )
   else
     # Main.BODY[] = avx_body(ls, var"#UNROLL#")
diff --git a/src/simdfunctionals/filter.jl b/src/simdfunctionals/filter.jl
index 00c746602..39ab11ce7 100644
--- a/src/simdfunctionals/filter.jl
+++ b/src/simdfunctionals/filter.jl
@@ -1,5 +1,9 @@
 
-function vfilter!(f::F, x::Vector{T}, y::AbstractArray{T}) where {F,T<:NativeTypes}
+function vfilter!(
+  f::F,
+  x::Vector{T},
+  y::AbstractArray{T}
+) where {F,T<:NativeTypes}
   W, Wshift = VectorizationBase.pick_vector_width_shift(T)
   N = length(y)
   Nrep = N >>> Wshift
@@ -13,20 +17,31 @@ function vfilter!(f::F, x::Vector{T}, y::AbstractArray{T}) where {F,T<:NativeTyp
     ptr_x = pointer(x)
     ptr_y = pointer(y)
     for _ ∈ 1:Nrep
-      vy = VectorizationBase.__vload(ptr_y, zero_index, False(), register_size())
+      vy =
+        VectorizationBase.__vload(ptr_y, zero_index, False(), register_size())
       mask = f(vy)
       VectorizationBase.compressstore!(
         gep(ptr_x, VectorizationBase.lazymul(st, j)),
         vy,
-        mask,
+        mask
       )
       ptr_y = gep(ptr_y, incr)
       j = vadd_nw(j, count_ones(mask))
     end
     rem_mask = VectorizationBase.mask(T, Nrem)
-    vy = VectorizationBase.__vload(ptr_y, zero_index, rem_mask, False(), register_size())
+    vy = VectorizationBase.__vload(
+      ptr_y,
+      zero_index,
+      rem_mask,
+      False(),
+      register_size()
+    )
     mask = rem_mask & f(vy)
-    VectorizationBase.compressstore!(gep(ptr_x, VectorizationBase.lazymul(st, j)), vy, mask)
+    VectorizationBase.compressstore!(
+      gep(ptr_x, VectorizationBase.lazymul(st, j)),
+      vy,
+      mask
+    )
     j = vadd_nw(j, count_ones(mask))
     Base._deleteend!(x, N - j) # resize!(x, j)
   end
diff --git a/src/simdfunctionals/map.jl b/src/simdfunctionals/map.jl
index 7e63e4073..3f22f8e2c 100644
--- a/src/simdfunctionals/map.jl
+++ b/src/simdfunctionals/map.jl
@@ -6,7 +6,7 @@ function setup_vmap!(
   f::F,
   y::AbstractArray{T},
   ::Val{true},
-  args::Vararg{AbstractArray,A},
+  args::Vararg{AbstractArray,A}
 ) where {F,T<:Base.HWReal,A}
   N = length(y)
   ptry = VectorizationBase.zstridedpointer(y)
@@ -18,7 +18,8 @@ function setup_vmap!(
   @assert iszero(uintptry & (sizeof(T) - 1)) "The destination vector (`dest`) must be aligned to `sizeof(eltype(dest)) == $(sizeof(T))` bytes."
   alignment = uintptry & (register_size() - 1)
   if alignment > 0
-    i = reinterpret(Int, W - (alignment >>> VectorizationBase.intlog2(sizeof(T))))
+    i =
+      reinterpret(Int, W - (alignment >>> VectorizationBase.intlog2(sizeof(T))))
     m = mask(T, i)
     if N < i
       m &= mask(T, N & (W - 1))
@@ -31,7 +32,7 @@ function setup_vmap!(
       False(),
       True(),
       False(),
-      register_size(),
+      register_size()
     )
     gesp(ptry, (i,)), map1(gesp, ptrargs, (i,)), N - i
   else
@@ -50,10 +51,17 @@ function map1_quote(K::Int, args::Int)
   end
   Expr(:block, Expr(:meta, :inline), t)
 end
-@generated map1(f::F, x_1::Tuple{Vararg{Any,K}}, x_2) where {F,K} = map1_quote(K, 2)
-@generated map1(f::F, x_1::Tuple{Vararg{Any,K}}, x_2, x_3) where {F,K} = map1_quote(K, 3)
+@generated map1(f::F, x_1::Tuple{Vararg{Any,K}}, x_2) where {F,K} =
+  map1_quote(K, 2)
+@generated map1(f::F, x_1::Tuple{Vararg{Any,K}}, x_2, x_3) where {F,K} =
+  map1_quote(K, 3)
 
-@inline function setup_vmap!(f, y, ::Val{false}, args::Vararg{AbstractArray,A}) where {A}
+@inline function setup_vmap!(
+  f,
+  y,
+  ::Val{false},
+  args::Vararg{AbstractArray,A}
+) where {A}
   N = length(y)
   ptry = VectorizationBase.zstridedpointer(y)
   ptrargs = map(VectorizationBase.zstridedpointer, args)
@@ -64,7 +72,7 @@ function vmap_singlethread!(
   f::F,
   y::AbstractArray{T},
   ::Val{NonTemporal},
-  args::Vararg{AbstractArray,A},
+  args::Vararg{AbstractArray,A}
 ) where {F,T<:NativeTypes,A,NonTemporal}
   ptry, ptrargs, N = setup_vmap!(f, y, Val{NonTemporal}(), args...)
   _vmap_singlethread!(f, ptry, Zero(), N, Val{NonTemporal}(), ptrargs)
@@ -76,11 +84,11 @@ function _vmap_singlethread!(
   start,
   N,
   ::Val{NonTemporal},
-  ptrargs::Tuple{Vararg{Any,A}},
+  ptrargs::Tuple{Vararg{Any,A}}
 ) where {F,T,NonTemporal,A}
   i = convert(Int, start)
   V = VectorizationBase.pick_vector_width(
-    promote_type(T, reduce(promote_type, map(eltype, ptrargs))),
+    promote_type(T, reduce(promote_type, map(eltype, ptrargs)))
   )
   W = unwrap(V)
   UNROLL = 4
@@ -116,22 +124,42 @@ function _vmap_singlethread!(
     m = mask(StaticInt(W), N & (W - 1))
     vfinal = f(map1(vload, ptrargs, (MM{W}(i),), m)...)
     if NonTemporal
-      _vstore!(ptry, vfinal, (MM{W}(i),), m, True(), True(), False(), register_size())
+      _vstore!(
+        ptry,
+        vfinal,
+        (MM{W}(i),),
+        m,
+        True(),
+        True(),
+        False(),
+        register_size()
+      )
     else
-      _vstore!(ptry, vfinal, (MM{W}(i),), m, False(), True(), False(), register_size())
+      _vstore!(
+        ptry,
+        vfinal,
+        (MM{W}(i),),
+        m,
+        False(),
+        True(),
+        False(),
+        register_size()
+      )
     end
   end
   # end
   nothing
 end
 
-abstract type AbstractVmapClosure{NonTemporal,F,D,N,A<:Tuple{Vararg{Any,N}}} <: Function end
-struct VmapClosure{NonTemporal,F,D,N,A} <: AbstractVmapClosure{NonTemporal,F,D,N,A}
+abstract type AbstractVmapClosure{NonTemporal,F,D,N,A<:Tuple{Vararg{Any,N}}} <:
+              Function end
+struct VmapClosure{NonTemporal,F,D,N,A} <:
+       AbstractVmapClosure{NonTemporal,F,D,N,A}
   f::F
   function VmapClosure{NonTemporal}(
     f::F,
     ::D,
-    ::A,
+    ::A
   ) where {NonTemporal,F,D,N,A<:Tuple{Vararg{Any,N}}}
     new{NonTemporal,F,D,N,A}(f)
   end
@@ -141,7 +169,9 @@ end
 # @generated function (::VmapKnownClosure{NonTemporal,F,D,N,A})(p::Ptr{UInt})  where {NonTemporal,F,D,N,A}
 #     :(_vmap_thread_call!($(F.instance), p, $D, $A, Val{$NonTemporal}()))
 # end
-function (m::VmapClosure{NonTemporal,F,D,N,A})(p::Ptr{UInt}) where {NonTemporal,F,D,N,A}
+function (m::VmapClosure{NonTemporal,F,D,N,A})(
+  p::Ptr{UInt}
+) where {NonTemporal,F,D,N,A}
   (offset, dest) = ThreadingUtilities.load(p, D, 2 * sizeof(UInt))
   (offset, args) = ThreadingUtilities.load(p, A, offset)
 
@@ -181,7 +211,7 @@ end
     ptry,
     ptrargs,
     start,
-    stop,
+    stop
   ) do p, cfunc, ptry, ptrargs, start, stop
     setup_thread_vmap!(p, cfunc, ptry, ptrargs, start, stop)
   end
@@ -191,7 +221,7 @@ end
   f::F,
   ptry::D,
   ptrargs::A,
-  ::Val{NonTemporal},
+  ::Val{NonTemporal}
 ) where {F,D<:StridedPointer,N,A<:Tuple{Vararg{Any,N}},NonTemporal}
   vmc = VmapClosure{NonTemporal}(f, ptry, ptrargs)
   @cfunction($vmc, Cvoid, (Ptr{UInt},))
@@ -201,7 +231,7 @@ function vmap_multithread!(
   f::F,
   y::AbstractArray{T},
   ::Val{NonTemporal},
-  args::Vararg{AbstractArray,A},
+  args::Vararg{AbstractArray,A}
 ) where {F,T,A,NonTemporal}
   W, Wshift = VectorizationBase.pick_vector_width_shift(T)
   ptry, ptrargs, N = setup_vmap!(f, y, Val{NonTemporal}(), args...)
@@ -254,16 +284,16 @@ end
   y::AbstractArray,
   ::Val{NonTemporal},
   ::Val{Threaded},
-  args::Vararg{AbstractArray,A},
+  args::Vararg{AbstractArray,A}
 ) where {F,A,NonTemporal,Threaded}
   gc_preserve_vmap_quote(NonTemporal, Threaded, A)
 end
 
-
 @inline _all_dense(t::Tuple{ArrayInterface.True}) = true
 @inline _all_dense(t::Tuple{ArrayInterface.True,ArrayInterface.True,Vararg}) =
   _all_dense(Base.tail(t))
-@inline _all_dense(t::Tuple{ArrayInterface.True,ArrayInterface.False,Vararg}) = false
+@inline _all_dense(t::Tuple{ArrayInterface.True,ArrayInterface.False,Vararg}) =
+  false
 @inline _all_dense(t::Tuple{ArrayInterface.False,Vararg}) = false
 @inline all_dense() = true
 @inline all_dense(t::NTuple{N}) where {N} = true
@@ -272,22 +302,26 @@ end
 @inline all_dense(
   A::AbstractArray,
   B::AbstractArray,
-  C::Vararg{AbstractArray,K},
+  C::Vararg{AbstractArray,K}
 ) where {K} = all_dense(A) && all_dense(B, C...)
 
 """
     vmap!(f, destination, a::AbstractArray)
     vmap!(f, destination, a::AbstractArray, b::AbstractArray, ...)
+
 Vectorized-`map!`, applying `f` to batches of elements of `a` (or paired batches of `a`, `b`, ...)
 and storing the result in `destination`.
 
 The function `f` must accept `VectorizationBase.AbstractSIMD` inputs. Ideally, all this requires
 is making sure that `f` is defined to be agnostic with respect to input types, but if the function `f`
 contains branches or loops, more work will probably be needed. For example, a function
+
 ```julia
 f(x) = x > 0 ? log(x) : inv(x)
 ```
+
 can be rewritten into
+
 ```julia
 using IfElse
 f(x) = IfElse.ifelse(x > 0, log(x), inv(x))
@@ -298,7 +332,7 @@ function vmap!(
   y::AbstractArray,
   arg1::AbstractArray,
   arg2::AbstractArray,
-  args::Vararg{AbstractArray,A},
+  args::Vararg{AbstractArray,A}
 ) where {F,A}
   if check_args(y, arg1, arg2, args...) && all_dense(y, arg1, arg2, args...)
     gc_preserve_vmap!(f, y, Val{false}(), Val{false}(), arg1, arg2, args...)
@@ -314,12 +348,16 @@ function vmap!(f::F, y::AbstractArray, arg::AbstractArray) where {F}
   end
 end
 
-
 """
     vmapt!(::Function, dest, args...)
+
 A threaded variant of [`vmap!`](@ref).
 """
-function vmapt!(f::F, y::AbstractArray, args::Vararg{AbstractArray,A}) where {F,A}
+function vmapt!(
+  f::F,
+  y::AbstractArray,
+  args::Vararg{AbstractArray,A}
+) where {F,A}
   if check_args(y, args...) && all_dense(y, args...)
     gc_preserve_vmap!(f, y, Val{false}(), Val{true}(), args...)
   else
@@ -327,17 +365,18 @@ function vmapt!(f::F, y::AbstractArray, args::Vararg{AbstractArray,A}) where {F,
   end
 end
 
-
 """
     vmapnt!(::Function, dest, args...)
+
 This is a vectorized map implementation using nontemporal store operations. This means that the write operations to the destination will not go to the CPU's cache.
 If you will not immediately be reading from these values, this can improve performance because the writes won't pollute your cache. This can especially be the case if your arguments are very long.
+
 ```julia
 julia> using LoopVectorization, BenchmarkTools
 julia> x = rand(10^8); y = rand(10^8); z = similar(x);
-julia> f(x,y) = exp(-0.5abs2(x - y))
+julia> f(x, y) = exp(-0.5abs2(x - y))
 f (generic function with 1 method)
 julia> @benchmark map!(f, \$z, \$x, \$y)
 BenchmarkTools.Trial:
   memory estimate:  0 bytes
   allocs estimate:  0
@@ -349,33 +388,37 @@ BenchmarkTools.Trial:
   --------------
   samples:          12
   evals/sample:     1
 julia> @benchmark vmap!(f, \$z, \$x, \$y)
 BenchmarkTools.Trial:
   memory estimate:  0 bytes
   allocs estimate:  0
   --------------
   minimum time:     178.147 ms (0.00% GC)
   median time:      178.381 ms (0.00% GC)
   mean time:        178.430 ms (0.00% GC)
   maximum time:     179.054 ms (0.00% GC)
   --------------
   samples:          29
   evals/sample:     1
 julia> @benchmark vmapnt!(f, \$z, \$x, \$y)
 BenchmarkTools.Trial:
   memory estimate:  0 bytes
   allocs estimate:  0
   --------------
   minimum time:     144.183 ms (0.00% GC)
   median time:      144.338 ms (0.00% GC)
   mean time:        144.349 ms (0.00% GC)
   maximum time:     144.641 ms (0.00% GC)
   --------------
   samples:          35
   evals/sample:     1
 ```
 """
-function vmapnt!(f::F, y::AbstractArray, args::Vararg{AbstractArray,A}) where {F,A}
+function vmapnt!(
+  f::F,
+  y::AbstractArray,
+  args::Vararg{AbstractArray,A}
+) where {F,A}
   if check_args(y, args...) && all_dense(y, args...)
     gc_preserve_vmap!(f, y, Val{true}(), Val{false}(), args...)
   else
@@ -385,9 +409,14 @@ end
 
 """
     vmapntt!(::Function, dest, args...)
+
 A threaded variant of [`vmapnt!`](@ref).
 """
-function vmapntt!(f::F, y::AbstractArray, args::Vararg{AbstractArray,A}) where {F,A}
+function vmapntt!(
+  f::F,
+  y::AbstractArray,
+  args::Vararg{AbstractArray,A}
+) where {F,A}
   if check_args(y, args...) && all_dense(y, args...)
     gc_preserve_vmap!(f, y, Val{true}(), Val{true}(), args...)
   else
@@ -414,6 +443,7 @@ end
 """
     vmap(f, a::AbstractArray)
     vmap(f, a::AbstractArray, b::AbstractArray, ...)
+
 SIMD-vectorized `map`, applying `f` to each element of `a` (or paired elements of `a`, `b`, ...)
 and returning a new array.
 """
@@ -422,6 +452,7 @@ vmap(f::F, args::Vararg{Any,N}) where {F,N} = vmap_call(f, vmap!, args...)
 """
     vmapt(f, a::AbstractArray)
     vmapt(f, a::AbstractArray, b::AbstractArray, ...)
+
 A threaded variant of [`vmap`](@ref).
 """
 vmapt(f::F, args::Vararg{Any,N}) where {F,N} = vmap_call(f, vmapt!, args...)
@@ -429,6 +460,7 @@ vmapt(f::F, args::Vararg{Any,N}) where {F,N} = vmap_call(f, vmapt!, args...)
 """
     vmapnt(f, a::AbstractArray)
     vmapnt(f, a::AbstractArray, b::AbstractArray, ...)
+
 A "non-temporal" variant of [`vmap`](@ref). This can improve performance in cases where
 `destination` will not be needed soon.
 """
@@ -437,6 +469,7 @@ vmapnt(f::F, args::Vararg{Any,N}) where {F,N} = vmap_call(f, vmapnt!, args...)
 """
     vmapntt(f, a::AbstractArray)
     vmapntt(f, a::AbstractArray, b::AbstractArray, ...)
+
 A threaded variant of [`vmapnt`](@ref).
 """
 vmapntt(f::F, args::Vararg{Any,N}) where {F,N} = vmap_call(f, vmapntt!, args...)
diff --git a/src/simdfunctionals/mapreduce.jl b/src/simdfunctionals/mapreduce.jl
index ebb625c04..af7d3c094 100644
--- a/src/simdfunctionals/mapreduce.jl
+++ b/src/simdfunctionals/mapreduce.jl
@@ -1,20 +1,31 @@
 
 @inline vreduce(::typeof(+), v::VectorizationBase.AbstractSIMDVector) = vsum(v)
 @inline vreduce(::typeof(*), v::VectorizationBase.AbstractSIMDVector) = vprod(v)
-@inline vreduce(::typeof(max), v::VectorizationBase.AbstractSIMDVector) = vmaximum(v)
-@inline vreduce(::typeof(min), v::VectorizationBase.AbstractSIMDVector) = vminimum(v)
-@inline vreduce(op, v::VectorizationBase.AbstractSIMDVector) = vec_vreduce(op, v)
-@inline vec_reduce(op, v::VectorizationBase.AbstractSIMDVector) = vec_reduce(op, Vec(v))
+@inline vreduce(::typeof(max), v::VectorizationBase.AbstractSIMDVector) =
+  vmaximum(v)
+@inline vreduce(::typeof(min), v::VectorizationBase.AbstractSIMDVector) =
+  vminimum(v)
+@inline vreduce(op, v::VectorizationBase.AbstractSIMDVector) =
+  vec_vreduce(op, v)
+@inline vec_reduce(op, v::VectorizationBase.AbstractSIMDVector) =
+  vec_reduce(op, Vec(v))
 vec_vreduce(op, v::Vec{1}) = VectorizationBase.extractelement(v, 0)
 @inline function vec_vreduce(op, v::Vec{W}) where {W}
-  a = op(VectorizationBase.extractelement(v, 0), VectorizationBase.extractelement(v, 1))
+  a = op(
+    VectorizationBase.extractelement(v, 0),
+    VectorizationBase.extractelement(v, 1)
+  )
   for i ∈ 2:W-1
     a = op(a, VectorizationBase.extractelement(v, i))
   end
   a
 end
 
-function mapreduce_simple(f::F, op::OP, args::Vararg{AbstractArray,A}) where {F,OP,A}
+function mapreduce_simple(
+  f::F,
+  op::OP,
+  args::Vararg{AbstractArray,A}
+) where {F,OP,A}
   ptrargs = ntuple(a -> pointer(args[a]), Val(A))
   N = length(first(args))
   iszero(N) && throw("Length of vector is 0!")
@@ -29,16 +40,15 @@ function mapreduce_simple(f::F, op::OP, args::Vararg{AbstractArray,A}) where {F,
           ptrargs,
           VectorizationBase.lazymul.(st, i),
           False(),
-          register_size(),
-        )...,
-      ),
+          register_size()
+        )...
+      )
     )
     i += 1
   end
   a_0
 end
 
-
 """
     vmapreduce(f, op, A::DenseArray...)
 
@@ -48,7 +58,7 @@ Vectorized version of `mapreduce`. Applies `f` to each element of the arrays `A`
   f::F,
   op::OP,
   arg1::AbstractArray{T},
-  args::Vararg{AbstractArray{T},A},
+  args::Vararg{AbstractArray{T},A}
 ) where {F,OP,T<:NativeTypes,A}
   if !(check_args(arg1, args...) && all_dense(arg1, args...))
     return mapreduce(f, op, arg1, args...)
@@ -68,7 +78,7 @@ end
   ::StaticInt{W},
   N,
   ::Type{T},
-  args::Vararg{AbstractArray{<:NativeTypes},A},
+  args::Vararg{AbstractArray{<:NativeTypes},A}
 ) where {F,OP,A,W,T}
   ptrargs = VectorizationBase.zero_offsets.(stridedpointer.(args))
   if N ≥ 4W
@@ -111,7 +121,7 @@ At most one dimension may be supplied as kwarg.
 for (op, init) in zip((:+, :max, :min), (:zero, :typemin, :typemax))
   @eval @inline function vreduce(::typeof($op), arg; dims = nothing)
     if !(check_args(arg) && all_dense(arg))
-      return reduce($op, arg, dims = dims)
+      return reduce($op, arg; dims = dims)
     end
     dims === nothing && return _vreduce($op, arg)
     isone(ndims(arg)) && return [_vreduce($op, arg)]
@@ -132,7 +142,14 @@ for (op, init) in zip((:+, :max, :min), (:zero, :typemin, :typemax))
     end
   end
 
-  @eval @inline function _vreduce_dims!(out, ::typeof($op), Rpre, is, Rpost, arg)
+  @eval @inline function _vreduce_dims!(
+    out,
+    ::typeof($op),
+    Rpre,
+    is,
+    Rpost,
+    arg
+  )
     s = $init(first(arg))
     @turbo for Ipost in Rpost, Ipre in Rpre
       accum = s
diff --git a/src/simdfunctionals/vmap_grad_forwarddiff.jl b/src/simdfunctionals/vmap_grad_forwarddiff.jl
index 902cae420..39ef12c31 100644
--- a/src/simdfunctionals/vmap_grad_forwarddiff.jl
+++ b/src/simdfunctionals/vmap_grad_forwarddiff.jl
@@ -20,7 +20,7 @@ end
   ∂p::Tuple{Vararg{AbstractStridedPointer,A}},
   p::AbstractStridedPointer,
   ∂v,
-  im::Vararg{Any,N},
+  im::Vararg{Any,N}
 ) where {A,N}
   quote
     $(Expr(:meta, :inline))
diff --git a/src/simdfunctionals/vmap_grad_rrule.jl b/src/simdfunctionals/vmap_grad_rrule.jl
index 84fa8ebd8..18f10ae19 100644
--- a/src/simdfunctionals/vmap_grad_rrule.jl
+++ b/src/simdfunctionals/vmap_grad_rrule.jl
@@ -35,7 +35,7 @@ function ∂vmap_singlethread!(
   f::F,
   ∂y::Tuple{Vararg{DenseArray{T},A}},
   y::DenseArray{T},
-  args::Vararg{DenseArray{<:Base.HWReal},A},
+  args::Vararg{DenseArray{<:Base.HWReal},A}
 ) where {F,T<:Base.HWReal,A}
   N = length(y)
   ptry = VectorizationBase.zero_offsets(stridedpointer(y))
@@ -65,13 +65,12 @@ function ∂vmap_singlethread!(
       ptry,
       f(init_dual(vload.(ptrargs, ((MM{W}(i),),), m))...),
       (MM{W}(i),),
-      m,
+      m
     )
   end
   nothing
 end
 
-
 struct SIMDMapBack{K,T<:Tuple{Vararg{Any,K}}}
   jacs::T
 end
@@ -94,14 +93,22 @@ end
   end
 end
 
-function ChainRulesCore.rrule(::typeof(vmap), f::F, args::Vararg{Any,K}) where {F,K}
+function ChainRulesCore.rrule(
+  ::typeof(vmap),
+  f::F,
+  args::Vararg{Any,K}
+) where {F,K}
   out = similar(first(args))
   jacs = map(similar, args)
   ∂vmap_singlethread!(f, jacs, out, args...)
   out, SIMDMapBack(jacs)
 end
 for f in (:vmapt, :vmapnt, :vmapntt)
-  @eval function ChainRulesCore.rrule(::typeof($f), f::F, args::Vararg{Any,K}) where {F,K}
+  @eval function ChainRulesCore.rrule(
+    ::typeof($f),
+    f::F,
+    args::Vararg{Any,K}
+  ) where {F,K}
     ChainRulesCore.rrule(typeof(vmap), f, args...)
   end
 end
diff --git a/src/transforms.jl b/src/transforms.jl
index 7f522cd66..4d3a6643a 100644
--- a/src/transforms.jl
+++ b/src/transforms.jl
@@ -33,9 +33,9 @@ function hoist_constant_vload!(ls::LoopSet, op::Operation)
       parents(op),
       loopdependencies(op),
       reduceddependencies(op),
-      name(op),
+      name(op)
     ),
-    elementbytes,
+    elementbytes
   )
 end
 
@@ -53,8 +53,6 @@ function return_empty_reductinit(op::Operation, var::Symbol)
   return op
 end
 
-
-
 function constant_symbol!(ls::LoopSet, op::Operation)
   # hack
   # relowers, but should make it work
@@ -74,7 +72,10 @@ function constant_symbol!(ls::LoopSet, op::Operation)
     if intsz == 1
       pushpreamble!(ls, Expr(:(=), symname, intval % Bool))
     else
-      pushpreamble!(ls, Expr(:(=), symname, sizeequivalent_symint_expr(intval, signed)))
+      pushpreamble!(
+        ls,
+        Expr(:(=), symname, sizeequivalent_symint_expr(intval, signed))
+      )
     end
     return symname
   end
@@ -82,7 +83,11 @@ function constant_symbol!(ls::LoopSet, op::Operation)
     (idcheck ≢ nothing) && ((idcheck == id) && continue)
     pushpreamble!(
       ls,
-      Expr(:(=), symname, Expr(:call, lv(:sizeequivalentfloat), ELTYPESYMBOL, floatval)),
+      Expr(
+        :(=),
+        symname,
+        Expr(:call, lv(:sizeequivalentfloat), ELTYPESYMBOL, floatval)
+      )
     )
     return symname
   end
@@ -92,15 +97,24 @@ function constant_symbol!(ls::LoopSet, op::Operation)
     if typ == IntOrFloat
       pushpreamble!(ls, Expr(:(=), symname, Expr(:call, :zero, ELTYPESYMBOL)))
     elseif typ == HardInt
-      pushpreamble!(ls, Expr(:(=), symname, Expr(:call, lv(:zerointeger), ELTYPESYMBOL)))
+      pushpreamble!(
+        ls,
+        Expr(:(=), symname, Expr(:call, lv(:zerointeger), ELTYPESYMBOL))
+      )
     else#if typ == HardFloat
-      pushpreamble!(ls, Expr(:(=), symname, Expr(:call, lv(:zerofloat), ELTYPESYMBOL)))
+      pushpreamble!(
+        ls,
+        Expr(:(=), symname, Expr(:call, lv(:zerofloat), ELTYPESYMBOL))
+      )
     end
     return symname
   end
   for (id, f) ∈ ls.preamble_funcofeltypes
     (idcheck ≢ nothing) && ((idcheck == id) && continue)
-    pushpreamble!(ls, Expr(:(=), symname, Expr(:call, reduction_zero(f), ELTYPESYMBOL)))
+    pushpreamble!(
+      ls,
+      Expr(:(=), symname, Expr(:call, reduction_zero(f), ELTYPESYMBOL))
+    )
     return symname
   end
   throw("Constant operation symbol not found.")
@@ -124,7 +138,7 @@ function hoist_constant_store!(q::Expr, ls::LoopSet, op::Operation)
   # @show last(ls.preamble.args)
   pushpreamble!(
     ls,
-    Expr(:(=), outer_reduct_init_typename(opr), Expr(:call, lv(:typeof), init)),
+    Expr(:(=), outer_reduct_init_typename(opr), Expr(:call, lv(:typeof), init))
   )
   qpre = Expr(:block)
   push!(
@@ -133,8 +147,8 @@ function hoist_constant_store!(q::Expr, ls::LoopSet, op::Operation)
       :call,
       lv(:unsafe_store!),
       Expr(:call, lv(:pointer), op.ref.ptr),
-      outer_reduction_to_scalar_reduceq!(qpre, opr, init),
-    ),
+      outer_reduction_to_scalar_reduceq!(qpre, opr, init)
+    )
   )
   length(qpre.args) == 0 || pushpreamble!(ls, qpre) # creating `Expr` and pushing because `outer_reduction_to_scalar_reduceq!` uses `pushfirst!(q.args`, and we don't want it at the start of the preamble
   return nothing
diff --git a/src/user_api_conveniences.jl b/src/user_api_conveniences.jl
index d963faf1e..c4bcd59ed 100644
--- a/src/user_api_conveniences.jl
+++ b/src/user_api_conveniences.jl
@@ -11,7 +11,6 @@ const GEMMLOOPSET = loopset(:(
   end
 ));
 
-
 # function matmul_params(rs::Int, rc::Int, cls::Int)
 #     set_hw!(GEMMLOOPSET, rs, rc, cls, Int(cache_size(StaticInt(1))), Int(cache_size(StaticInt(2))), Int(cache_size(StaticInt(3))))
 #     order = choose_order(GEMMLOOPSET)
@@ -24,23 +23,41 @@ function matmul_params(
   M = nothing,
   K = nothing,
   N = nothing,
-  W = 0,
+  W = 0
 )
   set_hw!(GEMMLOOPSET, rs, rc, cls)
   if N ≢ nothing
     nloop = GEMMLOOPSET.loops[1]
-    GEMMLOOPSET.loops[1] =
-      Loop(:n, MaybeKnown(1), MaybeKnown(N), MaybeKnown(1), nloop.rangesym, nloop.lensym)
+    GEMMLOOPSET.loops[1] = Loop(
+      :n,
+      MaybeKnown(1),
+      MaybeKnown(N),
+      MaybeKnown(1),
+      nloop.rangesym,
+      nloop.lensym
+    )
   end
   if M ≢ nothing
     mloop = GEMMLOOPSET.loops[2]
-    GEMMLOOPSET.loops[2] =
-      Loop(:m, MaybeKnown(1), MaybeKnown(M), MaybeKnown(1), mloop.rangesym, mloop.lensym)
+    GEMMLOOPSET.loops[2] = Loop(
+      :m,
+      MaybeKnown(1),
+      MaybeKnown(M),
+      MaybeKnown(1),
+      mloop.rangesym,
+      mloop.lensym
+    )
   end
   if K ≢ nothing
     kloop = GEMMLOOPSET.loops[3]
-    GEMMLOOPSET.loops[3] =
-      Loop(:k, MaybeKnown(1), MaybeKnown(K), MaybeKnown(1), kloop.rangesym, kloop.lensym)
+    GEMMLOOPSET.loops[3] = Loop(
+      :k,
+      MaybeKnown(1),
+      MaybeKnown(K),
+      MaybeKnown(1),
+      kloop.rangesym,
+      kloop.lensym
+    )
   end
   GEMMLOOPSET.vector_width = W
   order = choose_order(GEMMLOOPSET)
@@ -52,16 +69,17 @@ end
 @generated function matmul_params(
   ::StaticInt{RS},
   ::StaticInt{RC},
-  ::StaticInt{CLS},
+  ::StaticInt{CLS}
 ) where {RS,RC,CLS}
   mᵣ, nᵣ = matmul_params(RS, RC, CLS)
   Expr(
     :tuple,
     Expr(:call, Expr(:curly, :StaticInt, mᵣ)),
-    Expr(:call, Expr(:curly, :StaticInt, nᵣ)),
+    Expr(:call, Expr(:curly, :StaticInt, nᵣ))
   )
 end
-matmul_params() = matmul_params(register_size(), register_count(), cache_linesize())
+matmul_params() =
+  matmul_params(register_size(), register_count(), cache_linesize())
 
 # function dotturbo(x,y)
 #   s = zero(promote_type(eltype(x),eltype(y)))
diff --git a/src/vectorizationbase_compat/contract_pass.jl b/src/vectorizationbase_compat/contract_pass.jl
index 6381e8717..8539d80e0 100644
--- a/src/vectorizationbase_compat/contract_pass.jl
+++ b/src/vectorizationbase_compat/contract_pass.jl
@@ -75,7 +75,13 @@ function muladd_arguments!(argv, mod, f = first(argv))
   end
 end
 
-function recursive_muladd_search!(call, argv, mod, cnmul::Bool = false, csub::Bool = false)
+function recursive_muladd_search!(
+  call,
+  argv,
+  mod,
+  cnmul::Bool = false,
+  csub::Bool = false
+)
   if length(argv) < 3
     muladd_arguments!(argv, mod)
     return length(call.args) == 4, cnmul, csub
@@ -121,7 +127,10 @@ function recursive_muladd_search!(call, argv, mod, cnmul::Bool = false, csub::Bo
         if length(exargs) == 2
           push!(call.args, exargs[3-i])
         else
-          push!(call.args, append_args_skip!(Expr(:call, :add_fast), exargs, i, mod))
+          push!(
+            call.args,
+            append_args_skip!(Expr(:call, :add_fast), exargs, i, mod)
+          )
         end
         if issub
           csub = i == 1
@@ -139,12 +148,16 @@ function recursive_muladd_search!(call, argv, mod, cnmul::Bool = false, csub::Bo
                 :call,
                 :sub_fast,
                 append_args_skip!(Expr(:call, :add_fast), exargs, i, mod),
-                call.args[4],
+                call.args[4]
               )
             end
           else
-            call.args[4] =
-              append_args_skip!(Expr(:call, :add_fast, call.args[4]), exargs, i, mod)
+            call.args[4] = append_args_skip!(
+              Expr(:call, :add_fast, call.args[4]),
+              exargs,
+              i,
+              mod
+            )
           end
           return true, cnmul, false
         end
@@ -209,8 +222,7 @@ function capture_a_muladd(ex::Expr, mod)
   end
   true, call
 end
-function capture_muladd(ex::Expr, mod)
-  while true
+capture_muladd(ex::Expr, mod) = while true
     ex.head === :ref && return ex
     if Meta.isexpr(ex, :call, 2)
       if (ex.args[1] === :(-))
@@ -224,7 +236,6 @@ function capture_muladd(ex::Expr, mod)
     found, ex = capture_a_muladd(ex, mod)
     found || return ex
   end
-end
 function append_update_args(f::Symbol, ex::Expr)
   call = Expr(:call, f)
   for i ∈ 2:length(ex.args)
@@ -250,11 +261,34 @@ function contract!(expr::Expr, ex::Expr, i::Int, mod)
   else
     j = findfirst(
       Base.Fix2(===, ex.head),
-      (:(-=), :(/=), :(÷=), :(%=), :(^=), :(&=), :(|=), :(⊻=), :(>>>=), :(>>=), :(<<=)),
+      (
+        :(-=),
+        :(/=),
+        :(÷=),
+        :(%=),
+        :(^=),
+        :(&=),
+        :(|=),
+        :(⊻=),
+        :(>>>=),
+        :(>>=),
+        :(<<=)
+      )
     )
     if j ≢ nothing
-      f =
-        (:sub_fast, :div_fast, :(÷), :(%), :(^), :(&), :(|), :(⊻), :(>>>), :(>>), :(<<))[j::Int]
+      f = (
+        :sub_fast,
+        :div_fast,
+        :(÷),
+        :(%),
+        :(^),
+        :(&),
+        :(|),
+        :(⊻),
+        :(>>>),
+        :(>>),
+        :(<<)
+      )[j::Int]
       call = Expr(:call, f)
       append!(call.args, ex.args)
       expr.args[i] = ex = Expr(:(=), first(ex.args), call)
diff --git a/src/vectorizationbase_compat/subsetview.jl b/src/vectorizationbase_compat/subsetview.jl
index 41142f362..ced5febb5 100644
--- a/src/vectorizationbase_compat/subsetview.jl
+++ b/src/vectorizationbase_compat/subsetview.jl
@@ -11,7 +11,7 @@ end
 @generated function subsetview(
   ptr::AbstractStridedPointer{T,N,C,B,R,X,O},
   ::StaticInt{I},
-  i::Union{Integer,StaticInt},
+  i::Union{Integer,StaticInt}
 ) where {T,N,C,B,R,X,O,I}
   I > N && return :ptr
   @assert B ≤ 0 "Batched dims not currently supported."
@@ -32,31 +32,42 @@ end
     stridedpointer($gptr, si, StaticInt{$B}())
   end
 end
-@inline _subsetview(ptr::AbstractStridedPointer, ::StaticInt{I}, J::Tuple{}) where {I} = ptr
 @inline _subsetview(
   ptr::AbstractStridedPointer,
   ::StaticInt{I},
-  J::Tuple{J1},
+  J::Tuple{}
+) where {I} = ptr
+@inline _subsetview(
+  ptr::AbstractStridedPointer,
+  ::StaticInt{I},
+  J::Tuple{J1}
 ) where {I,J1} = subsetview(ptr, StaticInt{I}(), first(J))
 @inline _subsetview(
   ptr::AbstractStridedPointer,
   ::StaticInt{I},
-  J::Tuple{J1,J2,Vararg},
-) where {I,J1,J2} =
-  _subsetview(subsetview(ptr, StaticInt{I}(), first(J)), StaticInt{I}(), Base.tail(J))
+  J::Tuple{J1,J2,Vararg}
+) where {I,J1,J2} = _subsetview(
+  subsetview(ptr, StaticInt{I}(), first(J)),
+  StaticInt{I}(),
+  Base.tail(J)
+)
 @inline subsetview(
   ptr::AbstractStridedPointer,
   ::StaticInt{I},
-  J::CartesianIndex,
+  J::CartesianIndex
 ) where {I} = _subsetview(ptr, StaticInt{I}(), Tuple(J))
 
-@inline _gesp(sp::VectorizationBase.FastRange, ::StaticInt{1}, i, ::StaticInt{1}) =
-  gesp(sp, (i,))
+@inline _gesp(
+  sp::VectorizationBase.FastRange,
+  ::StaticInt{1},
+  i,
+  ::StaticInt{1}
+) = gesp(sp, (i,))
 @generated function _gesp(
   sp::AbstractStridedPointer{T,N},
   ::StaticInt{I},
   i::Union{Integer,StaticInt},
-  ::StaticInt{D},
+  ::StaticInt{D}
 ) where {I,N,T,D}
   t = Expr(:tuple)
   for j ∈ 1:I-1
diff --git a/test/manyarrayrefs.jl b/test/manyarrayrefs.jl
index 5ea7ed6d8..b4ba68d0d 100644
--- a/test/manyarrayrefs.jl
+++ b/test/manyarrayrefs.jl
@@ -1,23 +1,23 @@
-@generated function sum_way_too_unrolled(A, ::Val{rows}, ::Val{cols}) where {rows, cols}
-    terms = :( 0 )
-    
-    for i in 1:rows
-        for j in 1:cols
-            terms = :( $terms + A[$i, $j, k] )
-        end
+@generated function sum_way_too_unrolled(A, ::Val{rows}, ::Val{cols}) where {rows,cols}
+  terms = :(0)
+
+  for i = 1:rows
+    for j = 1:cols
+      terms = :($terms + A[$i, $j, k])
     end
+  end
 
-    quote
-        sum = 0.0
-        @turbo for k in axes(A, 3)
-            sum += $terms
-        end
-        sum
+  quote
+    sum = 0.0
+    @turbo for k in axes(A, 3)
+      sum += $terms
     end
+    sum
+  end
 end
 
 @testset "Many Array References" begin
-    A = rand(17, 16, 10)
+  A = rand(17, 16, 10)
 
-    @test isapprox(sum_way_too_unrolled(A, Val(17), Val(16)), sum(A))
+  @test isapprox(sum_way_too_unrolled(A, Val(17), Val(16)), sum(A))
 end
diff --git a/utils/generate_costs.jl b/utils/generate_costs.jl
index ebc14fdc6..14d32bcd6 100644
--- a/utils/generate_costs.jl
+++ b/utils/generate_costs.jl
@@ -12,7 +12,7 @@ using VectorizationBase: data
     :(Tuple{}),
     "i64",
     String[],
-    Symbol[],
+    Symbol[]
   )
 end
 
@@ -30,12 +30,13 @@ end
         $sideeffect_str,
         NTuple{$W,Core.VecElement{$T}},
         Tuple{NTuple{$W,Core.VecElement{$T}}},
-        VectorizationBase.data(x),
-      ),
+        VectorizationBase.data(x)
+      )
     )
   end
 end
-@inline volatile(x::VecUnroll) = VecUnroll(VectorizationBase.fmap(volatile, data(x)))
+@inline volatile(x::VecUnroll) =
+  VecUnroll(VectorizationBase.fmap(volatile, data(x)))
 @inline volatile(x::Tuple) = map(volatile, x)
 # @generated function volatile(x::Vec{W,T}, x::Vec{W,T}) where {W,T}
 #     typ = VectorizationBase.LLVM_TYPES[T]
@@ -89,7 +90,6 @@ end
 #     end
 # end
 
-
 # @generated function unrolltest!(f::F, y::AbstractVector{T}, x::AbstractVector{T}, ::Val{U}) where {F,U,T}
 #     quote
 #         cc = readcyclecounter()
@@ -106,13 +106,22 @@ end
 let
   vx = Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...)
   vu2 = VectorizationBase.VecUnroll(
-    ntuple(_ -> Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...), Val(2)),
+    ntuple(
+      _ -> Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...),
+      Val(2)
+    )
   )
   vu4 = VectorizationBase.VecUnroll(
-    ntuple(_ -> Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...), Val(4)),
+    ntuple(
+      _ -> Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...),
+      Val(4)
+    )
   )
   vu8 = VectorizationBase.VecUnroll(
-    ntuple(_ -> Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...), Val(8)),
+    ntuple(
+      _ -> Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...),
+      Val(8)
+    )
   )
   for unaryf ∈ [log, log2, log10, log1p, exp, exp2, exp10, expm1, sin, cos]
     rt1 = unrolltest(f, vx)
@@ -131,7 +140,14 @@ end
 let
   f, io = mktemp()
   W = Int(VectorizationBase.pick_vector_width(Float64))
-  code_native(io, exp, (VecUnroll{1,W,Float64,Vec{W,Float64}},); debuginfo = :none)
+  code_native(
+    io,
+    exp,
+    (VecUnroll{1,W,Float64,Vec{W,Float64}},);
+    debuginfo = :none
+  )
   close(io)
-  run(`llvm-mca -mcpu=$(Sys.CPU_NAME) -output-asm-variant=1 -bottleneck-analysis $f`)
+  run(
+    `llvm-mca -mcpu=$(Sys.CPU_NAME) -output-asm-variant=1 -bottleneck-analysis $f`
+  )
 end
diff --git a/utils/generate_precompiles.jl b/utils/generate_precompiles.jl
index eefbc6168..a78a67527 100644
--- a/utils/generate_precompiles.jl
+++ b/utils/generate_precompiles.jl
@@ -1,8 +1,12 @@
 
 using LoopVectorization, SnoopCompile
 LOOPVECTORIZATION_TEST = "all"
-tinf = @snoopi_deep include(joinpath(pkgdir(LoopVectorization), "test", "testsetup.jl"))
-tinf = @snoopi_deep include(joinpath(pkgdir(LoopVectorization), "test", "grouptests.jl"))
+tinf = @snoopi_deep include(
+  joinpath(pkgdir(LoopVectorization), "test", "testsetup.jl")
+)
+tinf = @snoopi_deep include(
+  joinpath(pkgdir(LoopVectorization), "test", "grouptests.jl")
+)
 
 ttot, pcs = SnoopCompile.parcel(tinf);
 
@@ -31,17 +35,17 @@ blacklist = (
   :tanh_fast,
   :check_args,
   :relu,
-  :init_dual,
+  :init_dual
 )
-filteredmethods = filter(m -> !Base.sym_in(m[2].def.name, blacklist), last(pcslv));
+filteredmethods =
+  filter(m -> !Base.sym_in(m[2].def.name, blacklist), last(pcslv));
 length(filteredmethods);
 
 SnoopCompile.write(
   "/tmp/precompile_loopvec",
-  [LoopVectorization => (sum(first, filteredmethods), filteredmethods)],
+  [LoopVectorization => (sum(first, filteredmethods), filteredmethods)]
 )
 
-
 # pc = SnoopCompile.parcel(tinf; blacklist=["vmaterialize", "vmaterialize!", "vreduce", "Base.Broadcast.materialize", "_vreduce_dims!", "vmapreduce"])
 # pcs = pc[:LoopVectorization]
 # open(joinpath(pkgdir, "src", "precompile.jl"), "w") do io