diff --git a/Project.toml b/Project.toml
index e3f8e0da..bd278731 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.12.166"
+version = "0.12.167"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
diff --git a/README.md b/README.md
index 569b8b9b..099e903e 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,10 @@
 
 [![LoopVectorization Downloads](https://shields.io/endpoint?url=https://pkgs.genieframework.com/api/v1/badge/LoopVectorization)](https://pkgs.genieframework.com?packages=LoopVectorization)
 
+# NOTE: Deprecated for Julia v1.11 and above!
+
+LoopVectorization only works on Julia 1.3 through 1.10. On Julia 1.11 and newer, `@turbo` simply expands to `@inbounds @fastmath` instead, so results should still be roughly the same, but both run-time and compile-time performance may change dramatically.
+
 ## Installation
 
 ```julia
diff --git a/ext/ForwardDiffExt.jl b/ext/ForwardDiffExt.jl
index 26227f69..f9c20ec5 100644
--- a/ext/ForwardDiffExt.jl
+++ b/ext/ForwardDiffExt.jl
@@ -1,4 +1,5 @@
 module ForwardDiffExt
+if VERSION < v"1.11-DEV"
 import ForwardDiff, ChainRulesCore
 using LoopVectorization, VectorizationBase, SLEEFPirates, ForwardDiff
 
@@ -377,3 +378,4 @@ for f in (:vmapt, :vmapnt, :vmapntt)
   end
 end
 end
+end
diff --git a/ext/SpecialFunctionsExt.jl b/ext/SpecialFunctionsExt.jl
index bfd813ee..2b1ed5fb 100644
--- a/ext/SpecialFunctionsExt.jl
+++ b/ext/SpecialFunctionsExt.jl
@@ -1,6 +1,8 @@
 module SpecialFunctionsExt
+if VERSION < v"1.11-DEV" # define the extension only on Julia versions before 1.11
 using SpecialFunctions
 using LoopVectorization: VectorizationBase
 using LoopVectorization: AbstractSIMD
 @inline SpecialFunctions.erf(x::AbstractSIMD) = VectorizationBase.verf(float(x))
 end
+end # module SpecialFunctionsExt
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
index 7151df75..bf5795f0 100644
--- a/src/LoopVectorization.jl
+++ b/src/LoopVectorization.jl
@@ -1,189 +1,10 @@
 module LoopVectorization
 
-if isdefined(Base, :Experimental) &&
-   isdefined(Base.Experimental, Symbol("@max_methods"))
-  @eval Base.Experimental.@max_methods 1
-end
-
-using ArrayInterface: UpTri, LoTri
-using Static: StaticInt, gt, static, Zero, One, reduce_tup
-using VectorizationBase,
-  SLEEFPirates, UnPack, OffsetArrays, StaticArrayInterface
-const ArrayInterface = StaticArrayInterface
-using LayoutPointers:
-  AbstractStridedPointer,
-  StridedPointer,
-  StridedBitPointer,
-  grouped_strided_pointer,
-  stridedpointer_preserve,
-  GroupedStridedPointers
-import LayoutPointers
-
-using SIMDTypes: NativeTypes
-
-using VectorizationBase:
-  mask,
-  MM,
-  AbstractMask,
-  data,
-  AbstractSIMD,
-  vzero,
-  offsetprecalc,
-  lazymul,
-  vadd_nw,
-  vadd_nsw,
-  vadd_nuw,
-  vsub_nw,
-  vsub_nsw,
-  vsub_nuw,
-  vmul_nw,
-  vmul_nsw,
-  vmul_nuw,
-  vfmaddsub,
-  vfmsubadd,
-  vpermilps177,
-  vmovsldup,
-  vmovshdup,
-  maybestaticfirst,
-  maybestaticlast,
-  gep,
-  gesp,
-  vfmadd,
-  vfmsub,
-  vfnmadd,
-  vfnmsub,
-  vfmadd_fast,
-  vfmsub_fast,
-  vfnmadd_fast,
-  vfnmsub_fast,
-  vfmadd231,
-  vfmsub231,
-  vfnmadd231,
-  vfnmsub231,
-  vfma_fast,
-  vmuladd_fast,
-  vdiv_fast,
-  vadd_fast,
-  vsub_fast,
-  vmul_fast,
-  relu,
-  stridedpointer,
-  _vload,
-  _vstore!,
-  reduced_add,
-  reduced_prod,
-  reduce_to_add,
-  reduce_to_prod,
-  reduced_max,
-  reduced_min,
-  reduce_to_max,
-  reduce_to_min,
-  reduced_all,
-  reduced_any,
-  reduce_to_all,
-  reduce_to_any,
-  vsum,
-  vprod,
-  vmaximum,
-  vminimum,
-  vany,
-  vall,
-  Unroll,
-  VecUnroll,
-  preserve_buffer,
-  zero_vecunroll,
-  vbroadcast_vecunroll,
-  _vzero,
-  _vbroadcast,
-  contract_add,
-  collapse_add,
-  contract_mul,
-  collapse_mul,
-  contract_max,
-  collapse_max,
-  contract_min,
-  collapse_min,
-  contract_and,
-  collapse_and,
-  contract_or,
-  collapse_or,
-  max_mask,
-  maybestaticsize,
-  zero_mask
-
-using HostCPUFeatures:
-  pick_vector_width,
-  register_size,
-  register_count,
-  has_opmask_registers,
-  unwrap,
-  get_cpu_name
-using CPUSummary: num_cores, cache_linesize, cache_size
-
-using IfElse: ifelse
-
-using ThreadingUtilities, PolyesterWeave
-using Base.Broadcast: Broadcasted, DefaultArrayStyle
-using LinearAlgebra: Adjoint, Transpose, Diagonal
-using Base.Meta: isexpr
-using DocStringExtensions
-import LinearAlgebra # for check_args
-
-using Base: unsafe_trunc
-
-using Base.FastMath:
-  add_fast,
-  sub_fast,
-  mul_fast,
-  div_fast,
-  inv_fast,
-  abs2_fast,
-  rem_fast,
-  max_fast,
-  min_fast,
-  pow_fast,
-  sqrt_fast
-using SLEEFPirates:
-  log_fast,
-  log2_fast,
-  log10_fast,
-  pow,
-  sin_fast,
-  cos_fast,
-  sincos_fast,
-  tan_fast
-
-using StaticArrayInterface:
-  OptionallyStaticUnitRange,
-  OptionallyStaticRange,
-  StaticBool,
-  True,
-  False,
-  indices,
-  static_strides,
-  offsets,
-  static_size,
-  static_axes,
-  StrideIndex
-using CloseOpenIntervals: AbstractCloseOpen, CloseOpen#, SafeCloseOpen
-# @static if VERSION ≥ v"1.6.0-rc1" #TODO: delete `else` when dropping 1.5 support
-# using ArrayInterface: static_step
-# else # Julia 1.5 did not define `step` on CartesianIndices
-@inline static_step(x) = ArrayInterface.static_step(x)
-@inline static_step(x::CartesianIndices) =
-  VectorizationBase.CartesianVIndex(map(static_step, x.indices))
-# end
-
-export LowDimArray,
-  stridedpointer,
-  indices,
-  static,
+export indices,
   @avx,
   @avxt,
   @turbo,
   @tturbo,
-  *ˡ,
-  _turbo_!,
   vmap,
   vmap!,
   vmapt,
@@ -192,8 +13,6 @@ export LowDimArray,
   vmapnt!,
   vmapntt,
   vmapntt!,
-  tanh_fast,
-  sigmoid_fast,
   vfilter,
   vfilter!,
   vmapreduce,
@@ -201,71 +20,288 @@ export LowDimArray,
   vreduce,
   vcount
 
-const VECTORWIDTHSYMBOL, ELTYPESYMBOL, MASKSYMBOL =
-  Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##"), Symbol("##mask##")
+if VERSION >= v"1.11-DEV"
+  macro turbo(args...) # Julia 1.11+ fallback: no loop analysis, just `@inbounds @fastmath`
+    quote
+      @inbounds @fastmath begin
+        $(esc(last(args))) # only the final argument is used; any earlier ones are ignored
+      end
+    end
+  end
+  const var"@tturbo" = var"@turbo" # every macro variant is an alias of the same fallback macro
+  const var"@avx" = var"@turbo"
+  const var"@avxt" = var"@turbo"
+  const vmap = map # each v* functional below is an alias of the Base function it is assigned
+  const vmap! = map!
+  const vmapt = map
+  const vmapt! = map!
+  const vmapnt = map
+  const vmapnt! = map!
+  const vmapntt = map
+  const vmapntt! = map!
+  const vfilter = filter
+  const vfilter! = filter!
+  const vmapreduce = mapreduce
+  const vsum = sum
+  const vreduce = reduce
+  const vcount = count
+
+  # Fallback `indices`: thin wrappers over `Base.axes` that verify matching axes.
+  indices(A::AbstractArray, i::Integer) = axes(A, i)
+  # Return the first axis in `ax` after checking that all the others equal it.
+  function _check_axes_match(ax::Tuple)
+    ref = first(ax)
+    for other in Base.tail(ax)
+      ref == other || throw(DimensionMismatch("Axes do not match."))
+    end
+    return ref
+  end
+  indices(a::Tuple, b::Tuple) = _check_axes_match(map(axes, a, b))
+  indices(a::Tuple, b::Integer) = _check_axes_match(map(x -> axes(x, b), a))
+  indices(a::AbstractArray, b::Tuple) = _check_axes_match(map(d -> axes(a, d), b))
+
+else
+  if isdefined(Base, :Experimental) &&
+     isdefined(Base.Experimental, Symbol("@max_methods"))
+    @eval Base.Experimental.@max_methods 1
+  end
+  export LowDimArray,
+    static, stridedpointer, *ˡ, _turbo_!, tanh_fast, sigmoid_fast
+
+  using ArrayInterface: UpTri, LoTri
+  using Static: StaticInt, gt, static, Zero, One, reduce_tup
+  using VectorizationBase,
+    SLEEFPirates, UnPack, OffsetArrays, StaticArrayInterface
+  const ArrayInterface = StaticArrayInterface
+  using LayoutPointers:
+    AbstractStridedPointer,
+    StridedPointer,
+    StridedBitPointer,
+    grouped_strided_pointer,
+    stridedpointer_preserve,
+    GroupedStridedPointers
+  import LayoutPointers
+
+  using SIMDTypes: NativeTypes
+
+  using VectorizationBase:
+    mask,
+    MM,
+    AbstractMask,
+    data,
+    AbstractSIMD,
+    vzero,
+    offsetprecalc,
+    lazymul,
+    vadd_nw,
+    vadd_nsw,
+    vadd_nuw,
+    vsub_nw,
+    vsub_nsw,
+    vsub_nuw,
+    vmul_nw,
+    vmul_nsw,
+    vmul_nuw,
+    vfmaddsub,
+    vfmsubadd,
+    vpermilps177,
+    vmovsldup,
+    vmovshdup,
+    maybestaticfirst,
+    maybestaticlast,
+    gep,
+    gesp,
+    vfmadd,
+    vfmsub,
+    vfnmadd,
+    vfnmsub,
+    vfmadd_fast,
+    vfmsub_fast,
+    vfnmadd_fast,
+    vfnmsub_fast,
+    vfmadd231,
+    vfmsub231,
+    vfnmadd231,
+    vfnmsub231,
+    vfma_fast,
+    vmuladd_fast,
+    vdiv_fast,
+    vadd_fast,
+    vsub_fast,
+    vmul_fast,
+    relu,
+    stridedpointer,
+    _vload,
+    _vstore!,
+    reduced_add,
+    reduced_prod,
+    reduce_to_add,
+    reduce_to_prod,
+    reduced_max,
+    reduced_min,
+    reduce_to_max,
+    reduce_to_min,
+    reduced_all,
+    reduced_any,
+    reduce_to_all,
+    reduce_to_any,
+    vsum,
+    vprod,
+    vmaximum,
+    vminimum,
+    vany,
+    vall,
+    Unroll,
+    VecUnroll,
+    preserve_buffer,
+    zero_vecunroll,
+    vbroadcast_vecunroll,
+    _vzero,
+    _vbroadcast,
+    contract_add,
+    collapse_add,
+    contract_mul,
+    collapse_mul,
+    contract_max,
+    collapse_max,
+    contract_min,
+    collapse_min,
+    contract_and,
+    collapse_and,
+    contract_or,
+    collapse_or,
+    max_mask,
+    maybestaticsize,
+    zero_mask
+
+  using HostCPUFeatures:
+    pick_vector_width,
+    register_size,
+    register_count,
+    has_opmask_registers,
+    unwrap,
+    get_cpu_name
+  using CPUSummary: num_cores, cache_linesize, cache_size
+
+  using IfElse: ifelse
+
+  using ThreadingUtilities, PolyesterWeave
+  using Base.Broadcast: Broadcasted, DefaultArrayStyle
+  using LinearAlgebra: Adjoint, Transpose, Diagonal
+  using Base.Meta: isexpr
+  using DocStringExtensions
+  import LinearAlgebra # for check_args
+
+  using Base: unsafe_trunc
+
+  using Base.FastMath:
+    add_fast,
+    sub_fast,
+    mul_fast,
+    div_fast,
+    inv_fast,
+    abs2_fast,
+    rem_fast,
+    max_fast,
+    min_fast,
+    pow_fast,
+    sqrt_fast
+  using SLEEFPirates:
+    log_fast,
+    log2_fast,
+    log10_fast,
+    pow,
+    sin_fast,
+    cos_fast,
+    sincos_fast,
+    tan_fast
+
+  using StaticArrayInterface:
+    OptionallyStaticUnitRange,
+    OptionallyStaticRange,
+    StaticBool,
+    True,
+    False,
+    indices,
+    static_strides,
+    offsets,
+    static_size,
+    static_axes,
+    StrideIndex
+  using CloseOpenIntervals: AbstractCloseOpen, CloseOpen#, SafeCloseOpen
+  # @static if VERSION ≥ v"1.6.0-rc1" #TODO: delete `else` when dropping 1.5 support
+  # using ArrayInterface: static_step
+  # else # Julia 1.5 did not define `step` on CartesianIndices
+  @inline static_step(x) = ArrayInterface.static_step(x)
+  @inline static_step(x::CartesianIndices) =
+    VectorizationBase.CartesianVIndex(map(static_step, x.indices))
+  # end
 
-include("vectorizationbase_compat/contract_pass.jl")
-include("vectorizationbase_compat/subsetview.jl")
-include("getconstindexes.jl")
-include("predicates.jl")
-include("simdfunctionals/map.jl")
-include("simdfunctionals/filter.jl")
-include("modeling/costs.jl")
-include("modeling/operations.jl")
-include("modeling/graphs.jl")
-include("codegen/operation_evaluation_order.jl")
-include("parse/memory_ops_common.jl")
-include("parse/add_loads.jl")
-include("parse/add_stores.jl")
-include("parse/add_compute.jl")
-include("parse/add_constants.jl")
-include("parse/add_ifelse.jl")
-include("modeling/determinestrategy.jl")
-include("codegen/line_number_nodes.jl")
-include("codegen/loopstartstopmanager.jl")
-include("codegen/lower_compute.jl")
-include("codegen/lower_constant.jl")
-include("codegen/lower_memory_common.jl")
-include("codegen/lower_load.jl")
-include("codegen/lower_store.jl")
-include("codegen/lowering.jl")
-include("codegen/split_loops.jl")
-include("codegen/lower_threads.jl")
-include("condense_loopset.jl")
-include("transforms.jl")
-include("reconstruct_loopset.jl")
-include("constructors.jl")
-include("user_api_conveniences.jl")
-include("simdfunctionals/mapreduce.jl")
-include("simdfunctionals/count.jl")
-include("broadcast.jl")
+  const VECTORWIDTHSYMBOL, ELTYPESYMBOL, MASKSYMBOL =
+    Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##"), Symbol("##mask##")
 
-"""
-LoopVectorization provides macros and functions that combine SIMD vectorization and
-loop-reordering so as to improve performance:
+  include("vectorizationbase_compat/contract_pass.jl")
+  include("vectorizationbase_compat/subsetview.jl")
+  include("getconstindexes.jl")
+  include("predicates.jl")
+  include("simdfunctionals/map.jl")
+  include("simdfunctionals/filter.jl")
+  include("modeling/costs.jl")
+  include("modeling/operations.jl")
+  include("modeling/graphs.jl")
+  include("codegen/operation_evaluation_order.jl")
+  include("parse/memory_ops_common.jl")
+  include("parse/add_loads.jl")
+  include("parse/add_stores.jl")
+  include("parse/add_compute.jl")
+  include("parse/add_constants.jl")
+  include("parse/add_ifelse.jl")
+  include("modeling/determinestrategy.jl")
+  include("codegen/line_number_nodes.jl")
+  include("codegen/loopstartstopmanager.jl")
+  include("codegen/lower_compute.jl")
+  include("codegen/lower_constant.jl")
+  include("codegen/lower_memory_common.jl")
+  include("codegen/lower_load.jl")
+  include("codegen/lower_store.jl")
+  include("codegen/lowering.jl")
+  include("codegen/split_loops.jl")
+  include("codegen/lower_threads.jl")
+  include("condense_loopset.jl")
+  include("transforms.jl")
+  include("reconstruct_loopset.jl")
+  include("constructors.jl")
+  include("user_api_conveniences.jl")
+  include("simdfunctionals/mapreduce.jl")
+  include("simdfunctionals/count.jl")
+  include("broadcast.jl")
 
-  - [`@turbo`](@ref): transform `for`-loops and broadcasting
-  - [`vmapreduce`](@ref): vectorized version of `mapreduce`
-  - [`vreduce`](@ref): vectorized version of `reduce`
-  - [`vsum`](@ref): vectorized version of `sum`
-  - [`vmap`](@ref) and `vmap!`: vectorized version of `map` and `map!`
-  - [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!`
-  - [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!`
-  - [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!`
-"""
-LoopVectorization
+  """
+  LoopVectorization provides macros and functions that combine SIMD vectorization and
+  loop-reordering so as to improve performance:
 
-include("precompile.jl")
-# _precompile_()
+    - [`@turbo`](@ref): transform `for`-loops and broadcasting
+    - [`vmapreduce`](@ref): vectorized version of `mapreduce`
+    - [`vreduce`](@ref): vectorized version of `reduce`
+    - [`vsum`](@ref): vectorized version of `sum`
+    - [`vmap`](@ref) and `vmap!`: vectorized version of `map` and `map!`
+    - [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!`
+    - [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!`
+    - [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!`
+  """
+  LoopVectorization
 
-# _vreduce(+, Float64[1.0])
-# matmul_params(64, 32, 64)
+  include("precompile.jl")
+  # _precompile_()
 
-# import ChainRulesCore, ForwardDiff
-# include("vmap_grad.jl")
-if !isdefined(Base, :get_extension)
-  include("../ext/ForwardDiffExt.jl")
-  include("../ext/SpecialFunctionsExt.jl")
-end
+  # _vreduce(+, Float64[1.0])
+  # matmul_params(64, 32, 64)
 
+  # import ChainRulesCore, ForwardDiff
+  # include("vmap_grad.jl")
+  if !isdefined(Base, :get_extension)
+    include("../ext/ForwardDiffExt.jl")
+    include("../ext/SpecialFunctionsExt.jl")
+  end
+end # if VERSION
 end # module
diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl
index 403f865f..18e29a04 100644
--- a/src/condense_loopset.jl
+++ b/src/condense_loopset.jl
@@ -938,6 +938,7 @@ end
 @inline check_args(::VectorizationBase.AbstractStridedPointer) = true
 @inline function check_args(x)
   # @info "`LoopVectorization.check_args(::$(typeof(x))) == false`, therefore compiling a probably slow `@inbounds @fastmath` fallback loop." maxlog=1
+  # Catch-all method: no more specific `check_args` method matched `x`, so report `false`.
   false
 end
 @inline check_args(A, B, C::Vararg{Any,K}) where {K} =
@@ -954,11 +955,17 @@ end
 Returns true if the element type is supported.
 """
 @inline check_type(::Type{T}) where {T<:NativeTypes} = true
-@inline check_type(::Type{T}) where {T} = false
+@inline function check_type(::Type{T}) where {T}
+  # Catch-all method: no more specific `check_type` method matched `T`, so report `false`.
+  false
+end
 @inline check_type(::Type{T}) where {T<:AbstractSIMD} = true
 @inline check_device(::ArrayInterface.CPUPointer) = true
 @inline check_device(::ArrayInterface.CPUTuple) = true
-@inline check_device(x) = false
+@inline function check_device(x)
+  # Catch-all method: no more specific `check_device` method matched `x`, so report `false`.
+  false
+end
 
 function check_args_call(ls::LoopSet)
   q = Expr(:call, lv(:check_args))
@@ -978,6 +985,7 @@ can be used inside a `@turbo` loop.
 """
 function can_turbo(f::F, ::Val{NARGS})::Bool where {F,NARGS}
   promoted_op = Base.promote_op(f, ntuple(RetVec2Int(), Val(NARGS))...)
+  # A `Union{}` result from `promote_op` is treated as "`f` cannot be used here".
   return promoted_op !== Union{}
 end
 can_turbo(::typeof(vfmaddsub), ::Val{3}) = true
diff --git a/test/testsetup.jl b/test/testsetup.jl
index b60c2022..df27ccaa 100644
--- a/test/testsetup.jl
+++ b/test/testsetup.jl
@@ -2,7 +2,12 @@ using Test
 using Pkg
 using LoopVectorization
 
+if VERSION >= v"1.11-DEV" # the 1.11+ fallback module defines `@turbo` but no `@_turbo`
+const var"@_avx" = LoopVectorization.var"@turbo"
+else
 const var"@_avx" = LoopVectorization.var"@_turbo"
+end
+
 
 using LinearAlgebra
 function clenshaw(x, coeff)