From 42c70a1a7f7b4ec4c12b4c99bb1cb8e22b7a6943 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Mon, 17 Oct 2022 17:56:59 +0600
Subject: [PATCH 01/29] initial functionality

---
 base/sort.jl    | 946 +++++++++++++++++++++++++-----------------------
 test/sorting.jl |  34 +-
 2 files changed, 515 insertions(+), 465 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index e7e767146abb6..edf946b6c24ab 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -86,7 +86,7 @@ issorted(itr;
     issorted(itr, ord(lt,by,rev,order))
 
 function partialsort!(v::AbstractVector, k::Union{Integer,OrdinalRange}, o::Ordering)
-    sort!(v, _PartialQuickSort(k), o)
+    _sort!(v, _PartialQuickSort(k), o)
     maybeview(v, k)
 end
 
@@ -407,36 +407,193 @@ function insorted end
 insorted(x, v::AbstractVector; kw...) = !isempty(searchsorted(v, x; kw...))
 insorted(x, r::AbstractRange) = in(x, r)
 
-## sorting algorithms ##
+## sorting algorithm components ##
 
 abstract type Algorithm end
 
-struct InsertionSortAlg <: Algorithm end
-struct MergeSortAlg     <: Algorithm end
-struct AdaptiveSortAlg  <: Algorithm end
 
-"""
-    PartialQuickSort(lo::Union{Integer, Missing}, hi::Union{Integer, Missing})
 
-Indicate that a sorting function should use the partial quick sort algorithm.
+#
+# Missing values always go at the end
+#
+struct MissingOptimization{T <: Algorithm} <: Algorithm
+    next::T
+end
 
-Partial quick sort finds and sorts the elements that would end up in positions
-`lo:hi` using [`QuickSort`](@ref).
+struct WithoutMissingVector{T, U <: AbstractVector{Union{T, Missing}}} <: AbstractVector{T}
+    data::U
+    function WithoutMissingVector(data; unsafe=false)
+        if !unsafe && any(ismissing, data)
+            throw(ArgumentError("data must not contain missing values"))
+        end
+        new{nonmissingtype(eltype(data)), typeof(data)}(data)
+    end
+end
+Base.@propagate_inbounds function Base.getindex(v::WithoutMissingVector, i)
+    out = v.data[i] # read from the wrapped Union{T, Missing} vector
+    @assert !(out isa Missing) # `!out isa Missing` would parse as `(!out) isa Missing`
+    out::eltype(v) # narrow the Union{T, Missing} element to T
+end
+Base.@propagate_inbounds function Base.setindex!(v::WithoutMissingVector{T}, x::T, i) where T
+    v.data[i] = x
+    v
+end
+Base.size(v::WithoutMissingVector) = size(v.data)
 
-Characteristics:
-  * *stable*: preserves the ordering of elements which compare equal
-    (e.g. "a" and "A" in a sort of letters which ignores case).
-  * *not in-place* in memory.
-  * *divide-and-conquer*: sort strategy similar to [`MergeSort`](@ref).
+# TODO simplify this further, remove redundancy, try a reverse view.
 """
-struct PartialQuickSort{L<:Union{Integer,Missing}, H<:Union{Integer,Missing}} <: Algorithm
-    lo::L
-    hi::H
+    send_to_end!(f::Function, v::AbstractVector, ::Union{ForwardOrdering,ReverseOrdering}; lo, hi)
+
+Send every element of `v[lo:hi]` for which `f` returns `true` to the end of that range (to the
+start for a `ReverseOrdering`) and return the bounds of the `false` part, then of the `true` part.
+
+Preserves the order of the elements that are not sent to the end.
+"""
+function send_to_end!(f::Function, v::AbstractVector, ::ReverseOrdering; lo, hi)
+    i = hi
+    @inbounds while lo <= i && !f(v[i])
+        i -= 1
+    end
+    j = i - 1
+    @inbounds while lo <= j
+        if !f(v[j])
+            v[i], v[j] = v[j], v[i]
+            i -= 1
+        end
+        j -= 1
+    end
+    return (i+1, hi), (lo, i)
+end
+function send_to_end!(f::Function, v::AbstractVector, ::ForwardOrdering; lo, hi)
+    i = lo
+    @inbounds while i <= hi && !f(v[i])
+        i += 1
+    end
+    j = i + 1
+    @inbounds while j <= hi
+        if !f(v[j])
+            v[i], v[j] = v[j], v[i]
+            i += 1
+        end
+        j += 1
+    end
+    return (lo, i-1), (i, hi)
+end
+
+function _sort!(v::AbstractVector, a::MissingOptimization, o::Ordering;
+                lo=firstindex(v), hi=lastindex(v), kw...)
+    if nonmissingtype(eltype(v)) != eltype(v) && o isa DirectOrdering
+        (lo, hi), _ = send_to_end!(ismissing, v, o; lo, hi)
+        _sort!(WithoutMissingVector(v, unsafe=true), a.next, o; lo, hi, kw...)
+        v
+    elseif eltype(v) <: Integer && o isa Perm{DirectOrdering} && nonmissingtype(eltype(o.data)) != eltype(o.data)
+        (lo, hi), _ = send_to_end!(i -> ismissing(@inbounds o.data[i]), v, o)
+        _sort!(v, a.next, Perm(o.order, WithoutMissingVector(o.data, unsafe=true)); lo, hi, kw...)
+    else
+        _sort!(v, a.next, o; lo, hi, kw...)
+    end
+end
+
+
+
+#
+# fast clever sorting for floats
+#
+struct IEEEFloatOptimization{T <: Algorithm} <: Algorithm
+    next::T
+end
+
+UIntType(::Type{Float16}) = UInt16
+UIntType(::Type{Float32}) = UInt32
+UIntType(::Type{Float64}) = UInt64
+after_zero(::ForwardOrdering, x) = !signbit(x) # signbit, not `0 <= x`: -0.0 must sort before 0.0
+after_zero(::ReverseOrdering, x) = signbit(x) # in reverse order the sign-bit-set values come last
+is_concrete_IEEEFloat(T::Type) = T <: Base.IEEEFloat && isconcretetype(T)
+function _sort!(v::AbstractVector, a::IEEEFloatOptimization, o::Ordering;
+                lo=firstindex(v), hi=lastindex(v), kw...)
+    if is_concrete_IEEEFloat(eltype(v)) && o isa DirectOrdering
+        _, (lo, hi) = send_to_end!(!isnan, v, ReverseOrdering(o); lo, hi)
+        iv = reinterpret(UIntType(eltype(v)), v)
+        (_, j), _ = send_to_end!(x -> after_zero(o, x), v, Forward; lo, hi)
+        _sort!(iv, a.next, Reverse; lo, hi=j, kw...)
+        _sort!(iv, a.next, Forward; lo=j+1, hi, kw...)
+    elseif eltype(v) <: Integer && o isa Perm && o.order isa DirectOrdering && is_concrete_IEEEFloat(eltype(o.data))
+        _, (lo, hi) = send_to_end!(i -> !isnan(@inbounds o.data[i]), v, ReverseOrdering(o.order); lo, hi)
+        ip = reinterpret(UIntType(eltype(o.data)), o.data)
+        (_, j), _ = send_to_end!(i -> after_zero(o.order, @inbounds o.data[i]), v, Forward; lo, hi)
+        _sort!(v, a.next, Perm(Reverse, ip); lo, hi=j, kw...)
+        _sort!(v, a.next, Perm(Forward, ip); lo=j+1, hi, kw...)
+    else
+        _sort!(v, a.next, o; lo, hi, kw...)
+    end
+    v
+end
+
+
+
+# For AbstractVector{Bool}, counting sort is always best.
+# This is an implementation of counting sort specialized for Bools.
+# Accepts unused scratch to avoid method ambiguity.
+struct BoolOptimization{T <: Algorithm} <: Algorithm
+    next::T
+end
+_sort!(v::AbstractVector, a::BoolOptimization, o::Ordering; kw...) = _sort!(v, a.next, o; kw...)
+function _sort!(v::AbstractVector{Bool}, ::BoolOptimization, o::Ordering; lo::Integer, hi::Integer, kw...)
+    first = lt(o, false, true) ? false : lt(o, true, false) ? true : return v
+    count = 0
+    @inbounds for i in lo:hi
+        if v[i] == first
+            count += 1
+        end
+    end
+    @inbounds v[lo:lo+count-1] .= first
+    @inbounds v[lo+count:hi] .= !first
+    v
 end
-PartialQuickSort(k::Integer) = PartialQuickSort(missing, k)
-PartialQuickSort(k::OrdinalRange) = PartialQuickSort(first(k), last(k))
-_PartialQuickSort(k::Integer) = PartialQuickSort(k, k)
-_PartialQuickSort(k::OrdinalRange) = PartialQuickSort(k)
+
+
+
+#
+# Branch on whether the element type and ordering are UIntMappable
+#
+struct IsUIntMappable{T <: Algorithm, U <: Algorithm} <: Algorithm
+    yes::T
+    no::U
+end
+function _sort!(v::AbstractVector, a::IsUIntMappable, o::Ordering;
+                U = UIntMappable(eltype(v), o), kw...)
+    if U !== nothing
+        _sort!(v, a.yes, o; U, kw...)
+    else
+        _sort!(v, a.no, o; kw...)
+    end
+end
+
+
+
+#
+# Use the `small` algorithm when hi-lo < N and the `big` algorithm otherwise
+#
+struct Small{N, T <: Algorithm, U <: Algorithm} <: Algorithm
+    small::T
+    big::U
+end
+Small{N}(big) where N = Small{N, typeof(SMALL_ALGORITHM), typeof(big)}(SMALL_ALGORITHM, big)
+function _sort!(v::AbstractVector, a::Small{N}, o::Ordering;
+                lo::Integer=firstindex(v), hi::Integer=lastindex(v), lenm1 = hi-lo, kw...) where N
+    if lenm1 < N
+        _sort!(v, a.small, o; lo, hi, lenm1, kw...)
+    else
+        _sort!(v, a.big, o; lo, hi, lenm1, kw...)
+    end
+end
+
+
+
+#
+# Insertion sort
+#
+struct InsertionSortAlg <: Algorithm end
 
 """
     InsertionSort
@@ -455,79 +612,241 @@ Characteristics:
     it is well-suited to small collections but should not be used for large ones.
 """
 const InsertionSort = InsertionSortAlg()
+const SMALL_ALGORITHM = InsertionSort
+function _sort!(v::AbstractVector, ::InsertionSortAlg, o::Ordering;
+                lo=firstindex(v), hi=lastindex(v), kw...)
+    lo_plus_1 = (lo + 1)::Integer
+    @inbounds for i = lo_plus_1:hi
+        j = i
+        x = v[i]
+        while j > lo
+            y = v[j-1]
+            if !(lt(o, x, y)::Bool)
+                break
+            end
+            v[j] = y
+            j -= 1
+        end
+        v[j] = x
+    end
+    return v
+end
+
+
+
+#
+# Return early if already sorted; for large inputs, reverse in place if reverse-sorted
+#
+struct CheckSorted{T <: Algorithm} <: Algorithm
+    next::T
+end
+function _sort!(v::AbstractVector, a::CheckSorted, o::Ordering;
+                lo=firstindex(v), hi=lastindex(v), lenm1 = hi-lo, kw...)
+    # For most arrays, a presorted check is cheap (overhead < 5%) and for most large
+    # arrays it is essentially free (<1%). Insertion sort runs in a fast O(n) on presorted
+    # input and this guarantees presorted input will always be efficiently handled
+    _issorted(v, lo, hi, o) && return v
+
+    # For large arrays, a reverse-sorted check is essentially free (overhead < 1%)
+    if lenm1 >= 500 && _issorted(v, lo, hi, ReverseOrdering(o))
+        # If reversing is valid, do so. This does not violate stability
+        # because being UIntMappable implies a linear order.
+        reverse!(v, lo, hi)
+        return v
+    end
+
+    _sort!(v, a.next, o; lo, hi, lenm1, kw...)
+end
+
+
+
+#
+# Prerequisite: region to be sorted [lo, hi] is nonempty
+#
+struct ComputeExtrema{T <: Algorithm} <: Algorithm
+    next::T
+end
+function _sort!(v::AbstractVector, a::ComputeExtrema, o::Ordering;
+                lo=firstindex(v), hi=lastindex(v), kw...)
+    mn = mx = v[lo] # bounds-checked read; the region [lo, hi] must be nonempty
+    @inbounds for i in (lo+1):hi
+        vi = v[i]
+        lt(o, vi, mn) && (mn = vi)
+        lt(o, mx, vi) && (mx = vi)
+    end
+    # mn and mx now hold the extrema of v[lo:hi] with respect to the ordering o
+
+    lt(o, mn, mx) || return v # all same
+
+    _sort!(v, a.next, o; lo, hi, mn, mx, kw...) # forward the extrema to the next stage
+end
+
+
+
+#
+# Consider counting sort
+#
+struct ConsiderCountingSort{T <: Algorithm, U <: Algorithm} <: Algorithm
+    counting::T
+    next::U
+end
+ConsiderCountingSort(next) = ConsiderCountingSort(CountingSort(), next)
+function _sort!(v::AbstractVector{<:Integer}, a::ConsiderCountingSort, o::DirectOrdering;
+                lo=firstindex(v), hi=lastindex(v), lenm1=hi-lo,
+                U = UIntMappable(eltype(v), o),
+                mn, mx, range=maybe_unsigned(o === Reverse ? mn-mx : mx-mn), kw...)
+    # Pick counting sort when the value range is small relative to the length (lenm1 = hi-lo).
+    if range < (sizeof(U) > 8 ? 5lenm1-100 : div(lenm1, 2))
+        _sort!(v, a.counting, o; lo, hi, lenm1, mn, mx, range, kw...)
+    else
+        _sort!(v, a.next, o; lo, hi, lenm1, mn, mx, range, kw...)
+    end
+end
+_sort!(v::AbstractVector, a::ConsiderCountingSort, o::Ordering; kw...) = _sort!(v, a.next, o; kw...)
+
+
+
+#
+# Counting sort
+#
+struct CountingSort <: Algorithm end
+maybe_reverse(o::ForwardOrdering, x) = x
+maybe_reverse(o::ReverseOrdering, x) = reverse(x)
+function _sort!(v::AbstractVector{<:Integer}, ::CountingSort, o::DirectOrdering;
+                lo=firstindex(v), hi=lastindex(v), lenm1=hi-lo,
+                mn, mx, range=maybe_unsigned(o === Reverse ? mn-mx : mx-mn), kw...)
+    offs = 1 - (o === Reverse ? mx : mn)
+
+    counts = fill(0, range+1)
+    @inbounds for i = lo:hi
+        counts[v[i] + offs] += 1
+    end
+
+    idx = lo
+    @inbounds for i = maybe_reverse(o, 1:range+1)
+        lastidx = idx + counts[i] - 1
+        val = i-offs
+        for j = idx:lastidx
+            v[j] = val
+        end
+        idx = lastidx + 1
+    end
+
+    v
+end
+
+
+
+#
+# Consider radix sort
+#
+struct ConsiderRadixSort{T <: Algorithm, U <: Algorithm} <: Algorithm
+    radix::T
+    next::U
+end
+ConsiderRadixSort(next) = ConsiderRadixSort(RadixSort(), next)
+function _sort!(v::AbstractVector, a::ConsiderRadixSort, o::DirectOrdering;
+                lo=firstindex(v), hi=lastindex(v), lenm1=hi-lo,
+                U = UIntMappable(eltype(v), o),
+                mn, mx, umn=uint_map(mn, o), umx=uint_map(mx, o), urange=umx-umn,
+                bits = unsigned(8sizeof(urange) - leading_zeros(urange)), kw...)
+    if sizeof(U) <= 8 && bits+70 < 22log(lenm1)
+        _sort!(v, a.radix, o; lo, hi, lenm1, mn, mx, umn, umx, urange, bits, kw...)
+    else
+        _sort!(v, a.next, o; lo, hi, lenm1, mn, mx, umn, umx, urange, bits, kw...)
+    end
+end
+
+
 
+#
+# Radix sort
+#
+struct RadixSort <: Algorithm end
+function _sort!(v::AbstractVector, a::RadixSort, o::DirectOrdering;
+                lo=firstindex(v), hi=lastindex(v), lenm1=hi-lo,
+                mn, mx, umn=uint_map(mn, o), umx=uint_map(mx, o), urange=umx-umn,
+                bits = unsigned(8sizeof(urange) - leading_zeros(urange)),
+                U = UIntMappable(eltype(v), o), scratch=nothing, kw...)
+
+    # At this point, we are committed to radix sort.
+    u = uint_map!(v, lo, hi, o)
+
+    # we subtract umn to avoid radixing over unnecessary bits. For example,
+    # Int32[3, -1, 2] uint_maps to UInt32[0x80000003, 0x7fffffff, 0x80000002]
+    # which uses all 32 bits, but once we subtract umn = 0x7fffffff, we are left with
+    # UInt32[0x00000004, 0x00000000, 0x00000003] which uses only 3 bits, and
+    # Float32[2.012, 400.0, 12.345] uint_maps to UInt32[0x3fff3b63, 0x3c37ffff, 0x414570a4]
+    # which is reduced to UInt32[0x03c73b64, 0x00000000, 0x050d70a5] using only 26 bits.
+    # the overhead for this subtraction is small enough that it is worthwhile in many cases.
+
+    # this is faster than u[lo:hi] .-= umn as of v1.9.0-DEV.100
+    @inbounds for i in lo:hi
+        u[i] -= umn
+    end
+
+    len = lenm1 + 1
+    if scratch !== nothing && checkbounds(Bool, scratch, lo:hi) # Fully preallocated and aligned scratch
+        u2 = radix_sort!(u, lo, hi, bits, reinterpret(U, scratch))
+        uint_unmap!(v, u2, lo, hi, o, umn)
+    elseif scratch !== nothing && (applicable(resize!, scratch, len) || length(scratch) >= len) # Viable scratch
+        length(scratch) >= len || resize!(scratch, len)
+        t1 = axes(scratch, 1) isa OneTo ? scratch : view(scratch, firstindex(scratch):lastindex(scratch))
+        u2 = radix_sort!(view(u, lo:hi), 1, len, bits, reinterpret(U, t1))
+        uint_unmap!(view(v, lo:hi), u2, 1, len, o, umn)
+    else # No viable scratch
+        u2 = radix_sort!(u, lo, hi, bits, similar(u))
+        uint_unmap!(v, u2, lo, hi, o, umn)
+    end
+end
+
+
+
+#
+# Quicksort
+#
 """
-    QuickSort
+    PartialQuickSort(lo::Union{Integer, Missing}, hi::Union{Integer, Missing})
 
-Indicate that a sorting function should use the quick sort algorithm.
+Indicate that a sorting function should use the partial quick sort algorithm.
 
-Quick sort picks a pivot element, partitions the array based on the pivot,
-and then sorts the elements before and after the pivot recursively.
+Partial quick sort finds and sorts the elements that would end up in positions
+`lo:hi` using [`QuickSort`](@ref).
 
 Characteristics:
   * *stable*: preserves the ordering of elements which compare equal
     (e.g. "a" and "A" in a sort of letters which ignores case).
   * *not in-place* in memory.
   * *divide-and-conquer*: sort strategy similar to [`MergeSort`](@ref).
-  * *good performance* for almost all large collections.
-  * *quadratic worst case runtime* in pathological cases
-    (vanishingly rare for non-malicious input)
 """
-const QuickSort = PartialQuickSort(missing, missing)
+struct PartialQuickSort{L<:Union{Integer,Missing}, H<:Union{Integer,Missing}, T<:Algorithm} <: Algorithm
+    lo::L
+    hi::H
+    next::T
+end
+PartialQuickSort(k::Integer) = InitialOptimizations(PartialQuickSort(missing, k, SMALL_ALGORITHM))
+PartialQuickSort(k::OrdinalRange) = InitialOptimizations(PartialQuickSort(first(k), last(k), SMALL_ALGORITHM))
+_PartialQuickSort(k::Integer) = InitialOptimizations(PartialQuickSort(k:k))
+_PartialQuickSort(k::OrdinalRange) = InitialOptimizations(PartialQuickSort(k))
 
 """
-    MergeSort
+    QuickSort
 
-Indicate that a sorting function should use the merge sort algorithm.
+Indicate that a sorting function should use the quick sort algorithm.
 
-Merge sort divides the collection into subcollections and
-repeatedly merges them, sorting each subcollection at each step,
-until the entire collection has been recombined in sorted form.
+Quick sort picks a pivot element, partitions the array based on the pivot,
+and then sorts the elements before and after the pivot recursively.
 
 Characteristics:
-  * *stable*: preserves the ordering of elements which compare
-    equal (e.g. "a" and "A" in a sort of letters which ignores
-    case).
+  * *stable*: preserves the ordering of elements which compare equal
+    (e.g. "a" and "A" in a sort of letters which ignores case).
   * *not in-place* in memory.
-  * *divide-and-conquer* sort strategy.
-"""
-const MergeSort = MergeSortAlg()
-
-"""
-    AdaptiveSort
-
-Indicate that a sorting function should use the fastest available stable algorithm.
-
-Currently, AdaptiveSort uses
-  * [`InsertionSort`](@ref) for short vectors
-  * [`QuickSort`](@ref) for vectors that are not [`UIntMappable`](@ref)
-  * Radix sort for long vectors
-  * Counting sort for vectors of integers spanning a short range
+  * *divide-and-conquer*: sort strategy similar to [`MergeSort`](@ref).
+  * *good performance* for almost all large collections.
+  * *quadratic worst case runtime* in pathological cases
+    (vanishingly rare for non-malicious input)
 """
-const AdaptiveSort = AdaptiveSortAlg()
-
-const DEFAULT_UNSTABLE = AdaptiveSort
-const DEFAULT_STABLE   = AdaptiveSort
-const SMALL_ALGORITHM  = InsertionSort
-const SMALL_THRESHOLD  = 20
-
-function sort!(v::AbstractVector, lo::Integer, hi::Integer, ::InsertionSortAlg, o::Ordering)
-    lo_plus_1 = (lo + 1)::Integer
-    @inbounds for i = lo_plus_1:hi
-        j = i
-        x = v[i]
-        while j > lo
-            y = v[j-1]
-            if !(lt(o, x, y)::Bool)
-                break
-            end
-            v[j] = y
-            j -= 1
-        end
-        v[j] = x
-    end
-    return v
-end
+const QuickSort = PartialQuickSort(missing, missing, SMALL_ALGORITHM)
 
 # select a pivot for QuickSort
 #
@@ -570,20 +889,10 @@ function partition!(t::AbstractVector, lo::Integer, hi::Integer, o::Ordering, v:
     pivot, lo-trues
 end
 
-function sort!(v::AbstractVector, lo::Integer, hi::Integer, a::PartialQuickSort,
-               o::Ordering, t::AbstractVector=similar(v), swap=false, rev=false;
-               check_presorted=true)
-
-    if check_presorted && !rev && !swap
-        # Even if we are only sorting a short region, we can only short-circuit if the whole
-        # vector is presorted. A weaker condition is possible, but unlikely to be useful.
-        if _issorted(v, lo, hi, o)
-            return v
-        elseif _issorted(v, lo, hi, Lt((x, y) -> !lt(o, x, y)))
-            # Reverse only if necessary. Using issorted(..., Reverse(o)) would violate stability.
-            return reverse!(v, lo, hi)
-        end
-    end
+function _sort!(v::AbstractVector, a::PartialQuickSort, o::Ordering;
+                lo=firstindex(v), hi=lastindex(v), scratch=similar(v),
+                t=reinterpret(eltype(v), checkbounds(Bool, scratch, lo:hi) ? scratch : resize!(scratch, length(v))),
+                swap=false, rev=false, kw...)
 
     while lo < hi && hi - lo > SMALL_THRESHOLD
         pivot, j = swap ? partition!(v, lo, hi, o, t, rev) : partition!(t, lo, hi, o, v, rev)
@@ -603,62 +912,41 @@ function sort!(v::AbstractVector, lo::Integer, hi::Integer, a::PartialQuickSort,
         elseif j-lo < hi-j
             # Sort the lower part recursively because it is smaller. Recursing on the
             # smaller part guarantees O(log(n)) stack space even on pathological inputs.
-            sort!(v, lo, j-1, a, o, t, swap, rev; check_presorted=false)
+            _sort!(v, a, o; lo, hi=j-1, scratch, t, swap, rev, kw...)
             lo = j+1
             rev = !rev
         else # Sort the higher part recursively
-            sort!(v, j+1, hi, a, o, t, swap, !rev; check_presorted=false)
+            _sort!(v, a, o; lo=j+1, hi, scratch, t, swap, rev=!rev, kw...)
             hi = j-1
         end
     end
     hi < lo && return v
     swap && copyto!(v, lo, t, lo, hi-lo+1)
     rev && reverse!(v, lo, hi)
-    sort!(v, lo, hi, SMALL_ALGORITHM, o)
+    _sort!(v, a.next, o; lo, hi, scratch, t, kw...)
 end
 
-function sort!(v::AbstractVector{T}, lo::Integer, hi::Integer, a::MergeSortAlg, o::Ordering,
-        t0::Union{AbstractVector{T}, Nothing}=nothing) where T
-    @inbounds if lo < hi
-        hi-lo <= SMALL_THRESHOLD && return sort!(v, lo, hi, SMALL_ALGORITHM, o)
 
-        m = midpoint(lo, hi)
-
-        t = t0 === nothing ? similar(v, m-lo+1) : t0
-        length(t) < m-lo+1 && resize!(t, m-lo+1)
-        require_one_based_indexing(t)
-
-        sort!(v, lo,  m,  a, o, t)
-        sort!(v, m+1, hi, a, o, t)
-
-        i, j = 1, lo
-        while j <= m
-            t[i] = v[j]
-            i += 1
-            j += 1
-        end
 
-        i, k = 1, lo
-        while k < j <= hi
-            if lt(o, v[j], t[i])
-                v[k] = v[j]
-                j += 1
-            else
-                v[k] = t[i]
-                i += 1
-            end
-            k += 1
-        end
-        while k < j
-            v[k] = t[i]
-            k += 1
-            i += 1
-        end
+#
+# StableCheckSorted
+#
+struct StableCheckSorted{T<:Algorithm} <: Algorithm
+    next::T
+end
+function _sort!(v::AbstractVector, a::StableCheckSorted, o::Ordering;
+                lo=firstindex(v), hi=lastindex(v), kw...)
+    if _issorted(v, lo, hi, o)
+        return v
+    elseif _issorted(v, lo, hi, Lt((x, y) -> !lt(o, x, y)))
+        # Reverse only if necessary. Using issorted(..., Reverse(o)) would violate stability.
+        return reverse!(v, lo, hi)
     end
 
-    return v
+    _sort!(v, a.next, o; lo, hi, kw...)
 end
 
+
 # This is a stable least significant bit first radix sort.
 #
 # That is, it first sorts the entire vector by the last chunk_size bits, then by the second
@@ -725,23 +1013,6 @@ function radix_chunk_size_heuristic(lo::Integer, hi::Integer, bits::Unsigned)
     UInt8(cld(bits, cld(bits, guess)))
 end
 
-# For AbstractVector{Bool}, counting sort is always best.
-# This is an implementation of counting sort specialized for Bools.
-# Accepts unused scratch space to avoid method ambiguity.
-function sort!(v::AbstractVector{Bool}, lo::Integer, hi::Integer, ::AdaptiveSortAlg, o::Ordering,
-        t::Union{AbstractVector{Bool}, Nothing}=nothing)
-    first = lt(o, false, true) ? false : lt(o, true, false) ? true : return v
-    count = 0
-    @inbounds for i in lo:hi
-        if v[i] == first
-            count += 1
-        end
-    end
-    @inbounds v[lo:lo+count-1] .= first
-    @inbounds v[lo+count:hi] .= !first
-    v
-end
-
 maybe_unsigned(x::Integer) = x # this is necessary to avoid calling unsigned on BigInt
 maybe_unsigned(x::BitSigned) = unsigned(x)
 function _extrema(v::AbstractVector, lo::Integer, hi::Integer, o::Ordering)
@@ -760,130 +1031,20 @@ function _issorted(v::AbstractVector, lo::Integer, hi::Integer, o::Ordering)
     end
     true
 end
-function sort!(v::AbstractVector{T}, lo::Integer, hi::Integer, ::AdaptiveSortAlg, o::Ordering,
-               t::Union{AbstractVector{T}, Nothing}=nothing) where T
-    # if the sorting task is not UIntMappable, then we can't radix sort or sort_int_range!
-    # so we skip straight to the fallback algorithm which is comparison based.
-    U = UIntMappable(eltype(v), o)
-    U === nothing && return sort!(v, lo, hi, QuickSort, o)
-
-    # to avoid introducing excessive detection costs for the trivial sorting problem
-    # and to avoid overflow, we check for small inputs before any other runtime checks
-    hi <= lo && return v
-    lenm1 = maybe_unsigned(hi-lo) # adding 1 would risk overflow
-    # only count sort on a short range can compete with insertion sort when lenm1 < 40
-    # and the optimization is not worth the detection cost, so we use insertion sort.
-    lenm1 < 40 && return sort!(v, lo, hi, SMALL_ALGORITHM, o)
-
-    # For most arrays, a presorted check is cheap (overhead < 5%) and for most large
-    # arrays it is essentially free (<1%). Insertion sort runs in a fast O(n) on presorted
-    # input and this guarantees presorted input will always be efficiently handled
-    _issorted(v, lo, hi, o) && return v
-
-    # For large arrays, a reverse-sorted check is essentially free (overhead < 1%)
-    if lenm1 >= 500 && _issorted(v, lo, hi, ReverseOrdering(o))
-        # If reversing is valid, do so. This does not violate stability
-        # because being UIntMappable implies a linear order.
-        reverse!(v, lo, hi)
-        return v
-    end
-
-    # UInt128 does not support fast bit shifting so we never
-    # dispatch to radix sort but we may still perform count sort
-    if sizeof(U) > 8
-        if T <: Integer && o isa DirectOrdering
-            v_min, v_max = _extrema(v, lo, hi, Forward)
-            v_range = maybe_unsigned(v_max-v_min)
-            v_range == 0 && return v # all same
-
-            # we know lenm1 ≥ 40, so this will never underflow.
-            # if lenm1 > 3.7e18 (59 exabytes), then this may incorrectly dispatch to fallback
-            if v_range < 5lenm1-100 # count sort will outperform comparison sort if v's range is small
-                return sort_int_range!(v, Int(v_range+1), v_min, o === Forward ? identity : reverse, lo, hi)
-            end
-        end
-        return sort!(v, lo, hi, QuickSort, o; check_presorted=false)
-    end
-
-    v_min, v_max = _extrema(v, lo, hi, o)
-    lt(o, v_min, v_max) || return v # all same
-    if T <: Integer && o isa DirectOrdering
-        R = o === Reverse
-        v_range = maybe_unsigned(R ? v_min-v_max : v_max-v_min)
-        if v_range < div(lenm1, 2) # count sort will be superior if v's range is very small
-            return sort_int_range!(v, Int(v_range+1), R ? v_max : v_min, R ? reverse : identity, lo, hi)
-        end
-    end
-
-    u_min, u_max = uint_map(v_min, o), uint_map(v_max, o)
-    u_range = maybe_unsigned(u_max-u_min)
-    if u_range < div(lenm1, 2) # count sort will be superior if u's range is very small
-        u = uint_map!(v, lo, hi, o)
-        sort_int_range!(u, Int(u_range+1), u_min, identity, lo, hi)
-        return uint_unmap!(v, u, lo, hi, o)
-    end
-
-    # if u's range is small, then once we subtract out v_min, we'll get a vector like
-    # UInt16[0x001a, 0x0015, 0x0006, 0x001b, 0x0008, 0x000c, 0x0001, 0x000e, 0x001c, 0x0009]
-    # where we only need to radix over the last few bits (5, in the example).
-    bits = unsigned(8sizeof(u_range) - leading_zeros(u_range))
-
-    # radix sort runs in O(bits * lenm1), quick sort runs in O(lenm1 * log(lenm1)).
-    # dividing both sides by lenm1 and introducing empirical constant factors yields
-    # the following heuristic for when QuickSort is faster than RadixSort
-    if 22log(lenm1) < bits + 70
-        return if lenm1 > 80
-            sort!(v, lo, hi, QuickSort, o; check_presorted=false)
-        else
-            sort!(v, lo, hi, SMALL_ALGORITHM, o)
-        end
-    end
 
-    # At this point, we are committed to radix sort.
-    u = uint_map!(v, lo, hi, o)
-
-    # we subtract u_min to avoid radixing over unnecessary bits. For example,
-    # Int32[3, -1, 2] uint_maps to UInt32[0x80000003, 0x7fffffff, 0x80000002]
-    # which uses all 32 bits, but once we subtract u_min = 0x7fffffff, we are left with
-    # UInt32[0x00000004, 0x00000000, 0x00000003] which uses only 3 bits, and
-    # Float32[2.012, 400.0, 12.345] uint_maps to UInt32[0x3fff3b63, 0x3c37ffff, 0x414570a4]
-    # which is reduced to UInt32[0x03c73b64, 0x00000000, 0x050d70a5] using only 26 bits.
-    # the overhead for this subtraction is small enough that it is worthwhile in many cases.
+## default sorting policy ##
 
-    # this is faster than u[lo:hi] .-= u_min as of v1.9.0-DEV.100
-    @inbounds for i in lo:hi
-        u[i] -= u_min
-    end
+InitialOptimizations(x) = MissingOptimization(BoolOptimization(Small{10}(IEEEFloatOptimization(x))))
+const DEFAULT_STABLE = InitialOptimizations(IsUIntMappable(
+    Small{40}(CheckSorted(ComputeExtrema(ConsiderCountingSort(ConsiderRadixSort(Small{80}(QuickSort)))))),
+    StableCheckSorted(QuickSort)))
+const DEFAULT_UNSTABLE = DEFAULT_STABLE
+const SMALL_THRESHOLD  = 20
 
-    len = lenm1 + 1
-    if t !== nothing && checkbounds(Bool, t, lo:hi) # Fully preallocated and aligned scratch space
-        u2 = radix_sort!(u, lo, hi, bits, reinterpret(U, t))
-        uint_unmap!(v, u2, lo, hi, o, u_min)
-    elseif t !== nothing && (applicable(resize!, t, len) || length(t) >= len) # Viable scratch space
-        length(t) >= len || resize!(t, len)
-        t1 = axes(t, 1) isa OneTo ? t : view(t, firstindex(t):lastindex(t))
-        u2 = radix_sort!(view(u, lo:hi), 1, len, bits, reinterpret(U, t1))
-        uint_unmap!(view(v, lo:hi), u2, 1, len, o, u_min)
-    else # No viable scratch space
-        u2 = radix_sort!(u, lo, hi, bits, similar(u))
-        uint_unmap!(v, u2, lo, hi, o, u_min)
-    end
-end
 
-## generic sorting methods ##
 
 defalg(v::AbstractArray) = DEFAULT_STABLE
 
-function sort!(v::AbstractVector{T}, alg::Algorithm,
-               order::Ordering, t::Union{AbstractVector{T}, Nothing}=nothing) where T
-    sort!(v, firstindex(v), lastindex(v), alg, order, t)
-end
-
-function sort!(v::AbstractVector{T}, lo::Integer, hi::Integer, alg::Algorithm,
-               order::Ordering, t::Union{AbstractVector{T}, Nothing}=nothing) where T
-    sort!(v, lo, hi, alg, order)
-end
-
 """
     sort!(v; alg::Algorithm=defalg(v), lt=isless, by=identity, rev::Bool=false, order::Ordering=Forward)
 
@@ -931,30 +1092,11 @@ function sort!(v::AbstractVector{T};
                rev::Union{Bool,Nothing}=nothing,
                order::Ordering=Forward,
                scratch::Union{AbstractVector{T}, Nothing}=nothing) where T
-    sort!(v, alg, ord(lt,by,rev,order), scratch)
-end
-
-# sort! for vectors of few unique integers
-function sort_int_range!(x::AbstractVector{<:Integer}, rangelen, minval, maybereverse,
-                         lo=firstindex(x), hi=lastindex(x))
-    offs = 1 - minval
-
-    counts = fill(0, rangelen)
-    @inbounds for i = lo:hi
-        counts[x[i] + offs] += 1
-    end
-
-    idx = lo
-    @inbounds for i = maybereverse(1:rangelen)
-        lastidx = idx + counts[i] - 1
-        val = i-offs
-        for j = idx:lastidx
-            x[j] = val
-        end
-        idx = lastidx + 1
+    if scratch === nothing # TODO: reduce redundancy
+        _sort!(v, alg, ord(lt,by,rev,order))
+    else
+        _sort!(v, alg, ord(lt,by,rev,order); scratch)
     end
-
-    return x
 end
 
 """
@@ -1080,7 +1222,7 @@ function partialsortperm!(ix::AbstractVector{<:Integer}, v::AbstractVector,
     end
 
     # do partial quicksort
-    sort!(ix, _PartialQuickSort(k), Perm(ord(lt, by, rev, order), v))
+    _sort!(ix, _PartialQuickSort(k), Perm(ord(lt, by, rev, order), v))
 
     maybeview(ix, k)
 end
@@ -1288,13 +1430,22 @@ function sort(A::AbstractArray{T};
     end
 end
 
-@noinline function sort_chunks!(Av, n, alg, order, t)
+@noinline function sort_chunks!(Av, n, alg, order, scratch)
     inds = LinearIndices(Av)
-    for s = first(inds):n:last(inds)
-        sort!(Av, s, s+n-1, alg, order, t)
+    for lo = first(inds):n:last(inds)
+        _sort!(Av, alg, order; lo, hi=lo+n-1, scratch)
     end
     Av
 end
+# TODO: reduce redundancy
+@noinline function sort_chunks!(Av, n, alg, order, scratch::Nothing)
+    inds = LinearIndices(Av)
+    for lo = first(inds):n:last(inds)
+        _sort!(Av, alg, order; lo, hi=lo+n-1)
+    end
+    Av
+end
+
 
 """
     sort!(A; dims::Integer, alg::Algorithm=defalg(A), lt=isless, by=identity, rev::Bool=false, order::Ordering=Forward)
@@ -1346,7 +1497,7 @@ function _sort!(A::AbstractArray{T}, ::Val{K},
     remdims = ntuple(i -> i == K ? 1 : axes(A, i), nd)
     for idx in CartesianIndices(remdims)
         Av = view(A, ntuple(i -> i == K ? Colon() : idx[i], nd)...)
-        sort!(Av, alg, order, scratch)
+        sort!(Av; alg, order, scratch)
     end
     A
 end
@@ -1437,167 +1588,72 @@ function uint_unmap!(v::AbstractVector, u::AbstractVector{U}, lo::Integer, hi::I
 end
 
 
-## fast clever sorting for floats ##
-
-module Float
-using ..Sort
-using ...Order
-using Base: IEEEFloat
-
-import Core.Intrinsics: slt_int
-import ..Sort: sort!, UIntMappable, uint_map, uint_unmap
-import ...Order: lt, DirectOrdering
-
-# fpsort is not safe for vectors of mixed bitwidth such as Vector{Union{Float32, Float64}}.
-# This type allows us to dispatch only when it is safe to do so. See #42739 for more info.
-const FPSortable = Union{
-    AbstractVector{Union{Float16, Missing}},
-    AbstractVector{Union{Float32, Missing}},
-    AbstractVector{Union{Float64, Missing}},
-    AbstractVector{Float16},
-    AbstractVector{Float32},
-    AbstractVector{Float64},
-    AbstractVector{Missing}}
-
-struct Left <: Ordering end
-struct Right <: Ordering end
-
-left(::DirectOrdering) = Left()
-right(::DirectOrdering) = Right()
-
-left(o::Perm) = Perm(left(o.order), o.data)
-right(o::Perm) = Perm(right(o.order), o.data)
-
-lt(::Left, x::T, y::T) where {T<:IEEEFloat} = slt_int(y, x)
-lt(::Right, x::T, y::T) where {T<:IEEEFloat} = slt_int(x, y)
-
-uint_map(x::Float16, ::Left) = ~reinterpret(UInt16, x)
-uint_unmap(::Type{Float16}, u::UInt16, ::Left) = reinterpret(Float16, ~u)
-uint_map(x::Float16, ::Right) = reinterpret(UInt16, x)
-uint_unmap(::Type{Float16}, u::UInt16, ::Right) = reinterpret(Float16, u)
-UIntMappable(::Type{Float16}, ::Union{Left, Right}) = UInt16
-
-uint_map(x::Float32, ::Left) = ~reinterpret(UInt32, x)
-uint_unmap(::Type{Float32}, u::UInt32, ::Left) = reinterpret(Float32, ~u)
-uint_map(x::Float32, ::Right) = reinterpret(UInt32, x)
-uint_unmap(::Type{Float32}, u::UInt32, ::Right) = reinterpret(Float32, u)
-UIntMappable(::Type{Float32}, ::Union{Left, Right}) = UInt32
-
-uint_map(x::Float64, ::Left) = ~reinterpret(UInt64, x)
-uint_unmap(::Type{Float64}, u::UInt64, ::Left) = reinterpret(Float64, ~u)
-uint_map(x::Float64, ::Right) = reinterpret(UInt64, x)
-uint_unmap(::Type{Float64}, u::UInt64, ::Right) = reinterpret(Float64, u)
-UIntMappable(::Type{Float64}, ::Union{Left, Right}) = UInt64
-
-isnan(o::DirectOrdering, x::IEEEFloat) = (x!=x)
-isnan(o::DirectOrdering, x::Missing) = false
-isnan(o::Perm, i::Integer) = isnan(o.order,o.data[i])
-
-ismissing(o::DirectOrdering, x::IEEEFloat) = false
-ismissing(o::DirectOrdering, x::Missing) = true
-ismissing(o::Perm, i::Integer) = ismissing(o.order,o.data[i])
-
-allowsmissing(::AbstractVector{T}, ::DirectOrdering) where {T} = T >: Missing
-allowsmissing(::AbstractVector{<:Integer},
-              ::Perm{<:DirectOrdering,<:AbstractVector{T}}) where {T} =
-    T >: Missing
-
-function specials2left!(testf::Function, v::AbstractVector, o::Ordering,
-                        lo::Integer=firstindex(v), hi::Integer=lastindex(v))
-    i = lo
-    @inbounds while i <= hi && testf(o,v[i])
-        i += 1
-    end
-    j = i + 1
-    @inbounds while j <= hi
-        if testf(o,v[j])
-            v[i], v[j] = v[j], v[i]
-            i += 1
-        end
-        j += 1
-    end
-    return i, hi
+
+### Unused ###
+
+struct MergeSortAlg{T <: Algorithm} <: Algorithm
+    next::T
 end
-function specials2right!(testf::Function, v::AbstractVector, o::Ordering,
-                         lo::Integer=firstindex(v), hi::Integer=lastindex(v))
-    i = hi
-    @inbounds while lo <= i && testf(o,v[i])
-        i -= 1
-    end
-    j = i - 1
-    @inbounds while lo <= j
-        if testf(o,v[j])
-            v[i], v[j] = v[j], v[i]
-            i -= 1
+"""
+    MergeSort
+
+Indicate that a sorting function should use the merge sort algorithm.
+
+Merge sort divides the collection into subcollections and
+repeatedly merges them, sorting each subcollection at each step,
+until the entire collection has been recombined in sorted form.
+
+Characteristics:
+  * *stable*: preserves the ordering of elements which compare
+    equal (e.g. "a" and "A" in a sort of letters which ignores
+    case).
+  * *not in-place* in memory.
+  * *divide-and-conquer* sort strategy.
+"""
+const MergeSort = MergeSortAlg(SMALL_ALGORITHM)
+
+
+function _sort!(v::AbstractVector, a::MergeSortAlg, o::Ordering;
+                lo=firstindex(v), hi=lastindex(v), scratch=nothing)
+    @inbounds if lo < hi
+        hi-lo <= SMALL_THRESHOLD && return _sort!(v, a.next, o; lo, hi)
+
+        m = midpoint(lo, hi)
+
+        t = scratch === nothing ? similar(v, m-lo+1) : scratch
+        length(t) < m-lo+1 && resize!(t, m-lo+1)
+        Base.require_one_based_indexing(t)
+
+        _sort!(v, a, o; lo, hi=m, scratch=t)
+        _sort!(v, a, o; lo=m+1, hi, scratch=t)
+
+        i, j = 1, lo
+        while j <= m
+            t[i] = v[j]
+            i += 1
+            j += 1
         end
-        j -= 1
-    end
-    return lo, i
-end
 
-function specials2left!(v::AbstractVector, a::Algorithm, o::Ordering)
-    lo, hi = firstindex(v), lastindex(v)
-    if allowsmissing(v, o)
-        i, _ = specials2left!((v, o) -> ismissing(v, o) || isnan(v, o), v, o, lo, hi)
-        sort!(v, lo, i-1, a, o)
-        return i, hi
-    else
-        return specials2left!(isnan, v, o, lo, hi)
-    end
-end
-function specials2right!(v::AbstractVector, a::Algorithm, o::Ordering)
-    lo, hi = firstindex(v), lastindex(v)
-    if allowsmissing(v, o)
-        _, i = specials2right!((v, o) -> ismissing(v, o) || isnan(v, o), v, o, lo, hi)
-        sort!(v, i+1, hi, a, o)
-        return lo, i
-    else
-        return specials2right!(isnan, v, o, lo, hi)
+        i, k = 1, lo
+        while k < j <= hi
+            if lt(o, v[j], t[i])
+                v[k] = v[j]
+                j += 1
+            else
+                v[k] = t[i]
+                i += 1
+            end
+            k += 1
+        end
+        while k < j
+            v[k] = t[i]
+            k += 1
+            i += 1
+        end
     end
-end
 
-specials2end!(v::AbstractVector, a::Algorithm, o::ForwardOrdering) =
-    specials2right!(v, a, o)
-specials2end!(v::AbstractVector, a::Algorithm, o::ReverseOrdering) =
-    specials2left!(v, a, o)
-specials2end!(v::AbstractVector{<:Integer}, a::Algorithm, o::Perm{<:ForwardOrdering}) =
-    specials2right!(v, a, o)
-specials2end!(v::AbstractVector{<:Integer}, a::Algorithm, o::Perm{<:ReverseOrdering}) =
-    specials2left!(v, a, o)
-
-issignleft(o::ForwardOrdering, x::IEEEFloat) = lt(o, x, zero(x))
-issignleft(o::ReverseOrdering, x::IEEEFloat) = lt(o, x, -zero(x))
-issignleft(o::Perm, i::Integer) = issignleft(o.order, o.data[i])
-
-function fpsort!(v::AbstractVector{T}, a::Algorithm, o::Ordering,
-                 t::Union{AbstractVector{T}, Nothing}=nothing) where T
-    # fpsort!'s optimizations speed up comparisons, of which there are O(nlogn).
-    # The overhead is O(n). For n < 10, it's not worth it.
-    length(v) < 10 && return sort!(v, firstindex(v), lastindex(v), SMALL_ALGORITHM, o, t)
-
-    i, j = lo, hi = specials2end!(v,a,o)
-    @inbounds while true
-        while i <= j &&  issignleft(o,v[i]); i += 1; end
-        while i <= j && !issignleft(o,v[j]); j -= 1; end
-        i <= j || break
-        v[i], v[j] = v[j], v[i]
-        i += 1; j -= 1
-    end
-    sort!(v, lo, j,  a, left(o), t)
-    sort!(v, i,  hi, a, right(o), t)
     return v
 end
 
 
-function sort!(v::FPSortable, a::Algorithm, o::DirectOrdering,
-               t::Union{FPSortable, Nothing}=nothing)
-    fpsort!(v, a, o, t)
-end
-function sort!(v::AbstractVector{T}, a::Algorithm, o::Perm{<:DirectOrdering,<:FPSortable},
-               t::Union{AbstractVector{T}, Nothing}=nothing) where T <: Union{Signed, Unsigned}
-    fpsort!(v, a, o, t)
-end
-
-end # module Sort.Float
-
 end # module Sort
diff --git a/test/sorting.jl b/test/sorting.jl
index 4a0299b2217c2..bd22c62bab6fc 100644
--- a/test/sorting.jl
+++ b/test/sorting.jl
@@ -79,8 +79,9 @@ end
 end
 
 @testset "stability" begin
-    for Alg in [InsertionSort, MergeSort, QuickSort, Base.Sort.AdaptiveSort, Base.DEFAULT_STABLE,
-        PartialQuickSort(missing, 1729), PartialQuickSort(1729, missing)]
+    for Alg in [InsertionSort, MergeSort, QuickSort, Base.DEFAULT_STABLE,
+            PartialQuickSort(missing, 1729, Base.Sort.SMALL_ALGORITHM),
+            PartialQuickSort(1729, missing, Base.Sort.SMALL_ALGORITHM)]
         @test issorted(sort(1:2000, alg=Alg, by=x->0))
         @test issorted(sort(1:2000, alg=Alg, by=x->x÷100))
     end
@@ -534,11 +535,11 @@ end
     @test issorted(a)
 
     a = view([9:-1:0;], :)::SubArray
-    Base.Sort.sort_int_range!(a, 10, 0, identity)  # test it supports non-Vector
+    Base.Sort._sort!(a, Base.Sort.CountingSort(), Base.Forward, mn=0, mx=9)  # test it supports non-Vector
     @test issorted(a)
 
     a = OffsetArray([9:-1:0;], -5)
-    Base.Sort.sort_int_range!(a, 10, 0, identity)
+    Base.Sort._sort!(a, Base.Sort.CountingSort(), Base.Forward, mn=0, mx=9)
     @test issorted(a)
 end
 
@@ -632,9 +633,9 @@ end
 @testset "uint mappings" begin
 
     #Construct value lists
-    floats = [T[-π, -1.0, -1/π, 1/π, 1.0, π, -0.0, 0.0, Inf, -Inf, NaN, -NaN,
-                prevfloat(T(0)), nextfloat(T(0)), prevfloat(T(Inf)), nextfloat(T(-Inf))]
-        for T in [Float16, Float32, Float64]]
+    floats = [reinterpret(U, vcat(T[-π, -1.0, -1/π, 1/π, 1.0, π, -0.0, 0.0, Inf, -Inf, NaN, -NaN,
+                prevfloat(T(0)), nextfloat(T(0)), prevfloat(T(Inf)), nextfloat(T(-Inf))], randnans(4)))
+        for (U, T) in [(UInt16, Float16), (UInt32, Float32), (UInt64, Float64)]]
 
     ints = [T[17, -T(17), 0, -one(T), 1, typemax(T), typemin(T), typemax(T)-1, typemin(T)+1]
         for T in Base.BitInteger_types]
@@ -650,21 +651,18 @@ end
     UIntN(::Val{8}) = UInt64
     UIntN(::Val{16}) = UInt128
     map(vals) do x
+        x isa Base.ReinterpretArray && return
         T = eltype(x)
         U = UIntN(Val(sizeof(T)))
         append!(x, rand(T, 4))
         append!(x, reinterpret.(T, rand(U, 4)))
-        if T <: AbstractFloat
-            mask = reinterpret(U, T(NaN))
-            append!(x, reinterpret.(T, mask .| rand(U, 4)))
-        end
     end
 
     for x in vals
         T = eltype(x)
         U = UIntN(Val(sizeof(T)))
-        for order in [Forward, Reverse, Base.Sort.Float.Left(), Base.Sort.Float.Right(), By(Forward, identity)]
-            if order isa Base.Order.By || ((T <: AbstractFloat) == (order isa DirectOrdering))
+        for order in [Forward, Reverse, By(Forward, identity)]
+            if order isa Base.Order.By
                 @test Base.Sort.UIntMappable(T, order) === nothing
                 continue
             end
@@ -681,10 +679,6 @@ end
 
             for a in x
                 for b in x
-                    if order === Base.Sort.Float.Left() || order === Base.Sort.Float.Right()
-                        # Left and Right orderings guarantee homogeneous sign and no NaNs
-                        (isnan(a) || isnan(b) || signbit(a) != signbit(b)) && continue
-                    end
                     @test Base.Order.lt(order, a, b) === Base.Order.lt(Forward, Base.Sort.uint_map(a, order), Base.Sort.uint_map(b, order))
                 end
             end
@@ -705,7 +699,7 @@ end
 
     # Nevertheless, it still works...
     for alg in [InsertionSort, MergeSort, QuickSort,
-            Base.Sort.AdaptiveSort, Base.DEFAULT_STABLE, Base.DEFAULT_UNSTABLE]
+            Base.DEFAULT_STABLE, Base.DEFAULT_UNSTABLE]
         @test sort(v, alg=alg, lt = <=) == s
     end
     @test partialsort(v, 172, lt = <=) == s[172]
@@ -716,7 +710,7 @@ end
     # this invalid lt order.
     perm = reverse(sortperm(v, rev=true))
     for alg in [InsertionSort, MergeSort, QuickSort,
-            Base.Sort.AdaptiveSort, Base.DEFAULT_STABLE, Base.DEFAULT_UNSTABLE]
+            Base.DEFAULT_STABLE, Base.DEFAULT_UNSTABLE]
         @test sort(1:n, alg=alg, lt = (i,j) -> v[i]<=v[j]) == perm
     end
     @test partialsort(1:n, 172, lt = (i,j) -> v[i]<=v[j]) == perm[172]
@@ -724,7 +718,7 @@ end
 
     # lt can be very poorly behaved and sort will still permute its input in some way.
     for alg in [InsertionSort, MergeSort, QuickSort,
-            Base.Sort.AdaptiveSort, Base.DEFAULT_STABLE, Base.DEFAULT_UNSTABLE]
+            Base.DEFAULT_STABLE, Base.DEFAULT_UNSTABLE]
         @test sort!(sort(v, alg=alg, lt = (x,y) -> rand([false, true]))) == s
     end
     @test partialsort(v, 172, lt = (x,y) -> rand([false, true])) ∈ 1:5

From 61e4006e0d1a37bf1d0e86d9de8920104deb1a36 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Sat, 29 Oct 2022 16:32:06 +0600
Subject: [PATCH 02/29] support 5- and 3-argument sort! for backwards
 compatibility

---
 base/sort.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/base/sort.jl b/base/sort.jl
index edf946b6c24ab..4d7c260324b14 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -1655,5 +1655,8 @@ function _sort!(v::AbstractVector, a::MergeSortAlg, o::Ordering;
     return v
 end
 
+# Support 3- and 5-argument versions of sort! for backwards compatibility
+sort!(v::AbstractVector, a::Algorithm, o::Ordering) = _sort!(v, a, o)
+sort!(v::AbstractVector, lo::Integer, hi::Integer, a::Algorithm, o::Ordering) = _sort!(v, a, o; lo, hi)
 
 end # module Sort

From 901182cfb87d613fb790a69a79c568e474bd2e44 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Mon, 17 Oct 2022 18:34:53 +0600
Subject: [PATCH 03/29] test for bug that slipped through test suite

---
 test/sorting.jl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/sorting.jl b/test/sorting.jl
index bd22c62bab6fc..95c303774b661 100644
--- a/test/sorting.jl
+++ b/test/sorting.jl
@@ -764,6 +764,10 @@ end
     end
 end
 
+@testset "Unions with missing" begin
+    @test issorted(sort(shuffle!(vcat(fill(missing, 10), rand(Int, 100)))))
+end
+
 # This testset is at the end of the file because it is slow.
 @testset "searchsorted" begin
     numTypes = [ Int8,  Int16,  Int32,  Int64,  Int128,

From e032ba6247bc56bca92ce2c90ec96cee58390d31 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Mon, 17 Oct 2022 18:34:59 +0600
Subject: [PATCH 04/29] fix bug

---
 base/sort.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index 4d7c260324b14..9c914aeb88d4e 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -431,7 +431,7 @@ struct WithoutMissingVector{T, U <: AbstractVector{Union{T, Missing}}} <: Abstra
 end
 Base.@propagate_inbounds function Base.getindex(v::WithoutMissingVector, i)
     out = v.data[i]
-    @assert !out isa Missing
+    @assert !(out isa Missing)
     out::eltype(v)
 end
 Base.@propagate_inbounds function Base.setindex!(v::WithoutMissingVector{T}, x::T, i) where T
@@ -750,7 +750,7 @@ function _sort!(v::AbstractVector, a::ConsiderRadixSort, o::DirectOrdering;
                 U = UIntMappable(eltype(v), o),
                 mn, mx, umn=uint_map(mn, o), umx=uint_map(mx, o), urange=umx-umn,
                 bits = unsigned(8sizeof(urange) - leading_zeros(urange)), kw...)
-    if sizeof(U) <= 8 && bits+70 < 22log(lenm1)
+    if sizeof(U) <= 8 && bits+70 < 22log(lenm1) # TODO there are some unexpected allocations here
         _sort!(v, a.radix, o; lo, hi, lenm1, mn, mx, umn, umx, urange, bits, kw...)
     else
         _sort!(v, a.next, o; lo, hi, lenm1, mn, mx, umn, umx, urange, bits, kw...)

From e6cfee0ce46fc992a45227ed98ef500471c1166c Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Thu, 20 Oct 2022 17:36:45 +0600
Subject: [PATCH 05/29] make send_to_end more human friendly (and less compiler
 friendly! introduces regressions.)

---
 base/sort.jl | 55 ++++++++++++++++++++++++++--------------------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index 9c914aeb88d4e..d39818e87c8fd 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -440,31 +440,17 @@ Base.@propagate_inbounds function Base.setindex!(v::WithoutMissingVector{T}, x::
 end
 Base.size(v::WithoutMissingVector) = size(v.data)
 
-# TODO simplify this further, remove redundancy, try a reverse view.
 """
-    send_to_end!(f::Function, v::AbstractVector)
+    send_to_end!(f::Function, v::AbstractVector; [lo, hi])
 
-Send every element of `v` for which `f` returns `true` to the end of that range and return
-the number of elements index of the last element which for which `f` returns `false`.
+Send every element of `v` for which `f` returns `true` to the end of the vector and return
+the index of the last element for which `f` returns `false`.
+
+`send_to_end!(f, v; lo, hi)` is equivalent to `send_to_end!(f, view(v, lo:hi))+lo-1`.
 
 Preserves the order of the elements that are not sent to the end.
 """
-function send_to_end!(f::Function, v::AbstractVector, ::ReverseOrdering; lo, hi)
-    i = hi
-    @inbounds while lo <= i && !f(v[i])
-        i -= 1
-    end
-    j = i - 1
-    @inbounds while lo <= j
-        if !f(v[j])
-            v[i], v[j] = v[j], v[i]
-            i -= 1
-        end
-        j -= 1
-    end
-    return (i+1, hi), (lo, i)
-end
-function send_to_end!(f::Function, v::AbstractVector, ::ForwardOrdering; lo, hi)
+function send_to_end!(f::F, v::AbstractVector; lo=firstindex(v), hi=lastindex(v)) where F <: Function
     i = lo
     @inbounds while i <= hi && !f(v[i])
         i += 1
@@ -477,17 +463,32 @@ function send_to_end!(f::Function, v::AbstractVector, ::ForwardOrdering; lo, hi)
         end
         j += 1
     end
-    return (lo, i-1), (i, hi)
+    i - 1
 end
+"""
+    send_to_end!(f::Function, v::AbstractVector, o::DirectOrdering[, end_stable]; lo, hi)
+
+Return `(a, b)` where `v[a:b]` are the elements that are not sent to the end.
+
+If `o isa ReverseOrdering` then the "end" of `v` is `v[lo]`.
+
+If `end_stable` is set, the elements that are sent to the end are stable instead of the
+elements that are not.
+"""
+@inline send_to_end!(f::F, v::AbstractVector, ::ForwardOrdering, end_stable=false; lo, hi) where F <: Function =
+    end_stable ? (lo, hi-send_to_end!(!f, view(v, hi:-1:lo))) : (lo, send_to_end!(f, v; lo, hi))
+@inline send_to_end!(f::F, v::AbstractVector, ::ReverseOrdering, end_stable=false; lo, hi) where F <: Function =
+    end_stable ? (send_to_end!(!f, v; lo, hi)+1, hi) : (hi-send_to_end!(f, view(v, hi:-1:lo))+1, hi)
+
 
 function _sort!(v::AbstractVector, a::MissingOptimization, o::Ordering;
                 lo=firstindex(v), hi=lastindex(v), kw...)
     if nonmissingtype(eltype(v)) != eltype(v) && o isa DirectOrdering
-        (lo, hi), _ = send_to_end!(ismissing, v, o; lo, hi)
+        lo, hi = send_to_end!(ismissing, v, o; lo, hi)
         _sort!(WithoutMissingVector(v, unsafe=true), a.next, o; lo, hi, kw...)
         v
     elseif eltype(v) <: Integer && o isa Perm{DirectOrdering} && nonmissingtype(eltype(o.data)) != eltype(o.data)
-        (lo, hi), _ = send_to_end!(i -> ismissing(@inbounds o.data[i]), v, o)
+        lo, hi = send_to_end!(i -> ismissing(@inbounds o.data[i]), v, o)
         _sort!(v, a.next, Perm(o.order, WithoutMissingVector(o.data, unsafe=true)); lo, hi, kw...)
     else
         _sort!(v, a.next, o; lo, hi, kw...)
@@ -512,15 +513,15 @@ is_concrete_IEEEFloat(T::Type) = T <: Base.IEEEFloat && isconcretetype(T)
 function _sort!(v::AbstractVector, a::IEEEFloatOptimization, o::Ordering;
                 lo=firstindex(v), hi=lastindex(v), kw...)
     if is_concrete_IEEEFloat(eltype(v)) && o isa DirectOrdering
-        _, (lo, hi) = send_to_end!(!isnan, v, ReverseOrdering(o); lo, hi)
+        lo, hi = send_to_end!(isnan, v, o, true; lo, hi)
         iv = reinterpret(UIntType(eltype(v)), v)
-        (_, j), _ = send_to_end!(x -> after_zero(o, x), v, Forward; lo, hi)
+        j = send_to_end!(x -> after_zero(o, x), v; lo, hi)
         _sort!(iv, a.next, Reverse; lo, hi=j, kw...)
         _sort!(iv, a.next, Forward; lo=j+1, hi, kw...)
     elseif eltype(v) <: Integer && o isa Perm && o.order isa DirectOrdering && is_concrete_IEEEFloat(eltype(o.data))
-        _, (lo, hi) = send_to_end!(i -> !isnan(@inbounds o.data[i]), v, ReverseOrdering(o.order); lo, hi)
+        lo, hi = send_to_end!(i -> isnan(@inbounds o.data[i]), v, o.order, true; lo, hi)
         ip = reinterpret(UIntType(eltype(o.data)), o.data)
-        (_, j), _ = send_to_end!(i -> after_zero(o.order, @inbounds o.data[i]), v, Forward; lo, hi)
+        j = send_to_end!(i -> after_zero(o.order, @inbounds o.data[i]), v; lo, hi)
         _sort!(v, a.next, Perm(Reverse, ip); lo, hi=j, kw...)
         _sort!(v, a.next, Perm(Forward, ip); lo=j+1, hi, kw...)
     else

From f16058211d8dc677d279f1ad901a1e59ded623f8 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Sat, 29 Oct 2022 11:51:35 +0600
Subject: [PATCH 06/29] Give each sorting pass and DEFAULT_STABLE a docstring

---
 base/sort.jl | 286 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 210 insertions(+), 76 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index d39818e87c8fd..7216b97226f35 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -412,10 +412,15 @@ insorted(x, r::AbstractRange) = in(x, r)
 abstract type Algorithm end
 
 
+"""
+    MissingOptimization(next) <: Algorithm
 
-#
-# Missing values always go at the end
-#
+Filter out missing values.
+
+Missing values are placed after other values according to `DirectOrdering`s. This pass puts
+them there and passes on a view into the original vector that excludes the missing values.
+This pass is triggered for both `sort([1, missing, 3])` and `sortperm([1, missing, 3])`.
+"""
 struct MissingOptimization{T <: Algorithm} <: Algorithm
     next::T
 end
@@ -496,10 +501,16 @@ function _sort!(v::AbstractVector, a::MissingOptimization, o::Ordering;
 end
 
 
+"""
+    IEEEFloatOptimization(next) <: Algorithm
 
-#
-# fast clever sorting for floats
-#
+Move NaN values to the end, partition by sign, and reinterpret the rest as unsigned integers.
+
+IEEE floating point numbers (`Float64`, `Float32`, and `Float16`) compare the same as
+unsigned integers with the bits with a few exceptions. This pass
+
+This pass is triggered for both `sort([1.0, NaN, 3.0])` and `sortperm([1.0, NaN, 3.0])`.
+"""
 struct IEEEFloatOptimization{T <: Algorithm} <: Algorithm
     next::T
 end
@@ -531,10 +542,14 @@ function _sort!(v::AbstractVector, a::IEEEFloatOptimization, o::Ordering;
 end
 
 
+"""
+    BoolOptimization(next) <: Algorithm
+
+Sort `AbstractVector{Bool}`s using a specialized version of counting sort.
 
-# For AbstractVector{Bool}, counting sort is always best.
-# This is an implementation of counting sort specialized for Bools.
-# Accepts unused scratch to avoid method ambiguity.
+Accesses each element at most twice (one read and one write), and performs at most two
+comparisons.
+"""
 struct BoolOptimization{T <: Algorithm} <: Algorithm
     next::T
 end
@@ -553,10 +568,15 @@ function _sort!(v::AbstractVector{Bool}, ::BoolOptimization, o::Ordering; lo::In
 end
 
 
+"""
+    IsUIntMappable(yes, no) <: Algorithm
 
-#
-#
-#
+Determines if the elements of a vector can be mapped to unsigned integers while preserving
+their order under the specified ordering.
+
+If they can be, dispatch to the `yes` algorithm and record the unsigned integer type that
+the elements may be mapped to. Otherwise dispatch to the `no` algorithm.
+"""
 struct IsUIntMappable{T <: Algorithm, U <: Algorithm} <: Algorithm
     yes::T
     no::U
@@ -571,10 +591,12 @@ function _sort!(v::AbstractVector, a::IsUIntMappable, o::Ordering;
 end
 
 
+"""
+    Small{N}(small=SMALL_ALGORITHM, big) <: Algorithm
 
-#
-#
-#
+Sort inputs with `length(lo:hi) <= N` using the `small` algorithm. Otherwise use the `big`
+algorithm.
+"""
 struct Small{N, T <: Algorithm, U <: Algorithm} <: Algorithm
     small::T
     big::U
@@ -590,27 +612,21 @@ function _sort!(v::AbstractVector, a::Small{N}, o::Ordering;
 end
 
 
-
-#
-#
-#
 struct InsertionSortAlg <: Algorithm end
-
 """
-    InsertionSort
+    InsertionSort
 
-Indicate that a sorting function should use the insertion sort algorithm.
+Use the insertion sort algorithm.
 
 Insertion sort traverses the collection one element at a time, inserting
 each element into its correct, sorted position in the output vector.
 
 Characteristics:
-  * *stable*: preserves the ordering of elements which
-    compare equal (e.g. "a" and "A" in a sort of letters
-    which ignores case).
-  * *in-place* in memory.
-  * *quadratic performance* in the number of elements to be sorted:
-    it is well-suited to small collections but should not be used for large ones.
+* *stable*: preserves the ordering of elements which compare equal
+(e.g. "a" and "A" in a sort of letters which ignores case).
+* *in-place* in memory.
+* *quadratic performance* in the number of elements to be sorted:
+it is well-suited to small collections but should not be used for large ones.
 """
 const InsertionSort = InsertionSortAlg()
 const SMALL_ALGORITHM = InsertionSort
@@ -634,24 +650,24 @@ function _sort!(v::AbstractVector, ::InsertionSortAlg, o::Ordering;
 end
 
 
+"""
+    CheckSorted(next) <: Algorithm
 
-#
-#
-#
+Check if the input is already sorted and for large inputs, also check if it is
+reverse-sorted. The reverse-sorted check is unstable.
+"""
 struct CheckSorted{T <: Algorithm} <: Algorithm
     next::T
 end
 function _sort!(v::AbstractVector, a::CheckSorted, o::Ordering;
                 lo=firstindex(v), hi=lastindex(v), lenm1 = hi-lo, kw...)
     # For most arrays, a presorted check is cheap (overhead < 5%) and for most large
-    # arrays it is essentially free (<1%). Insertion sort runs in a fast O(n) on presorted
-    # input and this guarantees presorted input will always be efficiently handled
+    # arrays it is essentially free (<1%).
     _issorted(v, lo, hi, o) && return v
 
-    # For large arrays, a reverse-sorted check is essentially free (overhead < 1%)
+    # For most large arrays, a reverse-sorted check is essentially free (overhead < 1%)
     if lenm1 >= 500 && _issorted(v, lo, hi, ReverseOrdering(o))
-        # If reversing is valid, do so. This does not violate stability
-        # because being UIntMappable implies a linear order.
+        # If reversing is valid, do so. This violates stability.
         reverse!(v, lo, hi)
         return v
     end
@@ -660,10 +676,14 @@ function _sort!(v::AbstractVector, a::CheckSorted, o::Ordering;
 end
 
 
+"""
+    ComputeExtrema(next) <: Algorithm
 
-#
-# Prerequisite: region to be sorted [lo, hi] is nonempty
-#
+Compute the extrema of the input under the provided order.
+
+If the minimum is no less than the maximum, then the input is already sorted. Otherwise,
+dispatch to the `next` algorithm.
+"""
 struct ComputeExtrema{T <: Algorithm} <: Algorithm
     next::T
 end
@@ -683,10 +703,16 @@ function _sort!(v::AbstractVector, a::ComputeExtrema, o::Ordering;
 end
 
 
+"""
+    ConsiderCountingSort(counting=CountingSort(), next) <: Algorithm
 
-#
-# Consider counting sort
-#
+If the input's range is small enough, use the `counting` algorithm. Otherwise, dispatch to
+the `next` algorithm.
+
+For most types, the threshold is if the range is shorter than half the length, but for types
+larger than Int64, bitshifts are expensive and RadixSort is not viable, so the threshold is
+much more generous.
+"""
 struct ConsiderCountingSort{T <: Algorithm, U <: Algorithm} <: Algorithm
     counting::T
     next::U
@@ -706,10 +732,15 @@ end
 _sort!(v::AbstractVector, a::ConsiderCountingSort, o::Ordering; kw...) = _sort!(v, a.next, o; kw...)
 
 
+"""
+    CountingSort <: Algorithm
 
-#
-# Counting sort
-#
+Use the counting sort algorithm.
+
+`CountingSort` is an algorithm for sorting integers that runs in Θ(length + range) time and
+space. It counts the number of occurrences of each value in the input and then iterates
+through those counts repopulating the input with the values in sorted order.
+"""
 struct CountingSort <: Algorithm end
 maybe_reverse(o::ForwardOrdering, x) = x
 maybe_reverse(o::ReverseOrdering, x) = reverse(x)
@@ -737,10 +768,12 @@ function _sort!(v::AbstractVector{<:Integer}, ::CountingSort, o::DirectOrdering;
 end
 
 
+"""
+    ConsiderRadixSort(radix=RadixSort(), next) <: Algorithm
 
-#
-# Consider radix sort
-#
+If the number of bits in the input's range is small enough and the input supports efficient
+bitshifts, use the `radix` algorithm. Otherwise, dispatch to the `next` algorithm.
+"""
 struct ConsiderRadixSort{T <: Algorithm, U <: Algorithm} <: Algorithm
     radix::T
     next::U
@@ -759,10 +792,27 @@ function _sort!(v::AbstractVector, a::ConsiderRadixSort, o::DirectOrdering;
 end
 
 
+"""
+    RadixSort <: Algorithm
 
-#
-# Radix sort
-#
+Use the radix sort algorithm.
+
+`RadixSort` is a stable least significant bit first radix sort algorithm that runs in
+`O(length * log(range))` time and linear space.
+
+It first sorts the entire vector by the last `chunk_size` bits, then by the second
+to last `chunk_size` bits, and so on. Stability means that it will not reorder two elements
+that compare equal. This is essential so that the order introduced by earlier,
+less significant passes is preserved by later passes.
+
+Each pass divides the input into `2^chunk_size == mask+1` buckets. To do this, it
+ * counts the number of entries that fall into each bucket
+ * uses those counts to compute the indices to move elements of those buckets into
+ * moves elements into the computed indices in the swap array
+ * switches the swap and working array
+
+`chunk_size` is larger for larger inputs and determined by an empirical heuristic.
+"""
 struct RadixSort <: Algorithm end
 function _sort!(v::AbstractVector, a::RadixSort, o::DirectOrdering;
                 lo=firstindex(v), hi=lastindex(v), lenm1=hi-lo,
@@ -802,17 +852,13 @@ function _sort!(v::AbstractVector, a::RadixSort, o::DirectOrdering;
 end
 
 
-
-#
-# Quicksort
-#
 """
-    PartialQuickSort(lo::Union{Integer, Missing}, hi::Union{Integer, Missing})
+    PartialQuickSort(lo::Union{Integer, Missing}, hi::Union{Integer, Missing}, next::Algorithm) <: Algorithm
 
 Indicate that a sorting function should use the partial quick sort algorithm.
 
-Partial quick sort finds and sorts the elements that would end up in positions
-`lo:hi` using [`QuickSort`](@ref).
+Partial quick sort finds and sorts the elements that would end up in positions `lo:hi` using
+[`QuickSort`](@ref). It is recursive and uses the `next` algorithm for small chunks.
 
 Characteristics:
   * *stable*: preserves the ordering of elements which compare equal
@@ -928,10 +974,15 @@ function _sort!(v::AbstractVector, a::PartialQuickSort, o::Ordering;
 end
 
 
+"""
+    StableCheckSorted(next) <: Algorithm
 
-#
-# StableCheckSorted
-#
+Check if an input is sorted and/or reverse-sorted.
+
+The definition of reverse-sorted is that for every pair of adjacent elements, the latter is
+less than the former. This is stricter than `issorted(v, ReverseOrdering(o))` to avoid swapping pairs
+of elements that compare equal.
+"""
 struct StableCheckSorted{T<:Algorithm} <: Algorithm
     next::T
 end
@@ -948,19 +999,6 @@ function _sort!(v::AbstractVector, a::StableCheckSorted, o::Ordering;
 end
 
 
-# This is a stable least significant bit first radix sort.
-#
-# That is, it first sorts the entire vector by the last chunk_size bits, then by the second
-# to last chunk_size bits, and so on. Stability means that it will not reorder two elements
-# that compare equal. This is essential so that the order introduced by earlier,
-# less significant passes is preserved by later passes.
-#
-# Each pass divides the input into 2^chunk_size == mask+1 buckets. To do this, it
-#  * counts the number of entries that fall into each bucket
-#  * uses those counts to compute the indices to move elements of those buckets into
-#  * moves elements into the computed indices in the swap array
-#  * switches the swap and working array
-#
 # In the case of an odd number of passes, the returned vector will === the input vector t,
 # not v. This is one of the many reasons radix_sort! is not exported.
 function radix_sort!(v::AbstractVector{U}, lo::Integer, hi::Integer, bits::Unsigned,
@@ -1033,17 +1071,113 @@ function _issorted(v::AbstractVector, lo::Integer, hi::Integer, o::Ordering)
     true
 end
 
+
 ## default sorting policy ##
 
-InitialOptimizations(x) = MissingOptimization(BoolOptimization(Small{10}(IEEEFloatOptimization(x))))
+"""
+    InitialOptimizations(next) <: Algorithm
+
+Attempt to apply a suite of low-cost optimizations to the input vector before sorting.
+
+`InitialOptimizations` is an implementation detail and subject to change or removal in
+future versions of Julia.
+
+If `next` is stable, then `InitialOptimizations(next)` is also stable.
+
+The specific optimizations attempted by `InitialOptimizations` are
+[`MissingOptimization`](@ref), [`BoolOptimization`](@ref), dispatch to
+[`InsertionSort`](@ref) for inputs with `length <= 10`, and [`IEEEFloatOptimization`](@ref).
+"""
+InitialOptimizations(next) = MissingOptimization(BoolOptimization(Small{10}(IEEEFloatOptimization(next))))
+"""
+    DEFAULT_STABLE
+
+The default sorting algorithm.
+
+This algorithm is guaranteed to be stable (i.e. it will not reorder elements that compare
+equal). It makes an effort to be fast for most inputs.
+
+The algorithms used by `DEFAULT_STABLE` are an implementation detail. See extended help
+for the current dispatch system.
+
+# Extended Help
+
+`DEFAULT_STABLE` is composed of two parts: the [`InitialOptimizations`](@ref) and a hybrid
+of Radix, Insertion, Counting, and Quick sorts.
+
+We begin with MissingOptimization because it has no runtime cost when it is not
+triggered and can enable other optimizations to be applied later. For example,
+BoolOptimization cannot apply to an `AbstractVector{Union{Missing, Bool}}`, but after
+[`MissingOptimization`](@ref) is applied, that input will be converted into an
+`AbstractVector{Bool}`.
+
+We next apply [`BoolOptimization`](@ref) because it also has no runtime cost when it is not
+triggered and when it is triggered, it is an incredibly efficient algorithm (sorting `Bool`s
+is quite easy).
+
+Next, we dispatch to [`InsertionSort`](@ref) for inputs with `length <= 10`. This dispatch
+occurs before the [`IEEEFloatOptimization`](@ref) pass because the
+[`IEEEFloatOptimization`](@ref)s are not beneficial for very small inputs.
+
+To conclude the [`InitialOptimizations`](@ref), we apply [`IEEEFloatOptimization`](@ref).
+
+After these optimizations, we branch on whether radix sort and related algorithms can be
+applied to the input vector and ordering. We conduct this branch by testing if
+`UIntMappable(v, order) !== nothing`. That is, we see if we know of a reversible mapping
+from `eltype(v)` to `UInt` that preserves the ordering `order`. We perform this check after
+the initial optimizations because they can change the input vector's type and ordering to
+make them `UIntMappable`.
+
+If the input is not [`UIntMappable`](@ref), then we perform a presorted check and dispatch
+to [`QuickSort`](@ref).
+
+Otherwise, we dispatch to [`InsertionSort`](@ref) for inputs with `length <= 40` and then
+perform a presorted check ([`CheckSorted`](@ref)).
+
+We check for short inputs before performing the presorted check to avoid the overhead of the
+check for small inputs. Because the alternate dispatch is to [`InsertionSort`](@ref) which
+has efficient `O(n)` runtime on presorted inputs, the check is not necessary for small
+inputs.
+
+We check if the input is reverse-sorted for long vectors (more than 500 elements) because
+the check is essentially free unless the input is almost entirely reverse sorted.
+
+Note that once the input is determined to be [`UIntMappable`](@ref), we know the order forms
+a [total order](https://en.wikipedia.org/wiki/Total_order) over the inputs and so it is
+impossible to perform an unstable sort because no two elements can compare equal unless
+they _are_ equal, in which case switching them is undetectable. We utilize this fact to
+perform a more aggressive reverse sorted check that will reverse the vector `[3, 2, 2, 1]`.
+
+After these potential fast-paths are tried and failed, we [`ComputeExtrema`](@ref) of the
+input. This computation has a fairly fast `O(n)` runtime, but we still try to delay it until
+it is necessary.
+
+Next, we [`ConsiderCountingSort`](@ref). If the range of the input is small compared to its
+length, we apply [`CountingSort`](@ref).
+
+Next, we [`ConsiderRadixSort`](@ref). This is similar to the dispatch to counting sort,
+but we consider the number of _bits_ in the range, rather than the range itself.
+Consequently, we apply [`RadixSort`](@ref) for any reasonably long inputs that reach this
+stage.
+
+Finally, if the input has length less than 80, we dispatch to [`InsertionSort`](@ref) and
+otherwise we dispatch to [`QuickSort`](@ref).
+"""
 const DEFAULT_STABLE = InitialOptimizations(IsUIntMappable(
     Small{40}(CheckSorted(ComputeExtrema(ConsiderCountingSort(ConsiderRadixSort(Small{80}(QuickSort)))))),
     StableCheckSorted(QuickSort)))
+"""
+    DEFAULT_UNSTABLE
+
+An efficient sorting algorithm.
+
+The algorithms used by `DEFAULT_UNSTABLE` are an implementation detail. They are currently
+the same as those used by [`DEFAULT_STABLE`](@ref), but this is subject to change in the future.
+"""
 const DEFAULT_UNSTABLE = DEFAULT_STABLE
 const SMALL_THRESHOLD  = 20
 
 
-
 defalg(v::AbstractArray) = DEFAULT_STABLE
 
 """

From 15a44842e39493a8f5c00ea5056b67ad26b677b2 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Sun, 30 Oct 2022 14:20:56 +0600
Subject: [PATCH 07/29] add tests and fix typos they unveiled

---
 base/sort.jl    |  4 ++--
 test/sorting.jl | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index 7216b97226f35..42b10d32486d8 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -720,7 +720,7 @@ end
 ConsiderCountingSort(next) = ConsiderCountingSort(CountingSort(), next)
 function _sort!(v::AbstractVector{<:Integer}, a::ConsiderCountingSort, o::DirectOrdering;
                 lo=firstindex(v), hi=lastindex(v), lenm1=hi-lo,
-                U = UIntMapping(eltype(v), o),
+                U = UIntMappable(eltype(v), o),
                 mn, mx, range=maybe_unsigned(o === Reverse ? mn-mx : mx-mn), kw...)
 
     if range < (sizeof(U) > 8 ? 5lenm1-100 : div(lenm1, 2))
@@ -816,7 +816,7 @@ Each pass divides the input into `2^chunk_size == mask+1` buckets. To do this, i
 struct RadixSort <: Algorithm end
 function _sort!(v::AbstractVector, a::RadixSort, o::DirectOrdering;
                 lo=firstindex(v), hi=lastindex(v), lenm1=hi-lo,
-                mn, mx, umn=uint_mapping(mn, o), umx=uint_mapping(mx, o), urange=umx-umn,
+                mn, mx, umn=uint_map(mn, o), umx=uint_map(mx, o), urange=umx-umn,
                 bits = unsigned(8sizeof(urange) - leading_zeros(urange)),
                 U = UIntMappable(eltype(v), o), scratch=nothing, kw...)
 
diff --git a/test/sorting.jl b/test/sorting.jl
index 95c303774b661..d0892b2afb407 100644
--- a/test/sorting.jl
+++ b/test/sorting.jl
@@ -733,7 +733,6 @@ end
     @test issorted(k[idx], rev=true)
 end
 
-# This testset is at the end of the file because it is slow
 @testset "sort(x; scratch)" begin
     for n in [1,10,100,1000]
         v = rand(n)
@@ -768,6 +767,54 @@ end
     @test issorted(sort(shuffle!(vcat(fill(missing, 10), rand(Int, 100)))))
 end
 
+@testset "Specific algorithms" begin
+    let
+        requires_uint_mappable = Union{Base.Sort.RadixSort, Base.Sort.ConsiderRadixSort,
+            Base.Sort.CountingSort, Base.Sort.ConsiderCountingSort,
+            typeof(Base.Sort.DEFAULT_STABLE.next.next.big.next.yes),
+            typeof(Base.Sort.DEFAULT_STABLE.next.next.big.next.yes.big),
+            typeof(Base.Sort.DEFAULT_STABLE.next.next.big.next.yes.big.next)}
+
+        function test_alg(kw, alg, float=true)
+            for order in [Base.Forward, Base.Reverse, Base.By(x -> x^2)]
+                order isa Base.By && alg isa requires_uint_mappable && continue
+                for n in [1,7,179,1312]
+
+                    n == 1 && alg isa Base.Sort.RadixSort && continue
+
+                    x = rand(1:n+1, n)
+                    y = sort(x; order)
+                    @test y == Base.Sort._sort!(x, alg, order, (;kw(y)...)) === x
+
+                    alg isa requires_uint_mappable && continue
+
+                    x = randn(n)
+                    y = sort(x; order)
+                    @test y == Base.Sort._sort!(x, alg, order, (;kw(y)...)) === x
+                end
+            end
+        end
+        test_alg(alg) = test_alg(x -> (), alg)
+
+        function test_alg_rec(alg, extrema=false)
+            if extrema
+                test_alg(alg) do y
+                    (;mn=first(y),mx=last(y))
+                end
+            else
+                test_alg(alg)
+            end
+            extrema |= alg isa Base.Sort.ComputeExtrema
+            for name in fieldnames(typeof(alg))
+                a = getfield(alg, name)
+                a isa Base.Sort.Algorithm && test_alg_rec(a, extrema)
+            end
+        end
+
+        test_alg_rec(Base.DEFAULT_STABLE)
+    end
+end
+
 # This testset is at the end of the file because it is slow.
 @testset "searchsorted" begin
     numTypes = [ Int8,  Int16,  Int32,  Int64,  Int128,

From d82b09095afd307c52ea0fd36ecee70d9aa29a97 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Tue, 1 Nov 2022 19:05:11 +0600
Subject: [PATCH 08/29] avoid potential name conflict

---
 base/sort.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index 42b10d32486d8..e1fba0466a397 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -1619,9 +1619,9 @@ function sort!(A::AbstractArray{T};
                rev::Union{Bool,Nothing}=nothing,
                order::Ordering=Forward,
                scratch::Union{AbstractVector{T}, Nothing}=similar(A, size(A, dims))) where T
-    _sort!(A, Val(dims), alg, ord(lt, by, rev, order), scratch)
+    __sort!(A, Val(dims), alg, ord(lt, by, rev, order), scratch)
 end
-function _sort!(A::AbstractArray{T}, ::Val{K},
+function __sort!(A::AbstractArray{T}, ::Val{K},
                 alg::Algorithm,
                 order::Ordering,
                 scratch::Union{AbstractVector{T}, Nothing}) where {K,T}

From 029cbaed74fefbe260a91cc0a88781cd83a168ff Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Tue, 1 Nov 2022 19:11:39 +0600
Subject: [PATCH 09/29] switch to custom keyword handling

FIXES UNEXPECTED ALLOCATIONS
removes code that previously harbored bugs that slipped through the test suite
---
 base/sort.jl    | 189 ++++++++++++++++++++++++++----------------------
 test/sorting.jl |   4 +-
 2 files changed, 104 insertions(+), 89 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index e1fba0466a397..1ab423f0404a9 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -86,7 +86,7 @@ issorted(itr;
     issorted(itr, ord(lt,by,rev,order))
 
 function partialsort!(v::AbstractVector, k::Union{Integer,OrdinalRange}, o::Ordering)
-    _sort!(v, _PartialQuickSort(k), o)
+    _sort!(v, _PartialQuickSort(k), o, (;))
     maybeview(v, k)
 end
 
@@ -407,6 +407,42 @@ function insorted end
 insorted(x, v::AbstractVector; kw...) = !isempty(searchsorted(v, x; kw...))
 insorted(x, r::AbstractRange) = in(x, r)
 
+## Alternative keyword management
+
+macro getkw(syms...)
+    usyms = (Symbol(:_, sym) for sym in syms)
+    Expr(:block, (:($(esc(:((kw, $sym) = $usym(v, o, kw))))) for (sym, usym) in zip(syms, usyms))...)
+end
+
+for (sym, deps, exp, type) in [
+        (:lo, (), :(firstindex(v)), Integer),
+        (:hi, (), :(lastindex(v)),  Integer),
+        (:U, (), :(UIntMappable(eltype(v), o)),  Any), #type checking this comes at a runtime performance cost ???
+        (:lenm1, (:lo, :hi), :(hi-lo), Integer),
+        (:mn, (), :(throw(ArgumentError("mn is needed but has not been computed"))), :(eltype(v))),
+        (:mx, (), :(throw(ArgumentError("mx is needed but has not been computed"))), :(eltype(v))),
+        (:range, (:mn, :mx), quote
+            o isa DirectOrdering || throw(ArgumentError("Cannot compute range under ordering $o"))
+            maybe_unsigned(o === Reverse ? mn-mx : mx-mn)
+        end, Integer),
+        (:umn, (:mn,), :(uint_map(mn, o)), Unsigned),
+        (:umx, (:mx,), :(uint_map(mx, o)), Unsigned),
+        (:urange, (:umn, :umx), :(umx-umn), Unsigned),
+        (:bits, (:urange,), :(unsigned(8sizeof(urange) - leading_zeros(urange))), Unsigned),
+        (:scratch, (), nothing, :(Union{Nothing, AbstractVector})), # could have different eltype
+        (:t, (:lo, :hi, :scratch), quote
+            scratch === nothing ? similar(v) : reinterpret(eltype(v), checkbounds(Bool, scratch, lo:hi) ? scratch : resize!(scratch, length(v)))
+        end, :(AbstractVector{eltype(v)}))]
+    str = string(sym)
+    usym = Symbol(:_, sym)
+    @eval function $usym(v, o, kw)
+        Symbol($str) ∈ keys(kw) && return kw, kw[Symbol($str)]::$type # TODO this interpolation feels too complicated
+        @getkw $(deps...)
+        $sym = $exp
+        (;kw..., $sym), $sym::$type
+    end
+end
+
 ## sorting algorithm components ##
 
 abstract type Algorithm end
@@ -486,17 +522,17 @@ elements that are not
     end_stable ? (send_to_end!(!f, v; lo, hi)+1, hi) : (hi-send_to_end!(f, view(v, hi:-1:lo))+1, hi)
 
 
-function _sort!(v::AbstractVector, a::MissingOptimization, o::Ordering;
-                lo=firstindex(v), hi=lastindex(v), kw...)
+function _sort!(v::AbstractVector, a::MissingOptimization, o::Ordering, kw)
+    @getkw lo hi
     if nonmissingtype(eltype(v)) != eltype(v) && o isa DirectOrdering
         lo, hi = send_to_end!(ismissing, v, o; lo, hi)
-        _sort!(WithoutMissingVector(v, unsafe=true), a.next, o; lo, hi, kw...)
+        _sort!(WithoutMissingVector(v, unsafe=true), a.next, o, (;kw..., lo, hi))
         v
     elseif eltype(v) <: Integer && o isa Perm{DirectOrdering} && nonmissingtype(eltype(o.data)) != eltype(o.data)
         lo, hi = send_to_end!(i -> ismissing(@inbounds o.data[i]), v, o)
-        _sort!(v, a.next, Perm(o.order, WithoutMissingVector(o.data, unsafe=true)); lo, hi, kw...)
+        _sort!(v, a.next, Perm(o.order, WithoutMissingVector(o.data, unsafe=true)), (;kw..., lo, hi))
     else
-        _sort!(v, a.next, o; lo, hi, kw...)
+        _sort!(v, a.next, o, kw)
     end
 end
 
@@ -521,22 +557,22 @@ UIntType(::Type{Float64}) = UInt64
 after_zero(::ForwardOrdering, x) = 0 <= x
 after_zero(::ReverseOrdering, x) = x < 0
 is_concrete_IEEEFloat(T::Type) = T <: Base.IEEEFloat && isconcretetype(T)
-function _sort!(v::AbstractVector, a::IEEEFloatOptimization, o::Ordering;
-                lo=firstindex(v), hi=lastindex(v), kw...)
+function _sort!(v::AbstractVector, a::IEEEFloatOptimization, o::Ordering, kw)
+    @getkw lo hi
     if is_concrete_IEEEFloat(eltype(v)) && o isa DirectOrdering
         lo, hi = send_to_end!(isnan, v, o, true; lo, hi)
         iv = reinterpret(UIntType(eltype(v)), v)
         j = send_to_end!(x -> after_zero(o, x), v; lo, hi)
-        _sort!(iv, a.next, Reverse; lo, hi=j, kw...)
-        _sort!(iv, a.next, Forward; lo=j+1, hi, kw...)
+        _sort!(iv, a.next, Reverse, (;kw..., lo, hi=j))
+        _sort!(iv, a.next, Forward, (;kw..., lo=j+1, hi))
     elseif eltype(v) <: Integer && o isa Perm && o.order isa DirectOrdering && is_concrete_IEEEFloat(eltype(o.data))
         lo, hi = send_to_end!(i -> isnan(@inbounds o.data[i]), v, o.order, true; lo, hi)
         ip = reinterpret(UIntType(eltype(o.data)), o.data)
         j = send_to_end!(i -> after_zero(o.order, @inbounds o.data[i]), v; lo, hi)
-        _sort!(v, a.next, Perm(Reverse, ip); lo, hi=j, kw...)
-        _sort!(v, a.next, Perm(Forward, ip); lo=j+1, hi, kw...)
+        _sort!(v, a.next, Perm(Reverse, ip), (;kw..., lo, hi=j))
+        _sort!(v, a.next, Perm(Forward, ip), (;kw..., lo=j+1, hi))
     else
-        _sort!(v, a.next, o; lo, hi, kw...)
+        _sort!(v, a.next, o, kw)
     end
     v
 end
@@ -553,9 +589,10 @@ comparisons.
 struct BoolOptimization{T <: Algorithm} <: Algorithm
     next::T
 end
-_sort!(v::AbstractVector, a::BoolOptimization, o::Ordering; kw...) = _sort!(v, a.next, o; kw...)
-function _sort!(v::AbstractVector{Bool}, ::BoolOptimization, o::Ordering; lo::Integer, hi::Integer, kw...)
+_sort!(v::AbstractVector, a::BoolOptimization, o::Ordering, kw) = _sort!(v, a.next, o, kw)
+function _sort!(v::AbstractVector{Bool}, ::BoolOptimization, o::Ordering, kw)
     first = lt(o, false, true) ? false : lt(o, true, false) ? true : return v
+    @getkw lo hi
     count = 0
     @inbounds for i in lo:hi
         if v[i] == first
@@ -581,12 +618,12 @@ struct IsUIntMappable{T <: Algorithm, U <: Algorithm} <: Algorithm
     yes::T
     no::U
 end
-function _sort!(v::AbstractVector, a::IsUIntMappable, o::Ordering;
-                U = UIntMappable(eltype(v), o), kw...)
+function _sort!(v::AbstractVector, a::IsUIntMappable, o::Ordering, kw)
+    @getkw U
     if U !== nothing
-        _sort!(v, a.yes, o; U, kw...)
+        _sort!(v, a.yes, o, kw)
     else
-        _sort!(v, a.no, o; kw...)
+        _sort!(v, a.no, o, kw)
     end
 end
 
@@ -602,12 +639,12 @@ struct Small{N, T <: Algorithm, U <: Algorithm} <: Algorithm
     big::U
 end
 Small{N}(big) where N = Small{N, typeof(SMALL_ALGORITHM), typeof(big)}(SMALL_ALGORITHM, big)
-function _sort!(v::AbstractVector, a::Small{N}, o::Ordering;
-                lo::Integer=firstindex(v), hi::Integer=lastindex(v), lenm1 = hi-lo, kw...) where N
+function _sort!(v::AbstractVector, a::Small{N}, o::Ordering, kw) where N
+    @getkw lenm1
     if lenm1 < N
-        _sort!(v, a.small, o; lo, hi, lenm1, kw...)
+        _sort!(v, a.small, o, kw)
     else
-        _sort!(v, a.big, o; lo, hi, lenm1, kw...)
+        _sort!(v, a.big, o, kw)
     end
 end
 
@@ -630,8 +667,8 @@ it is well-suited to small collections but should not be used for large ones.
 """
 const InsertionSort = InsertionSortAlg()
 const SMALL_ALGORITHM = InsertionSort
-function _sort!(v::AbstractVector, ::InsertionSortAlg, o::Ordering;
-                lo=firstindex(v), hi=lastindex(v), kw...)
+function _sort!(v::AbstractVector, ::InsertionSortAlg, o::Ordering, kw)
+    @getkw lo hi
     lo_plus_1 = (lo + 1)::Integer
     @inbounds for i = lo_plus_1:hi
         j = i
@@ -659,8 +696,9 @@ reverse-sorted. The reverse-sorted check is unstable.
 struct CheckSorted{T <: Algorithm} <: Algorithm
     next::T
 end
-function _sort!(v::AbstractVector, a::CheckSorted, o::Ordering;
-                lo=firstindex(v), hi=lastindex(v), lenm1 = hi-lo, kw...)
+function _sort!(v::AbstractVector, a::CheckSorted, o::Ordering, kw)
+    @getkw lo hi lenm1
+
     # For most arrays, a presorted check is cheap (overhead < 5%) and for most large
     # arrays it is essentially free (<1%).
     _issorted(v, lo, hi, o) && return v
@@ -672,7 +710,7 @@ function _sort!(v::AbstractVector, a::CheckSorted, o::Ordering;
         return v
     end
 
-    _sort!(v, a.next, o; lo, hi, lenm1, kw...)
+    _sort!(v, a.next, o, kw)
 end
 
 
@@ -687,8 +725,8 @@ dispatch to the `next` algorithm.
 struct ComputeExtrema{T <: Algorithm} <: Algorithm
     next::T
 end
-function _sort!(v::AbstractVector, a::ComputeExtrema, o::Ordering;
-                lo=firstindex(v), hi=lastindex(v), kw...)
+function _sort!(v::AbstractVector, a::ComputeExtrema, o::Ordering, kw)
+    @getkw lo hi
     mn = mx = v[lo]
     @inbounds for i in (lo+1):hi
         vi = v[i]
@@ -699,7 +737,7 @@ function _sort!(v::AbstractVector, a::ComputeExtrema, o::Ordering;
 
     lt(o, mn, mx) || return v # all same
 
-    _sort!(v, a.next, o; lo, hi, mn, mx, kw...)
+    _sort!(v, a.next, o, (;kw..., mn, mx))
 end
 
 
@@ -718,18 +756,15 @@ struct ConsiderCountingSort{T <: Algorithm, U <: Algorithm} <: Algorithm
     next::U
 end
 ConsiderCountingSort(next) = ConsiderCountingSort(CountingSort(), next)
-function _sort!(v::AbstractVector{<:Integer}, a::ConsiderCountingSort, o::DirectOrdering;
-                lo=firstindex(v), hi=lastindex(v), lenm1=hi-lo,
-                U = UIntMappable(eltype(v), o),
-                mn, mx, range=maybe_unsigned(o === Reverse ? mn-mx : mx-mn), kw...)
-
+function _sort!(v::AbstractVector{<:Integer}, a::ConsiderCountingSort, o::DirectOrdering, kw)
+    @getkw lenm1 range U
     if range < (sizeof(U) > 8 ? 5lenm1-100 : div(lenm1, 2))
-        _sort!(v, a.counting, o; lo, hi, lenm1, mn, mx, range, kw...)
+        _sort!(v, a.counting, o, kw)
     else
-        _sort!(v, a.next, o; lo, hi, lenm1, mn, mx, range, kw...)
+        _sort!(v, a.next, o, kw)
     end
 end
-_sort!(v::AbstractVector, a::ConsiderCountingSort, o::Ordering; kw...) = _sort!(v, a.next, o; kw...)
+_sort!(v::AbstractVector, a::ConsiderCountingSort, o::Ordering, kw) = _sort!(v, a.next, o, kw)
 
 
 """
@@ -744,9 +779,8 @@ through those counts repopulating the input with the values in sorted order.
 struct CountingSort <: Algorithm end
 maybe_reverse(o::ForwardOrdering, x) = x
 maybe_reverse(o::ReverseOrdering, x) = reverse(x)
-function _sort!(v::AbstractVector{<:Integer}, ::CountingSort, o::DirectOrdering;
-                lo=firstindex(v), hi=lastindex(v), lenm1=hi-lo,
-                mn, mx, range=maybe_unsigned(o === Reverse ? mn-mx : mx-mn), kw...)
+function _sort!(v::AbstractVector{<:Integer}, ::CountingSort, o::DirectOrdering, kw)
+    @getkw lo hi mn mx range
     offs = 1 - (o === Reverse ? mx : mn)
 
     counts = fill(0, range+1)
@@ -779,15 +813,12 @@ struct ConsiderRadixSort{T <: Algorithm, U <: Algorithm} <: Algorithm
     next::U
 end
 ConsiderRadixSort(next) = ConsiderRadixSort(RadixSort(), next)
-function _sort!(v::AbstractVector, a::ConsiderRadixSort, o::DirectOrdering;
-                lo=firstindex(v), hi=lastindex(v), lenm1=hi-lo,
-                U = UIntMappable(eltype(v), o),
-                mn, mx, umn=uint_map(mn, o), umx=uint_map(mx, o), urange=umx-umn,
-                bits = unsigned(8sizeof(urange) - leading_zeros(urange)), kw...)
-    if sizeof(U) <= 8 && bits+70 < 22log(lenm1) # TODO there are some unexpected allocations here
-        _sort!(v, a.radix, o; lo, hi, lenm1, mn, mx, umn, umx, urange, bits, kw...)
+function _sort!(v::AbstractVector, a::ConsiderRadixSort, o::DirectOrdering, kw)
+    @getkw U bits lenm1
+    if sizeof(U) <= 8 && bits+70 < 22log(lenm1)
+        _sort!(v, a.radix, o, kw)
     else
-        _sort!(v, a.next, o; lo, hi, lenm1, mn, mx, umn, umx, urange, bits, kw...)
+        _sort!(v, a.next, o, kw)
     end
 end
 
@@ -814,11 +845,8 @@ Each pass divides the input into `2^chunk_size == mask+1` buckets. To do this, i
 `chunk_size` is larger for larger inputs and determined by an empirical heuristic.
 """
 struct RadixSort <: Algorithm end
-function _sort!(v::AbstractVector, a::RadixSort, o::DirectOrdering;
-                lo=firstindex(v), hi=lastindex(v), lenm1=hi-lo,
-                mn, mx, umn=uint_map(mn, o), umx=uint_map(mx, o), urange=umx-umn,
-                bits = unsigned(8sizeof(urange) - leading_zeros(urange)),
-                U = UIntMappable(eltype(v), o), scratch=nothing, kw...)
+function _sort!(v::AbstractVector, a::RadixSort, o::DirectOrdering, kw)
+    @getkw lo hi umn U scratch lenm1 bits
 
     # At this point, we are committed to radix sort.
     u = uint_map!(v, lo, hi, o)
@@ -936,10 +964,9 @@ function partition!(t::AbstractVector, lo::Integer, hi::Integer, o::Ordering, v:
     pivot, lo-trues
 end
 
-function _sort!(v::AbstractVector, a::PartialQuickSort, o::Ordering;
-                lo=firstindex(v), hi=lastindex(v), scratch=similar(v),
-                t=reinterpret(eltype(v), checkbounds(Bool, scratch, lo:hi) ? scratch : resize!(scratch, length(v))),
-                swap=false, rev=false, kw...)
+function _sort!(v::AbstractVector, a::PartialQuickSort, o::Ordering, kw;
+                t=nothing, swap=false, rev=false)
+    @getkw lo hi t
 
     while lo < hi && hi - lo > SMALL_THRESHOLD
         pivot, j = swap ? partition!(v, lo, hi, o, t, rev) : partition!(t, lo, hi, o, v, rev)
@@ -959,18 +986,18 @@ function _sort!(v::AbstractVector, a::PartialQuickSort, o::Ordering;
         elseif j-lo < hi-j
             # Sort the lower part recursively because it is smaller. Recursing on the
             # smaller part guarantees O(log(n)) stack space even on pathological inputs.
-            _sort!(v, a, o; lo, hi=j-1, scratch, t, swap, rev, kw...)
+            _sort!(v, a, o, (;kw..., lo, hi=j-1); swap, rev)
             lo = j+1
             rev = !rev
         else # Sort the higher part recursively
-            _sort!(v, a, o; lo=j+1, hi, scratch, t, swap, rev=!rev, kw...)
+            _sort!(v, a, o, (;kw..., lo=j+1, hi); swap, rev=!rev)
             hi = j-1
         end
     end
     hi < lo && return v
     swap && copyto!(v, lo, t, lo, hi-lo+1)
     rev && reverse!(v, lo, hi)
-    _sort!(v, a.next, o; lo, hi, scratch, t, kw...)
+    _sort!(v, a.next, o, (;kw..., lo, hi))
 end
 
 
@@ -986,8 +1013,8 @@ of elements that compare equal.
 struct StableCheckSorted{T<:Algorithm} <: Algorithm
     next::T
 end
-function _sort!(v::AbstractVector, a::StableCheckSorted, o::Ordering;
-                lo=firstindex(v), hi=lastindex(v), kw...)
+function _sort!(v::AbstractVector, a::StableCheckSorted, o::Ordering, kw)
+    @getkw lo hi
     if _issorted(v, lo, hi, o)
         return v
     elseif _issorted(v, lo, hi, Lt((x, y) -> !lt(o, x, y)))
@@ -995,7 +1022,7 @@ function _sort!(v::AbstractVector, a::StableCheckSorted, o::Ordering;
         return reverse!(v, lo, hi)
     end
 
-    _sort!(v, a.next, o; lo, hi, kw...)
+    _sort!(v, a.next, o, kw)
 end
 
 
@@ -1227,11 +1254,7 @@ function sort!(v::AbstractVector{T};
                rev::Union{Bool,Nothing}=nothing,
                order::Ordering=Forward,
                scratch::Union{AbstractVector{T}, Nothing}=nothing) where T
-    if scratch === nothing # TODO: reduce redundancy
-        _sort!(v, alg, ord(lt,by,rev,order))
-    else
-        _sort!(v, alg, ord(lt,by,rev,order); scratch)
-    end
+    _sort!(v, alg, ord(lt,by,rev,order), (;scratch))
 end
 
 """
@@ -1357,7 +1380,7 @@ function partialsortperm!(ix::AbstractVector{<:Integer}, v::AbstractVector,
     end
 
     # do partial quicksort
-    _sort!(ix, _PartialQuickSort(k), Perm(ord(lt, by, rev, order), v))
+    _sort!(ix, _PartialQuickSort(k), Perm(ord(lt, by, rev, order), v), (;))
 
     maybeview(ix, k)
 end
@@ -1568,15 +1591,7 @@ end
 @noinline function sort_chunks!(Av, n, alg, order, scratch)
     inds = LinearIndices(Av)
     for lo = first(inds):n:last(inds)
-        _sort!(Av, alg, order; lo, hi=lo+n-1, scratch)
-    end
-    Av
-end
-# TODO: reduce redundancy
-@noinline function sort_chunks!(Av, n, alg, order, scratch::Nothing)
-    inds = LinearIndices(Av)
-    for lo = first(inds):n:last(inds)
-        _sort!(Av, alg, order; lo, hi=lo+n-1)
+        _sort!(Av, alg, order, (; lo, hi=lo+n-1, scratch))
     end
     Av
 end
@@ -1748,10 +1763,10 @@ Characteristics:
 const MergeSort = MergeSortAlg(SMALL_ALGORITHM)
 
 
-function _sort!(v::AbstractVector, a::MergeSortAlg, o::Ordering;
-                lo=firstindex(v), hi=lastindex(v), scratch=nothing)
+function _sort!(v::AbstractVector, a::MergeSortAlg, o::Ordering, kw)
+    @getkw lo hi scratch
     @inbounds if lo < hi
-        hi-lo <= SMALL_THRESHOLD && return _sort!(v, a.next, o; lo, hi)
+        hi-lo <= SMALL_THRESHOLD && return _sort!(v, a.next, o, kw)
 
         m = midpoint(lo, hi)
 
@@ -1759,8 +1774,8 @@ function _sort!(v::AbstractVector, a::MergeSortAlg, o::Ordering;
         length(t) < m-lo+1 && resize!(t, m-lo+1)
         Base.require_one_based_indexing(t)
 
-        _sort!(v, a, o; lo, hi=m, scratch=t)
-        _sort!(v, a, o; lo=m+1, hi, scratch=t)
+        _sort!(v, a, o, (;kw..., hi=m, scratch=t))
+        _sort!(v, a, o, (;kw..., lo=m+1, scratch=t))
 
         i, j = 1, lo
         while j <= m
@@ -1791,7 +1806,7 @@ function _sort!(v::AbstractVector, a::MergeSortAlg, o::Ordering;
 end
 
 # Support 3- and 5-argument version of sort! for backwards compatability
-sort!(v::AbstractVector, a::Algorithm, o::Ordering) = _sort!(v, a, o)
-sort!(v::AbstractVector, lo::Integer, hi::Integer, a::Algorithm, o::Ordering) = _sort!(v, a, o; lo, hi)
+sort!(v::AbstractVector, a::Algorithm, o::Ordering) = _sort!(v, a, o, (;))
+sort!(v::AbstractVector, lo::Integer, hi::Integer, a::Algorithm, o::Ordering) = _sort!(v, a, o, (; lo, hi))
 
 end # module Sort
diff --git a/test/sorting.jl b/test/sorting.jl
index d0892b2afb407..f9a3e5bd7438a 100644
--- a/test/sorting.jl
+++ b/test/sorting.jl
@@ -535,11 +535,11 @@ end
     @test issorted(a)
 
     a = view([9:-1:0;], :)::SubArray
-    Base.Sort._sort!(a, Base.Sort.CountingSort(), Base.Forward, mn=0, mx=9)  # test it supports non-Vector
+    Base.Sort._sort!(a, Base.Sort.CountingSort(), Base.Forward, (; mn=0, mx=9))  # test it supports non-Vector
     @test issorted(a)
 
     a = OffsetArray([9:-1:0;], -5)
-    Base.Sort._sort!(a, Base.Sort.CountingSort(), Base.Forward, mn=0, mx=9)
+    Base.Sort._sort!(a, Base.Sort.CountingSort(), Base.Forward, (; mn=0, mx=9))
     @test issorted(a)
 end
 

From d3bdca3be5906f4a0898846b00d44cfb51dae504 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Wed, 2 Nov 2022 07:01:28 +0600
Subject: [PATCH 10/29] remove InsertionSortAlg and MergeSortAlg

---
 base/sort.jl | 48 +++++++++++++++++++++++++-----------------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index 1ab423f0404a9..21b1a56d826d8 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -649,9 +649,8 @@ function _sort!(v::AbstractVector, a::Small{N}, o::Ordering, kw) where N
 end
 
 
-struct InsertionSortAlg <: Algorithm end
 """
-    InseritonSort
+    InsertionSort()
 
 Use the insertion sort algorithm.
 
@@ -665,9 +664,10 @@ Characteristics:
 * *quadratic performance* in the number of elements to be sorted:
 it is well-suited to small collections but should not be used for large ones.
 """
-const InsertionSort = InsertionSortAlg()
-const SMALL_ALGORITHM = InsertionSort
-function _sort!(v::AbstractVector, ::InsertionSortAlg, o::Ordering, kw)
+struct InsertionSort <: Algorithm end
+
+const SMALL_ALGORITHM = InsertionSort()
+function _sort!(v::AbstractVector, ::InsertionSort, o::Ordering, kw)
     @getkw lo hi
     lo_plus_1 = (lo + 1)::Integer
     @inbounds for i = lo_plus_1:hi
@@ -1248,13 +1248,13 @@ julia> v = [(1, "c"), (3, "a"), (2, "b")]; sort!(v, by = x -> x[2]); v
 ```
 """
 function sort!(v::AbstractVector{T};
-               alg::Algorithm=defalg(v),
+               alg::Union{Algorithm, Type{<:Algorithm}}=defalg(v),
                lt=isless,
                by=identity,
                rev::Union{Bool,Nothing}=nothing,
                order::Ordering=Forward,
                scratch::Union{AbstractVector{T}, Nothing}=nothing) where T
-    _sort!(v, alg, ord(lt,by,rev,order), (;scratch))
+    _sort!(v, getalg(alg), ord(lt,by,rev,order), (;scratch))
 end
 
 """
@@ -1432,7 +1432,7 @@ julia> sortperm(A, dims = 2)
 ```
 """
 function sortperm(A::AbstractArray;
-                  alg::Algorithm=DEFAULT_UNSTABLE,
+                  alg::Union{Algorithm, Type{<:Algorithm}}=DEFAULT_UNSTABLE,
                   lt=isless,
                   by=identity,
                   rev::Union{Bool,Nothing}=nothing,
@@ -1492,7 +1492,7 @@ julia> sortperm!(p, A; dims=2); p
 ```
 """
 function sortperm!(ix::AbstractArray{T}, A::AbstractArray;
-                   alg::Algorithm=DEFAULT_UNSTABLE,
+                   alg::Union{Algorithm, Type{<:Algorithm}}=DEFAULT_UNSTABLE,
                    lt=isless,
                    by=identity,
                    rev::Union{Bool,Nothing}=nothing,
@@ -1566,7 +1566,7 @@ julia> sort(A, dims = 2)
 """
 function sort(A::AbstractArray{T};
               dims::Integer,
-              alg::Algorithm=defalg(A),
+              alg::Union{Algorithm, Type{<:Algorithm}}=defalg(A),
               lt=isless,
               by=identity,
               rev::Union{Bool,Nothing}=nothing,
@@ -1591,7 +1591,7 @@ end
 @noinline function sort_chunks!(Av, n, alg, order, scratch)
     inds = LinearIndices(Av)
     for lo = first(inds):n:last(inds)
-        _sort!(Av, alg, order, (; lo, hi=lo+n-1, scratch))
+        _sort!(Av, getalg(alg), order, (; lo, hi=lo+n-1, scratch))
     end
     Av
 end
@@ -1628,16 +1628,16 @@ julia> sort!(A, dims = 2); A
 """
 function sort!(A::AbstractArray{T};
                dims::Integer,
-               alg::Algorithm=defalg(A),
+               alg::Union{Algorithm, Type{<:Algorithm}}=defalg(A),
                lt=isless,
                by=identity,
                rev::Union{Bool,Nothing}=nothing,
                order::Ordering=Forward,
                scratch::Union{AbstractVector{T}, Nothing}=similar(A, size(A, dims))) where T
-    __sort!(A, Val(dims), alg, ord(lt, by, rev, order), scratch)
+    __sort!(A, Val(dims), getalg(alg), ord(lt, by, rev, order), scratch)
 end
 function __sort!(A::AbstractArray{T}, ::Val{K},
-                alg::Algorithm,
+                alg::Union{Algorithm, Type{<:Algorithm}},
                 order::Ordering,
                 scratch::Union{AbstractVector{T}, Nothing}) where {K,T}
     nd = ndims(A)
@@ -1741,11 +1741,8 @@ end
 
 ### Unused ###
 
-struct MergeSortAlg{T <: Algorithm} <: Algorithm
-    next::T
-end
 """
-    MergeSort
+    MergeSort()
 
 Indicate that a sorting function should use the merge sort algorithm.
 
@@ -1760,10 +1757,12 @@ Characteristics:
   * *not in-place* in memory.
   * *divide-and-conquer* sort strategy.
 """
-const MergeSort = MergeSortAlg(SMALL_ALGORITHM)
-
+struct MergeSort{T <: Algorithm} <: Algorithm
+    next::T
+end
+MergeSort() = MergeSort(SMALL_ALGORITHM)
 
-function _sort!(v::AbstractVector, a::MergeSortAlg, o::Ordering, kw)
+function _sort!(v::AbstractVector, a::MergeSort, o::Ordering, kw)
     @getkw lo hi scratch
     @inbounds if lo < hi
         hi-lo <= SMALL_THRESHOLD && return _sort!(v, a.next, o, kw)
@@ -1806,7 +1805,10 @@ function _sort!(v::AbstractVector, a::MergeSortAlg, o::Ordering, kw)
 end
 
 # Support 3- and 5-argument version of sort! for backwards compatability
-sort!(v::AbstractVector, a::Algorithm, o::Ordering) = _sort!(v, a, o, (;))
-sort!(v::AbstractVector, lo::Integer, hi::Integer, a::Algorithm, o::Ordering) = _sort!(v, a, o, (; lo, hi))
+sort!(v::AbstractVector, a::Union{Algorithm, Type{<:Algorithm}}, o::Ordering) = _sort!(v, getalg(a), o, (;))
+sort!(v::AbstractVector, lo::Integer, hi::Integer, a::Union{Algorithm, Type{<:Algorithm}}, o::Ordering) = _sort!(v, getalg(a), o, (; lo, hi))
+# Support alg=InsertionSort and alg=MergeSort for backwards compatability
+getalg(a::Algorithm) = a
+getalg(::Type{A}) where A <: Algorithm = A()
 
 end # module Sort

From 2232cac9ca03ca3fbd9612b6e2d834df85ae05a4 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Wed, 2 Nov 2022 08:24:29 +0600
Subject: [PATCH 11/29] better algorithm display

---
 base/sort.jl    | 45 ++++++++++++++++++++++++++++++++++++++++-----
 test/sorting.jl |  7 +++++++
 2 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index 21b1a56d826d8..8cdb97f36d48f 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -638,7 +638,8 @@ struct Small{N, T <: Algorithm, U <: Algorithm} <: Algorithm
     small::T
     big::U
 end
-Small{N}(big) where N = Small{N, typeof(SMALL_ALGORITHM), typeof(big)}(SMALL_ALGORITHM, big)
+Small{N}(small, big) where N = Small{N, typeof(small), typeof(big)}(small, big)
+Small{N}(big) where N = Small{N}(SMALL_ALGORITHM, big)
 function _sort!(v::AbstractVector, a::Small{N}, o::Ordering, kw) where N
     @getkw lenm1
     if lenm1 < N
@@ -1115,7 +1116,11 @@ The specific optimizations attempted by `InitialOptimizations` are
 [`MissingOptimization`](@ref), [`BoolOptimization`](@ref), dispatch to
 [`InsertionSort`](@ref) for inputs with `length <= 10`, and [`IEEEFloatOptimization`](@ref).
 """
-InitialOptimizations(next) = MissingOptimization(BoolOptimization(Small{10}(IEEEFloatOptimization(next))))
+InitialOptimizations(next) = MissingOptimization(
+    BoolOptimization(
+        Small{10}(
+            IEEEFloatOptimization(
+                next))))
 """
     DEFAULT_STABLE
 
@@ -1190,9 +1195,17 @@ stage.
 Finally, if the input has length less than 80, we dispatch to [`InsertionSort`](@ref) and
 otherwise we dispatch to [`QuickSort`](@ref).
 """
-const DEFAULT_STABLE = InitialOptimizations(IsUIntMappable(
-    Small{40}(CheckSorted(ComputeExtrema(ConsiderCountingSort(ConsiderRadixSort(Small{80}(QuickSort)))))),
-    StableCheckSorted(QuickSort)))
+const DEFAULT_STABLE = InitialOptimizations(
+    IsUIntMappable(
+        Small{40}(
+            CheckSorted(
+                ComputeExtrema(
+                    ConsiderCountingSort(
+                        ConsiderRadixSort(
+                            Small{80}(
+                                QuickSort)))))),
+        StableCheckSorted(
+            QuickSort)))
 """
     DEFAULT_UNSTABLE
 
@@ -1204,6 +1217,28 @@ the same as those used by [`DEFAULT_STABLE`](@ref), but this is subject to chang
 const DEFAULT_UNSTABLE = DEFAULT_STABLE
 const SMALL_THRESHOLD  = 20
 
+function Base.show(io::IO, alg::Algorithm)
+    print_tree(io, alg, 0)
+end
+function print_tree(io::IO, alg::Algorithm, cols::Int)
+    print(io, "    "^cols)
+    show_type(io, alg)
+    print(io, '(')
+    for (i, name) in enumerate(fieldnames(typeof(alg)))
+        arg = getproperty(alg, name)
+        i > 1 && print(io, ',')
+        if arg isa Algorithm
+            println(io)
+            print_tree(io, arg, cols+1)
+        else
+            i > 1 && print(io, ' ')
+            print(io, arg)
+        end
+    end
+    print(io, ')')
+end
+show_type(io::IO, alg::Algorithm) = Base.show_type_name(io, typeof(alg).name)
+show_type(io::IO, alg::Small{N}) where N = print(io, "Base.Sort.Small{$N}")
 
 defalg(v::AbstractArray) = DEFAULT_STABLE
 
diff --git a/test/sorting.jl b/test/sorting.jl
index f9a3e5bd7438a..e6c6f9bd30874 100644
--- a/test/sorting.jl
+++ b/test/sorting.jl
@@ -815,6 +815,13 @@ end
     end
 end
 
+@testset "show(::Algorithm)" begin
+    @test eval(Meta.parse(string(Base.DEFAULT_STABLE))) === Base.DEFAULT_STABLE
+    lines = split(string(Base.DEFAULT_STABLE), '\n')
+    @test 10 < maximum(length, lines) < 100
+    @test 1 < length(lines) < 30
+end
+
 # This testset is at the end of the file because it is slow.
 @testset "searchsorted" begin
     numTypes = [ Int8,  Int16,  Int32,  Int64,  Int128,

From a574c7f78d90f387af9180dc193e47d37fe653c6 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Wed, 2 Nov 2022 14:00:00 +0600
Subject: [PATCH 12/29] stop passing U around

Fixes a few remaining unexpected allocations.
U can be statically computed from the type of v and order, so there is no need to pass it.
Further, U is inferred as ::DataType rather than Type{U}, which causes type instabilities.
---
 base/sort.jl | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index 8cdb97f36d48f..4e92b601c03b1 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -417,7 +417,6 @@ end
 for (sym, deps, exp, type) in [
         (:lo, (), :(firstindex(v)), Integer),
         (:hi, (), :(lastindex(v)),  Integer),
-        (:U, (), :(UIntMappable(eltype(v), o)),  Any), #type checking this comes at a runtime performance cost ???
         (:lenm1, (:lo, :hi), :(hi-lo), Integer),
         (:mn, (), :(throw(ArgumentError("mn is needed but has not been computed"))), :(eltype(v))),
         (:mx, (), :(throw(ArgumentError("mx is needed but has not been computed"))), :(eltype(v))),
@@ -619,8 +618,7 @@ struct IsUIntMappable{T <: Algorithm, U <: Algorithm} <: Algorithm
     no::U
 end
 function _sort!(v::AbstractVector, a::IsUIntMappable, o::Ordering, kw)
-    @getkw U
-    if U !== nothing
+    if UIntMappable(eltype(v), o) !== nothing
         _sort!(v, a.yes, o, kw)
     else
         _sort!(v, a.no, o, kw)
@@ -758,8 +756,8 @@ struct ConsiderCountingSort{T <: Algorithm, U <: Algorithm} <: Algorithm
 end
 ConsiderCountingSort(next) = ConsiderCountingSort(CountingSort(), next)
 function _sort!(v::AbstractVector{<:Integer}, a::ConsiderCountingSort, o::DirectOrdering, kw)
-    @getkw lenm1 range U
-    if range < (sizeof(U) > 8 ? 5lenm1-100 : div(lenm1, 2))
+    @getkw lenm1 range
+    if range < (sizeof(eltype(v)) > 8 ? 5lenm1-100 : div(lenm1, 2))
         _sort!(v, a.counting, o, kw)
     else
         _sort!(v, a.next, o, kw)
@@ -815,8 +813,8 @@ struct ConsiderRadixSort{T <: Algorithm, U <: Algorithm} <: Algorithm
 end
 ConsiderRadixSort(next) = ConsiderRadixSort(RadixSort(), next)
 function _sort!(v::AbstractVector, a::ConsiderRadixSort, o::DirectOrdering, kw)
-    @getkw U bits lenm1
-    if sizeof(U) <= 8 && bits+70 < 22log(lenm1)
+    @getkw bits lenm1
+    if sizeof(eltype(v)) <= 8 && bits+70 < 22log(lenm1)
         _sort!(v, a.radix, o, kw)
     else
         _sort!(v, a.next, o, kw)
@@ -847,7 +845,7 @@ Each pass divides the input into `2^chunk_size == mask+1` buckets. To do this, i
 """
 struct RadixSort <: Algorithm end
 function _sort!(v::AbstractVector, a::RadixSort, o::DirectOrdering, kw)
-    @getkw lo hi umn U scratch lenm1 bits
+    @getkw lo hi umn scratch lenm1 bits
 
     # At this point, we are committed to radix sort.
     u = uint_map!(v, lo, hi, o)
@@ -866,6 +864,7 @@ function _sort!(v::AbstractVector, a::RadixSort, o::DirectOrdering, kw)
     end
 
     len = lenm1 + 1
+    U = UIntMappable(eltype(v), o)
     if scratch !== nothing && checkbounds(Bool, scratch, lo:hi) # Fully preallocated and aligned scratch
         u2 = radix_sort!(u, lo, hi, bits, reinterpret(U, scratch))
         uint_unmap!(v, u2, lo, hi, o, umn)

From 05de36ea4b1e82a235d1fdcfb47f3d30f7b0baa3 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Sun, 6 Nov 2022 10:14:01 +0600
Subject: [PATCH 13/29] remove lenm1

it is invalid to cache lenm1 because lo and hi may be redefined
and we have no cache invalidation system
---
 base/sort.jl | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index 4e92b601c03b1..f5822db1108cb 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -417,7 +417,6 @@ end
 for (sym, deps, exp, type) in [
         (:lo, (), :(firstindex(v)), Integer),
         (:hi, (), :(lastindex(v)),  Integer),
-        (:lenm1, (:lo, :hi), :(hi-lo), Integer),
         (:mn, (), :(throw(ArgumentError("mn is needed but has not been computed"))), :(eltype(v))),
         (:mx, (), :(throw(ArgumentError("mx is needed but has not been computed"))), :(eltype(v))),
         (:range, (:mn, :mx), quote
@@ -639,8 +638,8 @@ end
 Small{N}(small, big) where N = Small{N, typeof(small), typeof(big)}(small, big)
 Small{N}(big) where N = Small{N}(SMALL_ALGORITHM, big)
 function _sort!(v::AbstractVector, a::Small{N}, o::Ordering, kw) where N
-    @getkw lenm1
-    if lenm1 < N
+    @getkw lo hi
+    if (hi-lo) < N
         _sort!(v, a.small, o, kw)
     else
         _sort!(v, a.big, o, kw)
@@ -696,14 +695,14 @@ struct CheckSorted{T <: Algorithm} <: Algorithm
     next::T
 end
 function _sort!(v::AbstractVector, a::CheckSorted, o::Ordering, kw)
-    @getkw lo hi lenm1
+    @getkw lo hi
 
     # For most arrays, a presorted check is cheap (overhead < 5%) and for most large
     # arrays it is essentially free (<1%).
     _issorted(v, lo, hi, o) && return v
 
     # For most large arrays, a reverse-sorted check is essentially free (overhead < 1%)
-    if lenm1 >= 500 && _issorted(v, lo, hi, ReverseOrdering(o))
+    if hi-lo >= 500 && _issorted(v, lo, hi, ReverseOrdering(o))
         # If reversing is valid, do so. This does violates stability.
         reverse!(v, lo, hi)
         return v
@@ -756,8 +755,8 @@ struct ConsiderCountingSort{T <: Algorithm, U <: Algorithm} <: Algorithm
 end
 ConsiderCountingSort(next) = ConsiderCountingSort(CountingSort(), next)
 function _sort!(v::AbstractVector{<:Integer}, a::ConsiderCountingSort, o::DirectOrdering, kw)
-    @getkw lenm1 range
-    if range < (sizeof(eltype(v)) > 8 ? 5lenm1-100 : div(lenm1, 2))
+    @getkw lo hi range
+    if range < (sizeof(eltype(v)) > 8 ? 5(hi-lo)-100 : div(hi-lo, 2))
         _sort!(v, a.counting, o, kw)
     else
         _sort!(v, a.next, o, kw)
@@ -813,8 +812,8 @@ struct ConsiderRadixSort{T <: Algorithm, U <: Algorithm} <: Algorithm
 end
 ConsiderRadixSort(next) = ConsiderRadixSort(RadixSort(), next)
 function _sort!(v::AbstractVector, a::ConsiderRadixSort, o::DirectOrdering, kw)
-    @getkw bits lenm1
-    if sizeof(eltype(v)) <= 8 && bits+70 < 22log(lenm1)
+    @getkw bits lo hi
+    if sizeof(eltype(v)) <= 8 && bits+70 < 22log(hi-lo)
         _sort!(v, a.radix, o, kw)
     else
         _sort!(v, a.next, o, kw)
@@ -845,7 +844,7 @@ Each pass divides the input into `2^chunk_size == mask+1` buckets. To do this, i
 """
 struct RadixSort <: Algorithm end
 function _sort!(v::AbstractVector, a::RadixSort, o::DirectOrdering, kw)
-    @getkw lo hi umn scratch lenm1 bits
+    @getkw lo hi umn scratch bits
 
     # At this point, we are committed to radix sort.
     u = uint_map!(v, lo, hi, o)
@@ -863,7 +862,7 @@ function _sort!(v::AbstractVector, a::RadixSort, o::DirectOrdering, kw)
         u[i] -= umn
     end
 
-    len = lenm1 + 1
+    len = hi-lo + 1
     U = UIntMappable(eltype(v), o)
     if scratch !== nothing && checkbounds(Bool, scratch, lo:hi) # Fully preallocated and aligned scratch
         u2 = radix_sort!(u, lo, hi, bits, reinterpret(U, scratch))

From 70290d65c16fb7be070e2193024f20de084ce4b0 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Mon, 7 Nov 2022 15:04:11 +0600
Subject: [PATCH 14/29] fix unexpected allocations in Radix Sort

Fixes #47474.
Done in this PR rather than in a separate one to avoid dealing with the merge.
---
 base/sort.jl | 56 +++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 18 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index f5822db1108cb..207a6535444a5 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -864,17 +864,30 @@ function _sort!(v::AbstractVector, a::RadixSort, o::DirectOrdering, kw)
 
     len = hi-lo + 1
     U = UIntMappable(eltype(v), o)
+    # A large if-else chain to avoid type instabilities and dynamic dispatch
     if scratch !== nothing && checkbounds(Bool, scratch, lo:hi) # Fully preallocated and aligned scratch
-        u2 = radix_sort!(u, lo, hi, bits, reinterpret(U, scratch))
-        uint_unmap!(v, u2, lo, hi, o, umn)
+        t = reinterpret(U, scratch)
+        if radix_sort!(u, lo, hi, bits, t)
+            uint_unmap!(v, u, lo, hi, o, umn)
+        else
+            uint_unmap!(v, t, lo, hi, o, umn)
+        end
     elseif scratch !== nothing && (applicable(resize!, scratch, len) || length(scratch) >= len) # Viable scratch
         length(scratch) >= len || resize!(scratch, len)
         t1 = axes(scratch, 1) isa OneTo ? scratch : view(scratch, firstindex(scratch):lastindex(scratch))
-        u2 = radix_sort!(view(u, lo:hi), 1, len, bits, reinterpret(U, t1))
-        uint_unmap!(view(v, lo:hi), u2, 1, len, o, umn)
+        t = reinterpret(U, t1)
+        if radix_sort!(view(u, lo:hi), 1, len, bits, t)
+            uint_unmap!(view(v, lo:hi), view(u, lo:hi), 1, len, o, umn)
+        else
+            uint_unmap!(view(v, lo:hi), t, 1, len, o, umn)
+        end
     else # No viable scratch
-        u2 = radix_sort!(u, lo, hi, bits, similar(u))
-        uint_unmap!(v, u2, lo, hi, o, umn)
+        t = similar(u)
+        if radix_sort!(u, lo, hi, bits, t)
+            uint_unmap!(v, u, lo, hi, o, umn)
+        else
+            uint_unmap!(v, t, lo, hi, o, umn)
+        end
     end
 end
 
@@ -1025,16 +1038,28 @@ function _sort!(v::AbstractVector, a::StableCheckSorted, o::Ordering, kw)
 end
 
 
-# In the case of an odd number of passes, the returned vector will === the input vector t,
-# not v. This is one of the many reasons radix_sort! is not exported.
+# The return value indicates whether v is sorted (true) or t is sorted (false)
+# This is one of the many reasons radix_sort! is not exported.
 function radix_sort!(v::AbstractVector{U}, lo::Integer, hi::Integer, bits::Unsigned,
                      t::AbstractVector{U}, chunk_size=radix_chunk_size_heuristic(lo, hi, bits)) where U <: Unsigned
     # bits is unsigned for performance reasons.
-    mask = UInt(1) << chunk_size - 1
-    counts = Vector{Int}(undef, mask+2)
-
-    @inbounds for shift in 0:chunk_size:bits-1
-
+    counts = Vector{Int}(undef, 1 << chunk_size + 1)
+
+    shift = 0
+    while true
+        @noinline radix_sort_pass!(t, lo, hi, counts, v, shift, chunk_size)
+        # the latest data resides in t
+        shift += chunk_size
+        shift < bits || return false
+        @noinline radix_sort_pass!(v, lo, hi, counts, t, shift, chunk_size)
+        # the latest data resides in v
+        shift += chunk_size
+        shift < bits || return true
+    end
+end
+function radix_sort_pass!(t, lo, hi, counts, v, shift, chunk_size)
+    mask = UInt(1) << chunk_size - 1  # mask is defined in pass so that the compiler
+    @inbounds begin                   #  ↳ knows it's shape
         # counts[2:mask+2] will store the number of elements that fall into each bucket.
         # if chunk_size = 8, counts[2] is bucket 0x00 and counts[257] is bucket 0xff.
         counts .= 0
@@ -1058,12 +1083,7 @@ function radix_sort!(v::AbstractVector{U}, lo::Integer, hi::Integer, bits::Unsig
             t[j] = x                  # put the element where it belongs
             counts[i] = j + 1         # increment the target index for the next
         end                           #  ↳ element in this bucket
-
-        v, t = t, v # swap the now sorted destination vector t back into primary vector v
-
     end
-
-    v
 end
 function radix_chunk_size_heuristic(lo::Integer, hi::Integer, bits::Unsigned)
     # chunk_size is the number of bits to radix over at once.

From f06de1089edc3fc261875f201782f171dcecdcd9 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Mon, 7 Nov 2022 19:58:31 +0600
Subject: [PATCH 15/29] fix doctests? I have no idea how

---
 base/sort.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/base/sort.jl b/base/sort.jl
index 207a6535444a5..ddc8ad70942f2 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -1259,6 +1259,9 @@ show_type(io::IO, alg::Algorithm) = Base.show_type_name(io, typeof(alg).name)
 show_type(io::IO, alg::Small{N}) where N = print(io, "Base.Sort.Small{$N}")
 
 defalg(v::AbstractArray) = DEFAULT_STABLE
+defalg(v::AbstractArray{<:Union{Number, Missing}}) = DEFAULT_UNSTABLE
+defalg(v::AbstractArray{Missing}) = DEFAULT_UNSTABLE # for method disambiguation
+defalg(v::AbstractArray{Union{}}) = DEFAULT_UNSTABLE # for method disambiguation
 
 """
     sort!(v; alg::Algorithm=defalg(v), lt=isless, by=identity, rev::Bool=false, order::Ordering=Forward)

From 38f4512d18bdb96d14b4b3e14d299b5d53affcd4 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Wed, 9 Nov 2022 10:25:53 +0600
Subject: [PATCH 16/29] support and test backwards compatibility with packages
 that depend on sorting internals

---
 base/sort.jl    | 36 ++++++++++++++++++++++++++++--------
 test/sorting.jl | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index ddc8ad70942f2..481c2bd4996c1 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -410,8 +410,8 @@ insorted(x, r::AbstractRange) = in(x, r)
 ## Alternative keyword management
 
 macro getkw(syms...)
-    usyms = (Symbol(:_, sym) for sym in syms)
-    Expr(:block, (:($(esc(:((kw, $sym) = $usym(v, o, kw))))) for (sym, usym) in zip(syms, usyms))...)
+    getters = (getproperty(Sort, Symbol(:_, sym)) for sym in syms)
+    Expr(:block, (:($(esc(:((kw, $sym) = $getter(v, o, kw))))) for (sym, getter) in zip(syms, getters))...)
 end
 
 for (sym, deps, exp, type) in [
@@ -430,7 +430,8 @@ for (sym, deps, exp, type) in [
         (:scratch, (), nothing, :(Union{Nothing, AbstractVector})), # could have different eltype
         (:t, (:lo, :hi, :scratch), quote
             scratch === nothing ? similar(v) : reinterpret(eltype(v), checkbounds(Bool, scratch, lo:hi) ? scratch : resize!(scratch, length(v)))
-        end, :(AbstractVector{eltype(v)}))]
+        end, :(AbstractVector{eltype(v)})),
+        (:allow_legacy_dispatch, (), true, Bool)]
     str = string(sym)
     usym = Symbol(:_, sym)
     @eval function $usym(v, o, kw)
@@ -1795,7 +1796,7 @@ end
 
 
 
-### Unused ###
+### Unused constructs for backward compatability ###
 
 """
     MergeSort()
@@ -1860,11 +1861,30 @@ function _sort!(v::AbstractVector, a::MergeSort, o::Ordering, kw)
     return v
 end
 
-# Support 3- and 5-argument version of sort! for backwards compatability
-sort!(v::AbstractVector, a::Union{Algorithm, Type{<:Algorithm}}, o::Ordering) = _sort!(v, getalg(a), o, (;))
-sort!(v::AbstractVector, lo::Integer, hi::Integer, a::Union{Algorithm, Type{<:Algorithm}}, o::Ordering) = _sort!(v, getalg(a), o, (; lo, hi))
-# Support alg=InsertionSort and alg=MergeSort for backwards compatability
+# Support alg=InsertionSort and alg=MergeSort for backwards compatability (prefer InsertionSort() and MergeSort())
 getalg(a::Algorithm) = a
 getalg(::Type{A}) where A <: Algorithm = A()
 
+# Support 3- and 5-argument versions of sort! for calling into the internals in the old way
+sort!(v::AbstractVector, a::Union{Algorithm, Type{<:Algorithm}}, o::Ordering) = _sort!(v, getalg(a), o, (; allow_legacy_dispatch=false))
+sort!(v::AbstractVector, lo::Integer, hi::Integer, a::Union{Algorithm, Type{<:Algorithm}}, o::Ordering) = _sort!(v, getalg(a), o, (; lo, hi, allow_legacy_dispatch=false))
+
+# Support dispatch on custom algorithms in the old way
+# sort!(::AbstractVector, ::Integer, ::Integer, ::MyCustomAlgorithm, ::Ordering) = ...
+function _sort!(v::AbstractVector, a::Algorithm, o::Ordering, kw)
+    @getkw lo hi allow_legacy_dispatch
+    if allow_legacy_dispatch
+        sort!(v, lo, hi, a, o)
+    else
+        # This error prevents infinite recursion for unknown algorithms
+        throw(ArgumentError("Base.Sort._sort!(::$(typeof(v)), ::$(typeof(a)), ::$(typeof(o))) is not defined"))
+    end
+end
+
+# Keep old internal types so that people can keep dispatching with
+# sort!(::AbstractVector, ::Integer, ::Integer, ::Base.QuickSortAlg, ::Ordering) = ...
+const QuickSortAlg = typeof(QuickSort)
+const MergeSortAlg = typeof(MergeSort)
+const InsertionSortAlg = typeof(InsertionSort)
+
 end # module Sort
diff --git a/test/sorting.jl b/test/sorting.jl
index e6c6f9bd30874..d321734cdb177 100644
--- a/test/sorting.jl
+++ b/test/sorting.jl
@@ -822,6 +822,38 @@ end
     @test 1 < length(lines) < 30
 end
 
+@testset "Defining new algorithms & backwards compatibility with packages that use sorting internals" begin
+    struct MyFirstAlg <: Base.Sort.Algorithm end
+    # The pre 1.9 dispatch method
+    function Base.sort!(v::AbstractVector{Int}, lo::Integer, hi::Integer, ::MyFirstAlg, o::Base.Order.Ordering)
+        v[lo:hi] .= 7
+    end
+    @test sort([1,2,3], alg=MyFirstAlg()) == [7,7,7]
+    v = shuffle(vcat(fill(missing, 10), rand(Int, 100)))
+    @test all(sort(v, alg=Base.Sort.InitialOptimizations(MyFirstAlg())) .=== vcat(fill(7, 100), fill(missing, 10)))
+
+    # Use the pre 1.9 hook into the internals
+    function Base.sort!(v::AbstractVector{Int}, lo::Integer, hi::Integer, ::MyFirstAlg, o::Base.Order.Ordering)
+        sort!(v, lo, hi, Base.DEFAULT_STABLE, o)
+    end
+    @test sort([3,1,2], alg=MyFirstAlg()) == [1,2,3]
+    v = shuffle(vcat(fill(missing, 10), rand(Int, 100)))
+    @test issorted(sort(v, alg=Base.Sort.InitialOptimizations(MyFirstAlg())))
+
+    # Another pre 1.9 hook into the internals
+    @test issorted(sort!(rand(100), InsertionSort, Base.Order.Forward))
+
+    struct MySecondAlg <: Base.Sort.Algorithm end
+    # A new dispatch method
+    function Base.Sort._sort!(v::AbstractVector, ::MySecondAlg, o::Base.Order.Ordering, kw)
+        Base.Sort.@getkw lo hi
+        v[lo:hi] .= 9
+    end
+    @test sort([1,2,3], alg=MySecondAlg()) == [9,9,9]
+    v = shuffle(vcat(fill(missing, 10), rand(Int, 100)))
+    @test all(sort(v, alg=Base.Sort.InitialOptimizations(MySecondAlg())) .=== vcat(fill(9, 100), fill(missing, 10)))
+end
+
 # This testset is at the end of the file because it is slow.
 @testset "searchsorted" begin
     numTypes = [ Int8,  Int16,  Int32,  Int64,  Int128,

From d8ae968bc31ef35297d89612511992f55a568c1f Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Thu, 10 Nov 2022 07:22:13 +0600
Subject: [PATCH 17/29] improve extensibility tests

---
 test/sorting.jl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/sorting.jl b/test/sorting.jl
index d321734cdb177..f9faecd2ff808 100644
--- a/test/sorting.jl
+++ b/test/sorting.jl
@@ -824,12 +824,16 @@ end
 
 @testset "Defining new algorithms & backwards compatibility with packages that use sorting internals" begin
     struct MyFirstAlg <: Base.Sort.Algorithm end
+
+    @test_throws ArgumentError sort([1,2,3], alg=MyFirstAlg()) # not a stack overflow error
+
+    v = shuffle(vcat(fill(missing, 10), rand(Int, 100)))
+
     # The pre 1.9 dispatch method
     function Base.sort!(v::AbstractVector{Int}, lo::Integer, hi::Integer, ::MyFirstAlg, o::Base.Order.Ordering)
         v[lo:hi] .= 7
     end
     @test sort([1,2,3], alg=MyFirstAlg()) == [7,7,7]
-    v = shuffle(vcat(fill(missing, 10), rand(Int, 100)))
     @test all(sort(v, alg=Base.Sort.InitialOptimizations(MyFirstAlg())) .=== vcat(fill(7, 100), fill(missing, 10)))
 
     # Use the pre 1.9 hook into the internals
@@ -837,7 +841,6 @@ end
         sort!(v, lo, hi, Base.DEFAULT_STABLE, o)
     end
     @test sort([3,1,2], alg=MyFirstAlg()) == [1,2,3]
-    v = shuffle(vcat(fill(missing, 10), rand(Int, 100)))
     @test issorted(sort(v, alg=Base.Sort.InitialOptimizations(MyFirstAlg())))
 
     # Another pre 1.9 hook into the internals
@@ -850,7 +853,6 @@ end
         v[lo:hi] .= 9
     end
     @test sort([1,2,3], alg=MySecondAlg()) == [9,9,9]
-    v = shuffle(vcat(fill(missing, 10), rand(Int, 100)))
     @test all(sort(v, alg=Base.Sort.InitialOptimizations(MySecondAlg())) .=== vcat(fill(9, 100), fill(missing, 10)))
 end
 

From c633419d63bd726667f9c4c02f40b7ece02f05f8 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Fri, 11 Nov 2022 10:52:30 +0600
Subject: [PATCH 18/29] overhaul scratch space handling

make _sort! return scratch space rather than sorted vector
so that things like IEEEFloatOptimization can re-use the
scratch space allocated on their first recursive call
---
 base/sort.jl    | 264 ++++++++++++++++++++++++++++++------------------
 test/sorting.jl |  10 +-
 2 files changed, 172 insertions(+), 102 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index 481c2bd4996c1..93e41ece28725 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -427,10 +427,7 @@ for (sym, deps, exp, type) in [
         (:umx, (:mx,), :(uint_map(mx, o)), Unsigned),
         (:urange, (:umn, :umx), :(umx-umn), Unsigned),
         (:bits, (:urange,), :(unsigned(8sizeof(urange) - leading_zeros(urange))), Unsigned),
-        (:scratch, (), nothing, :(Union{Nothing, AbstractVector})), # could have different eltype
-        (:t, (:lo, :hi, :scratch), quote
-            scratch === nothing ? similar(v) : reinterpret(eltype(v), checkbounds(Bool, scratch, lo:hi) ? scratch : resize!(scratch, length(v)))
-        end, :(AbstractVector{eltype(v)})),
+        (:scratch, (), nothing, :(Union{Nothing, Vector})), # could have different eltype
         (:allow_legacy_dispatch, (), true, Bool)]
     str = string(sym)
     usym = Symbol(:_, sym)
@@ -442,8 +439,58 @@ for (sym, deps, exp, type) in [
     end
 end
 
+## Scratch space management
+
+"""
+    make_scratch(scratch::Union{Nothing, Vector}, T::Type, len::Integer)
+
+Returns `(s, t)` where `t` is an `AbstractVector` of type `T` with length at least `len`
+that is backed by the `Vector` `s`. If `scratch !== nothing`, then `s === scratch`.
+
+This function will allocate a new vector if `scratch === nothing`, `resize!` `scratch` if it
+is too short, and `reinterpret` `scratch` if its eltype is not `T`.
+"""
+function make_scratch(scratch::Nothing, T::Type, len::Integer)
+    s = Vector{T}(undef, len)
+    s, s
+end
+function make_scratch(scratch::Vector{T}, ::Type{T}, len::Integer) where T
+    len > length(scratch) && resize!(scratch, len)
+    scratch, scratch
+end
+function make_scratch(scratch::Vector, T::Type, len::Integer)
+    len_bytes = len * sizeof(T)
+    len_scratch = div(len_bytes, sizeof(eltype(scratch)))
+    len_scratch > length(scratch) && resize!(scratch, len_scratch)
+    scratch, reinterpret(T, scratch)
+end
+
+
 ## sorting algorithm components ##
 
+"""
+    _sort!(v::AbstractVector, a::Algorithm, o::Ordering, kw; t, offset)
+
+An internal function that sorts `v` using the algorithm `a` under the ordering `o`,
+subject to specifications provided in `kw` (such as `lo` and `hi` in which case it only
+sorts `view(v, lo:hi)`)
+
+Returns a scratch space if provided or constructed during the sort, or `nothing` if
+no scratch space is present.
+
+!!! note
+    `_sort!` modifies but does not return `v`.
+
+A returned scratch space will be a `Vector{T}` where `T` is usually the eltype of `v`. There
+are some exceptions, for example if `eltype(v) == Union{Missing, T}` then the scratch space
+may be be a `Vector{T}` due to `MissingOptimization` changing the eltype of `v` to `T`.
+
+`t` is an appropriate scratch space for the algorithm at hand, to be accessed as
+`t[i + offset]`. `t` is used for an algorithm to pass a scratch space back to itself in
+internal or recursive calls.
+"""
+function _sort! end
+
 abstract type Algorithm end
 
 
@@ -526,7 +573,6 @@ function _sort!(v::AbstractVector, a::MissingOptimization, o::Ordering, kw)
     if nonmissingtype(eltype(v)) != eltype(v) && o isa DirectOrdering
         lo, hi = send_to_end!(ismissing, v, o; lo, hi)
         _sort!(WithoutMissingVector(v, unsafe=true), a.next, o, (;kw..., lo, hi))
-        v
     elseif eltype(v) <: Integer && o isa Perm{DirectOrdering} && nonmissingtype(eltype(o.data)) != eltype(o.data)
         lo, hi = send_to_end!(i -> ismissing(@inbounds o.data[i]), v, o)
         _sort!(v, a.next, Perm(o.order, WithoutMissingVector(o.data, unsafe=true)), (;kw..., lo, hi))
@@ -562,18 +608,25 @@ function _sort!(v::AbstractVector, a::IEEEFloatOptimization, o::Ordering, kw)
         lo, hi = send_to_end!(isnan, v, o, true; lo, hi)
         iv = reinterpret(UIntType(eltype(v)), v)
         j = send_to_end!(x -> after_zero(o, x), v; lo, hi)
-        _sort!(iv, a.next, Reverse, (;kw..., lo, hi=j))
-        _sort!(iv, a.next, Forward, (;kw..., lo=j+1, hi))
+        scratch = _sort!(iv, a.next, Reverse, (;kw..., lo, hi=j))
+        if scratch === nothing # Union split
+            _sort!(iv, a.next, Forward, (;kw..., lo=j+1, hi, scratch))
+        else
+            _sort!(iv, a.next, Forward, (;kw..., lo=j+1, hi, scratch))
+        end
     elseif eltype(v) <: Integer && o isa Perm && o.order isa DirectOrdering && is_concrete_IEEEFloat(eltype(o.data))
         lo, hi = send_to_end!(i -> isnan(@inbounds o.data[i]), v, o.order, true; lo, hi)
         ip = reinterpret(UIntType(eltype(o.data)), o.data)
         j = send_to_end!(i -> after_zero(o.order, @inbounds o.data[i]), v; lo, hi)
-        _sort!(v, a.next, Perm(Reverse, ip), (;kw..., lo, hi=j))
-        _sort!(v, a.next, Perm(Forward, ip), (;kw..., lo=j+1, hi))
+        scratch = _sort!(v, a.next, Perm(Reverse, ip), (;kw..., lo, hi=j))
+        if scratch === nothing # Union split
+            _sort!(v, a.next, Perm(Forward, ip), (;kw..., lo=j+1, hi, scratch))
+        else
+            _sort!(v, a.next, Perm(Forward, ip), (;kw..., lo=j+1, hi, scratch))
+        end
     else
         _sort!(v, a.next, o, kw)
     end
-    v
 end
 
 
@@ -591,7 +644,7 @@ end
 _sort!(v::AbstractVector, a::BoolOptimization, o::Ordering, kw) = _sort!(v, a.next, o, kw)
 function _sort!(v::AbstractVector{Bool}, ::BoolOptimization, o::Ordering, kw)
     first = lt(o, false, true) ? false : lt(o, true, false) ? true : return v
-    @getkw lo hi
+    @getkw lo hi scratch
     count = 0
     @inbounds for i in lo:hi
         if v[i] == first
@@ -600,7 +653,7 @@ function _sort!(v::AbstractVector{Bool}, ::BoolOptimization, o::Ordering, kw)
     end
     @inbounds v[lo:lo+count-1] .= first
     @inbounds v[lo+count:hi] .= !first
-    v
+    scratch
 end
 
 
@@ -667,7 +720,7 @@ struct InsertionSort <: Algorithm end
 
 const SMALL_ALGORITHM = InsertionSort()
 function _sort!(v::AbstractVector, ::InsertionSort, o::Ordering, kw)
-    @getkw lo hi
+    @getkw lo hi scratch
     lo_plus_1 = (lo + 1)::Integer
     @inbounds for i = lo_plus_1:hi
         j = i
@@ -682,7 +735,7 @@ function _sort!(v::AbstractVector, ::InsertionSort, o::Ordering, kw)
         end
         v[j] = x
     end
-    return v
+    scratch
 end
 
 
@@ -696,17 +749,17 @@ struct CheckSorted{T <: Algorithm} <: Algorithm
     next::T
 end
 function _sort!(v::AbstractVector, a::CheckSorted, o::Ordering, kw)
-    @getkw lo hi
+    @getkw lo hi scratch
 
     # For most arrays, a presorted check is cheap (overhead < 5%) and for most large
     # arrays it is essentially free (<1%).
-    _issorted(v, lo, hi, o) && return v
+    _issorted(v, lo, hi, o) && return scratch
 
     # For most large arrays, a reverse-sorted check is essentially free (overhead < 1%)
     if hi-lo >= 500 && _issorted(v, lo, hi, ReverseOrdering(o))
         # If reversing is valid, do so. This does violates stability.
         reverse!(v, lo, hi)
-        return v
+        return scratch
     end
 
     _sort!(v, a.next, o, kw)
@@ -725,7 +778,7 @@ struct ComputeExtrema{T <: Algorithm} <: Algorithm
     next::T
 end
 function _sort!(v::AbstractVector, a::ComputeExtrema, o::Ordering, kw)
-    @getkw lo hi
+    @getkw lo hi scratch
     mn = mx = v[lo]
     @inbounds for i in (lo+1):hi
         vi = v[i]
@@ -734,7 +787,7 @@ function _sort!(v::AbstractVector, a::ComputeExtrema, o::Ordering, kw)
     end
     mn, mx
 
-    lt(o, mn, mx) || return v # all same
+    lt(o, mn, mx) || return scratch # all same
 
     _sort!(v, a.next, o, (;kw..., mn, mx))
 end
@@ -779,10 +832,10 @@ struct CountingSort <: Algorithm end
 maybe_reverse(o::ForwardOrdering, x) = x
 maybe_reverse(o::ReverseOrdering, x) = reverse(x)
 function _sort!(v::AbstractVector{<:Integer}, ::CountingSort, o::DirectOrdering, kw)
-    @getkw lo hi mn mx range
+    @getkw lo hi mn mx range scratch
     offs = 1 - (o === Reverse ? mx : mn)
 
-    counts = fill(0, range+1)
+    counts = fill(0, range+1) # TODO use scratch (but be aware of type stability)
     @inbounds for i = lo:hi
         counts[v[i] + offs] += 1
     end
@@ -797,7 +850,7 @@ function _sort!(v::AbstractVector{<:Integer}, ::CountingSort, o::DirectOrdering,
         idx = lastidx + 1
     end
 
-    v
+    scratch
 end
 
 
@@ -865,31 +918,14 @@ function _sort!(v::AbstractVector, a::RadixSort, o::DirectOrdering, kw)
 
     len = hi-lo + 1
     U = UIntMappable(eltype(v), o)
-    # A large if-else chain to avoid type instabilities and dynamic dispatch
-    if scratch !== nothing && checkbounds(Bool, scratch, lo:hi) # Fully preallocated and aligned scratch
-        t = reinterpret(U, scratch)
-        if radix_sort!(u, lo, hi, bits, t)
-            uint_unmap!(v, u, lo, hi, o, umn)
-        else
-            uint_unmap!(v, t, lo, hi, o, umn)
-        end
-    elseif scratch !== nothing && (applicable(resize!, scratch, len) || length(scratch) >= len) # Viable scratch
-        length(scratch) >= len || resize!(scratch, len)
-        t1 = axes(scratch, 1) isa OneTo ? scratch : view(scratch, firstindex(scratch):lastindex(scratch))
-        t = reinterpret(U, t1)
-        if radix_sort!(view(u, lo:hi), 1, len, bits, t)
-            uint_unmap!(view(v, lo:hi), view(u, lo:hi), 1, len, o, umn)
-        else
-            uint_unmap!(view(v, lo:hi), t, 1, len, o, umn)
-        end
-    else # No viable scratch
-        t = similar(u)
-        if radix_sort!(u, lo, hi, bits, t)
-            uint_unmap!(v, u, lo, hi, o, umn)
-        else
-            uint_unmap!(v, t, lo, hi, o, umn)
-        end
+    scratch, t = make_scratch(scratch, eltype(v), len)
+    tu = reinterpret(U, t)
+    if radix_sort!(u, lo, hi, bits, tu, 1-lo)
+        uint_unmap!(v, u, lo, hi, o, umn)
+    else
+        uint_unmap!(v, tu, lo, hi, o, umn, 1-lo)
     end
+    scratch
 end
 
 
@@ -948,67 +984,73 @@ select_pivot(lo::Integer, hi::Integer) = typeof(hi-lo)(hash(lo) % (hi-lo+1)) + l
 #
 # returns (pivot, pivot_index) where pivot_index is the location the pivot
 # should end up, but does not set t[pivot_index] = pivot
-function partition!(t::AbstractVector, lo::Integer, hi::Integer, o::Ordering, v::AbstractVector, rev::Bool)
+function partition!(t::AbstractVector, lo::Integer, hi::Integer, offset::Integer, o::Ordering, v::AbstractVector, rev::Bool)
     pivot_index = select_pivot(lo, hi)
-    trues = 0
     @inbounds begin
         pivot = v[pivot_index]
         while lo < pivot_index
             x = v[lo]
             fx = rev ? !lt(o, x, pivot) : lt(o, pivot, x)
-            t[(fx ? hi : lo) - trues] = x
-            trues += fx
+            t[(fx ? hi : lo) - offset] = x
+            offset += fx
             lo += 1
         end
         while lo < hi
             x = v[lo+1]
             fx = rev ? lt(o, pivot, x) : !lt(o, x, pivot)
-            t[(fx ? hi : lo) - trues] = x
-            trues += fx
+            t[(fx ? hi : lo) - offset] = x
+            offset += fx
             lo += 1
         end
     end
 
-    # pivot_index = lo-trues
+    # pivot_index = lo-offset
     # t[pivot_index] is whatever it was before
     # t[<pivot_index] <* pivot, stable
     # t[>pivot_index] >* pivot, reverse stable
 
-    pivot, lo-trues
+    pivot, lo-offset
 end
 
 function _sort!(v::AbstractVector, a::PartialQuickSort, o::Ordering, kw;
-                t=nothing, swap=false, rev=false)
-    @getkw lo hi t
+                t=nothing, offset=nothing, swap=false, rev=false)
+    @getkw lo hi scratch
+
+    if t === nothing
+        scratch, t = make_scratch(scratch, eltype(v), hi-lo+1)
+        offset = 1-lo
+        kw = (;kw..., scratch)
+    end
 
     while lo < hi && hi - lo > SMALL_THRESHOLD
-        pivot, j = swap ? partition!(v, lo, hi, o, t, rev) : partition!(t, lo, hi, o, v, rev)
+        pivot, j = swap ? partition!(v, lo+offset, hi+offset, offset, o, t, rev) : partition!(t, lo, hi, -offset, o, v, rev)
+        j -= !swap*offset
         @inbounds v[j] = pivot
         swap = !swap
 
         # For QuickSort, a.lo === a.hi === missing, so the first two branches get skipped
         if !ismissing(a.lo) && j <= a.lo # Skip sorting the lower part
-            swap && copyto!(v, lo, t, lo, j-lo)
+            swap && copyto!(v, lo, t, lo+offset, j-lo)
             rev && reverse!(v, lo, j-1)
             lo = j+1
             rev = !rev
         elseif !ismissing(a.hi) && a.hi <= j # Skip sorting the upper part
-            swap && copyto!(v, j+1, t, j+1, hi-j)
+            swap && copyto!(v, j+1, t, j+1+offset, hi-j)
             rev || reverse!(v, j+1, hi)
             hi = j-1
         elseif j-lo < hi-j
             # Sort the lower part recursively because it is smaller. Recursing on the
             # smaller part guarantees O(log(n)) stack space even on pathological inputs.
-            _sort!(v, a, o, (;kw..., lo, hi=j-1); swap, rev)
+            _sort!(v, a, o, (;kw..., lo, hi=j-1); t, offset, swap, rev)
             lo = j+1
             rev = !rev
         else # Sort the higher part recursively
-            _sort!(v, a, o, (;kw..., lo=j+1, hi); swap, rev=!rev)
+            _sort!(v, a, o, (;kw..., lo=j+1, hi); t, offset, swap, rev=!rev)
             hi = j-1
         end
     end
-    hi < lo && return v
-    swap && copyto!(v, lo, t, lo, hi-lo+1)
+    hi < lo && return scratch
+    swap && copyto!(v, lo, t, lo+offset, hi-lo+1)
     rev && reverse!(v, lo, hi)
     _sort!(v, a.next, o, (;kw..., lo, hi))
 end
@@ -1027,12 +1069,13 @@ struct StableCheckSorted{T<:Algorithm} <: Algorithm
     next::T
 end
 function _sort!(v::AbstractVector, a::StableCheckSorted, o::Ordering, kw)
-    @getkw lo hi
+    @getkw lo hi scratch
     if _issorted(v, lo, hi, o)
-        return v
+        return scratch
     elseif _issorted(v, lo, hi, Lt((x, y) -> !lt(o, x, y)))
         # Reverse only if necessary. Using issorted(..., Reverse(o)) would violate stability.
-        return reverse!(v, lo, hi)
+        reverse!(v, lo, hi)
+        return scratch
     end
 
     _sort!(v, a.next, o, kw)
@@ -1042,23 +1085,24 @@ end
 # The return value indicates whether v is sorted (true) or t is sorted (false)
 # This is one of the many reasons radix_sort! is not exported.
 function radix_sort!(v::AbstractVector{U}, lo::Integer, hi::Integer, bits::Unsigned,
-                     t::AbstractVector{U}, chunk_size=radix_chunk_size_heuristic(lo, hi, bits)) where U <: Unsigned
+                     t::AbstractVector{U}, offset::Integer,
+                     chunk_size=radix_chunk_size_heuristic(lo, hi, bits)) where U <: Unsigned
     # bits is unsigned for performance reasons.
-    counts = Vector{Int}(undef, 1 << chunk_size + 1)
+    counts = Vector{Int}(undef, 1 << chunk_size + 1) # TODO use scratch for this
 
     shift = 0
     while true
-        @noinline radix_sort_pass!(t, lo, hi, counts, v, shift, chunk_size)
+        @noinline radix_sort_pass!(t, lo, hi, offset, counts, v, shift, chunk_size)
         # the latest data resides in t
         shift += chunk_size
         shift < bits || return false
-        @noinline radix_sort_pass!(v, lo, hi, counts, t, shift, chunk_size)
+        @noinline radix_sort_pass!(v, lo+offset, hi+offset, -offset, counts, t, shift, chunk_size)
         # the latest data resides in v
         shift += chunk_size
         shift < bits || return true
     end
 end
-function radix_sort_pass!(t, lo, hi, counts, v, shift, chunk_size)
+function radix_sort_pass!(t, lo, hi, offset, counts, v, shift, chunk_size)
     mask = UInt(1) << chunk_size - 1  # mask is defined in pass so that the compiler
     @inbounds begin                   #  ↳ knows it's shape
         # counts[2:mask+2] will store the number of elements that fall into each bucket.
@@ -1081,7 +1125,7 @@ function radix_sort_pass!(t, lo, hi, counts, v, shift, chunk_size)
             x = v[k]                  # lookup the element
             i = (x >> shift)&mask + 1 # compute its bucket's index for this pass
             j = counts[i]             # lookup the target index
-            t[j] = x                  # put the element where it belongs
+            t[j + offset] = x         # put the element where it belongs
             counts[i] = j + 1         # increment the target index for the next
         end                           #  ↳ element in this bucket
     end
@@ -1310,8 +1354,9 @@ function sort!(v::AbstractVector{T};
                by=identity,
                rev::Union{Bool,Nothing}=nothing,
                order::Ordering=Forward,
-               scratch::Union{AbstractVector{T}, Nothing}=nothing) where T
+               scratch::Union{Vector{T}, Nothing}=nothing) where T
     _sort!(v, getalg(alg), ord(lt,by,rev,order), (;scratch))
+    v
 end
 
 """
@@ -1494,7 +1539,7 @@ function sortperm(A::AbstractArray;
                   by=identity,
                   rev::Union{Bool,Nothing}=nothing,
                   order::Ordering=Forward,
-                  scratch::Union{AbstractVector{<:Integer}, Nothing}=nothing,
+                  scratch::Union{Vector{<:Integer}, Nothing}=nothing,
                   dims...) #to optionally specify dims argument
     ordr = ord(lt,by,rev,order)
     if ordr === Forward && isa(A,Vector) && eltype(A)<:Integer
@@ -1555,7 +1600,7 @@ function sortperm!(ix::AbstractArray{T}, A::AbstractArray;
                    rev::Union{Bool,Nothing}=nothing,
                    order::Ordering=Forward,
                    initialized::Bool=false,
-                   scratch::Union{AbstractVector{T}, Nothing}=nothing,
+                   scratch::Union{Vector{T}, Nothing}=nothing,
                    dims...) where T <: Integer #to optionally specify dims argument
     (typeof(A) <: AbstractVector) == (:dims in keys(dims)) && throw(ArgumentError("Dims argument incorrect for type $(typeof(A))"))
     axes(ix) == axes(A) || throw(ArgumentError("index array must have the same size/axes as the source array, $(axes(ix)) != $(axes(A))"))
@@ -1628,7 +1673,7 @@ function sort(A::AbstractArray{T};
               by=identity,
               rev::Union{Bool,Nothing}=nothing,
               order::Ordering=Forward,
-              scratch::Union{AbstractVector{T}, Nothing}=similar(A, size(A, dims))) where T
+              scratch::Union{Vector{T}, Nothing}=nothing) where T
     dim = dims
     order = ord(lt,by,rev,order)
     n = length(axes(A, dim))
@@ -1636,19 +1681,31 @@ function sort(A::AbstractArray{T};
         pdims = (dim, setdiff(1:ndims(A), dim)...)  # put the selected dimension first
         Ap = permutedims(A, pdims)
         Av = vec(Ap)
-        sort_chunks!(Av, n, alg, order, scratch)
+        sort_chunks!(Av, n, getalg(alg), order, scratch)
         permutedims(Ap, invperm(pdims))
     else
         Av = A[:]
-        sort_chunks!(Av, n, alg, order, scratch)
+        sort_chunks!(Av, n, getalg(alg), order, scratch)
         reshape(Av, axes(A))
     end
 end
 
 @noinline function sort_chunks!(Av, n, alg, order, scratch)
     inds = LinearIndices(Av)
-    for lo = first(inds):n:last(inds)
-        _sort!(Av, getalg(alg), order, (; lo, hi=lo+n-1, scratch))
+    sort_chunks!(Av, n, alg, order, scratch, first(inds), last(inds))
+end
+
+@noinline function sort_chunks!(Av, n, alg, order, scratch::Nothing, fst, lst)
+    for lo = fst:n:lst
+        s = _sort!(Av, alg, order, (; lo, hi=lo+n-1, scratch))
+        s !== nothing && return sort_chunks!(Av, n, alg, order, s, lo+n, lst)
+    end
+    Av
+end
+
+@noinline function sort_chunks!(Av, n, alg, order, scratch::AbstractVector, fst, lst)
+    for lo = fst:n:lst
+        _sort!(Av, alg, order, (; lo, hi=lo+n-1, scratch))
     end
     Av
 end
@@ -1689,14 +1746,14 @@ function sort!(A::AbstractArray{T};
                lt=isless,
                by=identity,
                rev::Union{Bool,Nothing}=nothing,
-               order::Ordering=Forward,
-               scratch::Union{AbstractVector{T}, Nothing}=similar(A, size(A, dims))) where T
+               order::Ordering=Forward, # TODO stop eagerly over-allocating.
+               scratch::Union{Vector{T}, Nothing}=similar(A, size(A, dims))) where T
     __sort!(A, Val(dims), getalg(alg), ord(lt, by, rev, order), scratch)
 end
 function __sort!(A::AbstractArray{T}, ::Val{K},
                 alg::Union{Algorithm, Type{<:Algorithm}},
                 order::Ordering,
-                scratch::Union{AbstractVector{T}, Nothing}) where {K,T}
+                scratch::Union{Vector{T}, Nothing}) where {K,T}
     nd = ndims(A)
 
     1 <= K <= nd || throw(ArgumentError("dimension out of range"))
@@ -1787,9 +1844,10 @@ function uint_map!(v::AbstractVector, lo::Integer, hi::Integer, order::Ordering)
 end
 
 function uint_unmap!(v::AbstractVector, u::AbstractVector{U}, lo::Integer, hi::Integer,
-                     order::Ordering, offset::U=zero(U)) where U <: Unsigned
+                     order::Ordering, offset::U=zero(U),
+                     index_offset::Integer=0) where U <: Unsigned
     @inbounds for i in lo:hi
-        v[i] = uint_unmap(eltype(v), u[i]+offset, order)
+        v[i] = uint_unmap(eltype(v), u[i+index_offset]+offset, order)
     end
     v
 end
@@ -1819,46 +1877,47 @@ struct MergeSort{T <: Algorithm} <: Algorithm
 end
 MergeSort() = MergeSort(SMALL_ALGORITHM)
 
-function _sort!(v::AbstractVector, a::MergeSort, o::Ordering, kw)
+function _sort!(v::AbstractVector, a::MergeSort, o::Ordering, kw; t=nothing, offset=nothing)
     @getkw lo hi scratch
     @inbounds if lo < hi
         hi-lo <= SMALL_THRESHOLD && return _sort!(v, a.next, o, kw)
 
         m = midpoint(lo, hi)
 
-        t = scratch === nothing ? similar(v, m-lo+1) : scratch
-        length(t) < m-lo+1 && resize!(t, m-lo+1)
-        Base.require_one_based_indexing(t)
+        if t === nothing
+            scratch, t = make_scratch(scratch, eltype(v), m-lo+1)
+            offset = 1-lo
+        end
 
-        _sort!(v, a, o, (;kw..., hi=m, scratch=t))
-        _sort!(v, a, o, (;kw..., lo=m+1, scratch=t))
+        _sort!(v, a, o, (;kw..., hi=m, scratch); t, offset)
+        _sort!(v, a, o, (;kw..., lo=m+1, scratch); t, offset)
 
         i, j = 1, lo
         while j <= m
-            t[i] = v[j]
+            t[i+offset] = v[j]
             i += 1
             j += 1
         end
 
         i, k = 1, lo
         while k < j <= hi
-            if lt(o, v[j], t[i])
+            if lt(o, v[j], t[i+offset])
                 v[k] = v[j]
                 j += 1
             else
-                v[k] = t[i]
+                v[k] = t[i+offset]
                 i += 1
             end
             k += 1
         end
         while k < j
-            v[k] = t[i]
+            v[k] = t[i+offset]
             k += 1
             i += 1
         end
     end
 
-    return v
+    scratch
 end
 
 # Support alg=InsertionSort and alg=MergeSort for backwards compatability (prefer InsertionSort() and MergeSort())
@@ -1866,15 +1925,22 @@ getalg(a::Algorithm) = a
 getalg(::Type{A}) where A <: Algorithm = A()
 
 # Support 3- and 5-argument versions of sort! for calling into the internals in the old way
-sort!(v::AbstractVector, a::Union{Algorithm, Type{<:Algorithm}}, o::Ordering) = _sort!(v, getalg(a), o, (; allow_legacy_dispatch=false))
-sort!(v::AbstractVector, lo::Integer, hi::Integer, a::Union{Algorithm, Type{<:Algorithm}}, o::Ordering) = _sort!(v, getalg(a), o, (; lo, hi, allow_legacy_dispatch=false))
+function sort!(v::AbstractVector, a::Union{Algorithm, Type{<:Algorithm}}, o::Ordering)
+    _sort!(v, getalg(a), o, (; allow_legacy_dispatch=false))
+    v
+end
+function sort!(v::AbstractVector, lo::Integer, hi::Integer, a::Union{Algorithm, Type{<:Algorithm}}, o::Ordering)
+    _sort!(v, getalg(a), o, (; lo, hi, allow_legacy_dispatch=false))
+    v
+end
 
 # Support dispatch on custom algorithms in the old way
 # sort!(::AbstractVector, ::Integer, ::Integer, ::MyCustomAlgorithm, ::Ordering) = ...
 function _sort!(v::AbstractVector, a::Algorithm, o::Ordering, kw)
-    @getkw lo hi allow_legacy_dispatch
+    @getkw lo hi scratch allow_legacy_dispatch
     if allow_legacy_dispatch
         sort!(v, lo, hi, a, o)
+        scratch
     else
         # This error prevents infinite recursion for unknown algorithms
         throw(ArgumentError("Base.Sort._sort!(::$(typeof(v)), ::$(typeof(a)), ::$(typeof(o))) is not defined"))
diff --git a/test/sorting.jl b/test/sorting.jl
index f9faecd2ff808..99d84a8211697 100644
--- a/test/sorting.jl
+++ b/test/sorting.jl
@@ -784,13 +784,15 @@ end
 
                     x = rand(1:n+1, n)
                     y = sort(x; order)
-                    @test y == Base.Sort._sort!(x, alg, order, (;kw(y)...)) === x
+                    @test Base.Sort._sort!(x, alg, order, (;kw(y)...)) !== x
+                    @test all(y .=== x)
 
                     alg isa requires_uint_mappable && continue
 
                     x = randn(n)
                     y = sort(x; order)
-                    @test y == Base.Sort._sort!(x, alg, order, (;kw(y)...)) === x
+                    @test Base.Sort._sort!(x, alg, order, (;kw(y)...)) !== x
+                    @test all(y .=== x)
                 end
             end
         end
@@ -822,7 +824,9 @@ end
     @test 1 < length(lines) < 30
 end
 
-@testset "Defining new algorithms & backwards compatibility with packages that use sorting internals" begin
+@testset "Extensibility" begin
+    # Defining new algorithms & backwards compatibility with packages that use sorting internals
+
     struct MyFirstAlg <: Base.Sort.Algorithm end
 
     @test_throws ArgumentError sort([1,2,3], alg=MyFirstAlg()) # not a stack overflow error

From a2c264644437157e899afb438c48cd488111ab7a Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Tue, 15 Nov 2022 08:02:25 +0600
Subject: [PATCH 19/29] Consistency with other constructors

---
 base/sort.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index 93e41ece28725..cabfdfeb77ff1 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -948,8 +948,8 @@ struct PartialQuickSort{L<:Union{Integer,Missing}, H<:Union{Integer,Missing}, T<
     hi::H
     next::T
 end
-PartialQuickSort(k::Integer) = InitialOptimizations(PartialQuickSort(missing, k, SMALL_ALGORITHM))
-PartialQuickSort(k::OrdinalRange) = InitialOptimizations(PartialQuickSort(first(k), last(k), SMALL_ALGORITHM))
+PartialQuickSort(k::Integer) = PartialQuickSort(missing, k, SMALL_ALGORITHM)
+PartialQuickSort(k::OrdinalRange) = PartialQuickSort(first(k), last(k), SMALL_ALGORITHM)
 _PartialQuickSort(k::Integer) = InitialOptimizations(PartialQuickSort(k:k))
 _PartialQuickSort(k::OrdinalRange) = InitialOptimizations(PartialQuickSort(k))
 

From e752ea72d106e8acf76aa7621ce2e871f3518809 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Fri, 18 Nov 2022 07:34:43 +0600
Subject: [PATCH 20/29] pass around even fewer easily computed things in kw to
 reduce load on the compiler

---
 base/sort.jl | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index cabfdfeb77ff1..d610193b8eb8a 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -419,14 +419,6 @@ for (sym, deps, exp, type) in [
         (:hi, (), :(lastindex(v)),  Integer),
         (:mn, (), :(throw(ArgumentError("mn is needed but has not been computed"))), :(eltype(v))),
         (:mx, (), :(throw(ArgumentError("mx is needed but has not been computed"))), :(eltype(v))),
-        (:range, (:mn, :mx), quote
-            o isa DirectOrdering || throw(ArgumentError("Cannot compute range under ordering $o"))
-            maybe_unsigned(o === Reverse ? mn-mx : mx-mn)
-        end, Integer),
-        (:umn, (:mn,), :(uint_map(mn, o)), Unsigned),
-        (:umx, (:mx,), :(uint_map(mx, o)), Unsigned),
-        (:urange, (:umn, :umx), :(umx-umn), Unsigned),
-        (:bits, (:urange,), :(unsigned(8sizeof(urange) - leading_zeros(urange))), Unsigned),
         (:scratch, (), nothing, :(Union{Nothing, Vector})), # could have different eltype
         (:allow_legacy_dispatch, (), true, Bool)]
     str = string(sym)
@@ -809,7 +801,9 @@ struct ConsiderCountingSort{T <: Algorithm, U <: Algorithm} <: Algorithm
 end
 ConsiderCountingSort(next) = ConsiderCountingSort(CountingSort(), next)
 function _sort!(v::AbstractVector{<:Integer}, a::ConsiderCountingSort, o::DirectOrdering, kw)
-    @getkw lo hi range
+    @getkw lo hi mn mx
+    range = maybe_unsigned(o === Reverse ? mn-mx : mx-mn)
+
     if range < (sizeof(eltype(v)) > 8 ? 5(hi-lo)-100 : div(hi-lo, 2))
         _sort!(v, a.counting, o, kw)
     else
@@ -832,7 +826,8 @@ struct CountingSort <: Algorithm end
 maybe_reverse(o::ForwardOrdering, x) = x
 maybe_reverse(o::ReverseOrdering, x) = reverse(x)
 function _sort!(v::AbstractVector{<:Integer}, ::CountingSort, o::DirectOrdering, kw)
-    @getkw lo hi mn mx range scratch
+    @getkw lo hi mn mx scratch
+    range = maybe_unsigned(o === Reverse ? mn-mx : mx-mn)
     offs = 1 - (o === Reverse ? mx : mn)
 
     counts = fill(0, range+1) # TODO use scratch (but be aware of type stability)
@@ -866,7 +861,9 @@ struct ConsiderRadixSort{T <: Algorithm, U <: Algorithm} <: Algorithm
 end
 ConsiderRadixSort(next) = ConsiderRadixSort(RadixSort(), next)
 function _sort!(v::AbstractVector, a::ConsiderRadixSort, o::DirectOrdering, kw)
-    @getkw bits lo hi
+    @getkw lo hi mn mx
+    urange = uint_map(mx, o)-uint_map(mn, o)
+    bits = unsigned(8sizeof(urange) - leading_zeros(urange))
     if sizeof(eltype(v)) <= 8 && bits+70 < 22log(hi-lo)
         _sort!(v, a.radix, o, kw)
     else
@@ -898,7 +895,10 @@ Each pass divides the input into `2^chunk_size == mask+1` buckets. To do this, i
 """
 struct RadixSort <: Algorithm end
 function _sort!(v::AbstractVector, a::RadixSort, o::DirectOrdering, kw)
-    @getkw lo hi umn scratch bits
+    @getkw lo hi mn mx scratch
+    umn = uint_map(mn, o)
+    urange = uint_map(mx, o)-umn
+    bits = unsigned(8sizeof(urange) - leading_zeros(urange))
 
     # At this point, we are committed to radix sort.
     u = uint_map!(v, lo, hi, o)
@@ -916,10 +916,8 @@ function _sort!(v::AbstractVector, a::RadixSort, o::DirectOrdering, kw)
         u[i] -= umn
     end
 
-    len = hi-lo + 1
-    U = UIntMappable(eltype(v), o)
-    scratch, t = make_scratch(scratch, eltype(v), len)
-    tu = reinterpret(U, t)
+    scratch, t = make_scratch(scratch, eltype(v), hi-lo+1)
+    tu = reinterpret(eltype(u), t)
     if radix_sort!(u, lo, hi, bits, tu, 1-lo)
         uint_unmap!(v, u, lo, hi, o, umn)
     else

From 69677ba4fa57d47b735a812727662936e5b793ad Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Sat, 19 Nov 2022 11:42:29 +0600
Subject: [PATCH 21/29] revert "remove InsertionSortAlg and MergeSortAlg" for
 backwards compatibility with folks who use internals (DataFrames.jl)

---
 base/sort.jl | 59 +++++++++++++++++++++++++---------------------------
 1 file changed, 28 insertions(+), 31 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index d610193b8eb8a..745a155b6abb7 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -693,8 +693,10 @@ function _sort!(v::AbstractVector, a::Small{N}, o::Ordering, kw) where N
 end
 
 
+struct InsertionSortAlg <: Algorithm end
+
 """
-    InsertionSort()
+    InsertionSort
 
 Use the insertion sort algorithm.
 
@@ -708,10 +710,10 @@ Characteristics:
 * *quadratic performance* in the number of elements to be sorted:
 it is well-suited to small collections but should not be used for large ones.
 """
-struct InsertionSort <: Algorithm end
+const InsertionSort = InsertionSortAlg()
+const SMALL_ALGORITHM = InsertionSortAlg()
 
-const SMALL_ALGORITHM = InsertionSort()
-function _sort!(v::AbstractVector, ::InsertionSort, o::Ordering, kw)
+function _sort!(v::AbstractVector, ::InsertionSortAlg, o::Ordering, kw)
     @getkw lo hi scratch
     lo_plus_1 = (lo + 1)::Integer
     @inbounds for i = lo_plus_1:hi
@@ -1347,13 +1349,13 @@ julia> v = [(1, "c"), (3, "a"), (2, "b")]; sort!(v, by = x -> x[2]); v
 ```
 """
 function sort!(v::AbstractVector{T};
-               alg::Union{Algorithm, Type{<:Algorithm}}=defalg(v),
+               alg::Algorithm=defalg(v),
                lt=isless,
                by=identity,
                rev::Union{Bool,Nothing}=nothing,
                order::Ordering=Forward,
                scratch::Union{Vector{T}, Nothing}=nothing) where T
-    _sort!(v, getalg(alg), ord(lt,by,rev,order), (;scratch))
+    _sort!(v, alg, ord(lt,by,rev,order), (;scratch))
     v
 end
 
@@ -1532,7 +1534,7 @@ julia> sortperm(A, dims = 2)
 ```
 """
 function sortperm(A::AbstractArray;
-                  alg::Union{Algorithm, Type{<:Algorithm}}=DEFAULT_UNSTABLE,
+                  alg::Algorithm=DEFAULT_UNSTABLE,
                   lt=isless,
                   by=identity,
                   rev::Union{Bool,Nothing}=nothing,
@@ -1592,7 +1594,7 @@ julia> sortperm!(p, A; dims=2); p
 ```
 """
 function sortperm!(ix::AbstractArray{T}, A::AbstractArray;
-                   alg::Union{Algorithm, Type{<:Algorithm}}=DEFAULT_UNSTABLE,
+                   alg::Algorithm=DEFAULT_UNSTABLE,
                    lt=isless,
                    by=identity,
                    rev::Union{Bool,Nothing}=nothing,
@@ -1666,7 +1668,7 @@ julia> sort(A, dims = 2)
 """
 function sort(A::AbstractArray{T};
               dims::Integer,
-              alg::Union{Algorithm, Type{<:Algorithm}}=defalg(A),
+              alg::Algorithm=defalg(A),
               lt=isless,
               by=identity,
               rev::Union{Bool,Nothing}=nothing,
@@ -1679,11 +1681,11 @@ function sort(A::AbstractArray{T};
         pdims = (dim, setdiff(1:ndims(A), dim)...)  # put the selected dimension first
         Ap = permutedims(A, pdims)
         Av = vec(Ap)
-        sort_chunks!(Av, n, getalg(alg), order, scratch)
+        sort_chunks!(Av, n, alg, order, scratch)
         permutedims(Ap, invperm(pdims))
     else
         Av = A[:]
-        sort_chunks!(Av, n, getalg(alg), order, scratch)
+        sort_chunks!(Av, n, alg, order, scratch)
         reshape(Av, axes(A))
     end
 end
@@ -1740,16 +1742,16 @@ julia> sort!(A, dims = 2); A
 """
 function sort!(A::AbstractArray{T};
                dims::Integer,
-               alg::Union{Algorithm, Type{<:Algorithm}}=defalg(A),
+               alg::Algorithm=defalg(A),
                lt=isless,
                by=identity,
                rev::Union{Bool,Nothing}=nothing,
                order::Ordering=Forward, # TODO stop eagerly over-allocating.
                scratch::Union{Vector{T}, Nothing}=similar(A, size(A, dims))) where T
-    __sort!(A, Val(dims), getalg(alg), ord(lt, by, rev, order), scratch)
+    __sort!(A, Val(dims), alg, ord(lt, by, rev, order), scratch)
 end
 function __sort!(A::AbstractArray{T}, ::Val{K},
-                alg::Union{Algorithm, Type{<:Algorithm}},
+                alg::Algorithm,
                 order::Ordering,
                 scratch::Union{Vector{T}, Nothing}) where {K,T}
     nd = ndims(A)
@@ -1852,10 +1854,14 @@ end
 
 
 
-### Unused constructs for backward compatability ###
+### Unused constructs for backward compatibility ###
+
+struct MergeSortAlg{T <: Algorithm} <: Algorithm
+    next::T
+end
 
 """
-    MergeSort()
+    MergeSort
 
 Indicate that a sorting function should use the merge sort algorithm.
 
@@ -1870,12 +1876,9 @@ Characteristics:
   * *not in-place* in memory.
   * *divide-and-conquer* sort strategy.
 """
-struct MergeSort{T <: Algorithm} <: Algorithm
-    next::T
-end
-MergeSort() = MergeSort(SMALL_ALGORITHM)
+const MergeSort = MergeSortAlg(SMALL_ALGORITHM)
 
-function _sort!(v::AbstractVector, a::MergeSort, o::Ordering, kw; t=nothing, offset=nothing)
+function _sort!(v::AbstractVector, a::MergeSortAlg, o::Ordering, kw; t=nothing, offset=nothing)
     @getkw lo hi scratch
     @inbounds if lo < hi
         hi-lo <= SMALL_THRESHOLD && return _sort!(v, a.next, o, kw)
@@ -1918,17 +1921,13 @@ function _sort!(v::AbstractVector, a::MergeSort, o::Ordering, kw; t=nothing, off
     scratch
 end
 
-# Support alg=InsertionSort and alg=MergeSort for backwards compatability (prefer InsertionSort() and MergeSort())
-getalg(a::Algorithm) = a
-getalg(::Type{A}) where A <: Algorithm = A()
-
 # Support 3- and 5-argument versions of sort! for calling into the internals in the old way
-function sort!(v::AbstractVector, a::Union{Algorithm, Type{<:Algorithm}}, o::Ordering)
-    _sort!(v, getalg(a), o, (; allow_legacy_dispatch=false))
+function sort!(v::AbstractVector, a::Algorithm, o::Ordering)
+    _sort!(v, a, o, (; allow_legacy_dispatch=false))
     v
 end
-function sort!(v::AbstractVector, lo::Integer, hi::Integer, a::Union{Algorithm, Type{<:Algorithm}}, o::Ordering)
-    _sort!(v, getalg(a), o, (; lo, hi, allow_legacy_dispatch=false))
+function sort!(v::AbstractVector, lo::Integer, hi::Integer, a::Algorithm, o::Ordering)
+    _sort!(v, a, o, (; lo, hi, allow_legacy_dispatch=false))
     v
 end
 
@@ -1948,7 +1947,5 @@ end
 # Keep old internal types so that people can keep dispatching with
 # sort!(::AbstractVector, ::Integer, ::Integer, ::Base.QuickSortAlg, ::Ordering) = ...
 const QuickSortAlg = typeof(QuickSort)
-const MergeSortAlg = typeof(MergeSort)
-const InsertionSortAlg = typeof(InsertionSort)
 
 end # module Sort

From bc27dcaef56fc562e1e046efc47abc8c4b3e4f1c Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Sun, 20 Nov 2022 15:55:30 +0600
Subject: [PATCH 22/29] remove type constraint that is tricky for the
 compiler to handle

---
 base/sort.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/sort.jl b/base/sort.jl
index 745a155b6abb7..da9a25a5197b8 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -499,7 +499,7 @@ struct MissingOptimization{T <: Algorithm} <: Algorithm
     next::T
 end
 
-struct WithoutMissingVector{T, U <: AbstractVector{Union{T, Missing}}} <: AbstractVector{T}
+struct WithoutMissingVector{T, U} <: AbstractVector{T}
     data::U
     function WithoutMissingVector(data; unsafe=false)
         if !unsafe && any(ismissing, data)

From ab549f5fe6555e9ec7a8ffea5fd6cf29c8e20c6e Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Sun, 20 Nov 2022 19:29:46 +0600
Subject: [PATCH 23/29] improve legacy dispatch system

---
 base/sort.jl    | 5 +----
 test/sorting.jl | 8 ++++++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index da9a25a5197b8..a9e656720cc3f 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -1922,10 +1922,7 @@ function _sort!(v::AbstractVector, a::MergeSortAlg, o::Ordering, kw; t=nothing,
 end
 
 # Support 3- and 5-argument versions of sort! for calling into the internals in the old way
-function sort!(v::AbstractVector, a::Algorithm, o::Ordering)
-    _sort!(v, a, o, (; allow_legacy_dispatch=false))
-    v
-end
+sort!(v::AbstractVector, a::Algorithm, o::Ordering) = sort!(v, firstindex(v), lastindex(v), a, o)
 function sort!(v::AbstractVector, lo::Integer, hi::Integer, a::Algorithm, o::Ordering)
     _sort!(v, a, o, (; lo, hi, allow_legacy_dispatch=false))
     v
diff --git a/test/sorting.jl b/test/sorting.jl
index 99d84a8211697..bcee3245df0cb 100644
--- a/test/sorting.jl
+++ b/test/sorting.jl
@@ -840,14 +840,18 @@ end
     @test sort([1,2,3], alg=MyFirstAlg()) == [7,7,7]
     @test all(sort(v, alg=Base.Sort.InitialOptimizations(MyFirstAlg())) .=== vcat(fill(7, 100), fill(missing, 10)))
 
-    # Use the pre 1.9 hook into the internals
+    # Using the old hook with old entry-point
+    @test sort!([3,1,2], MyFirstAlg(), Base.Forward) == [7,7,7]
+    @test sort!([3,1,2], 1, 3, MyFirstAlg(), Base.Forward) == [7,7,7]
+
+    # Use the pre 1.9 entry-point into the internals
     function Base.sort!(v::AbstractVector{Int}, lo::Integer, hi::Integer, ::MyFirstAlg, o::Base.Order.Ordering)
         sort!(v, lo, hi, Base.DEFAULT_STABLE, o)
     end
     @test sort([3,1,2], alg=MyFirstAlg()) == [1,2,3]
     @test issorted(sort(v, alg=Base.Sort.InitialOptimizations(MyFirstAlg())))
 
-    # Another pre 1.9 hook into the internals
+    # Another pre 1.9 entry-point into the internals
     @test issorted(sort!(rand(100), InsertionSort, Base.Order.Forward))
 
     struct MySecondAlg <: Base.Sort.Algorithm end

From 9400b9383bc86d1b1d96d67cf12a49e389256957 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Mon, 21 Nov 2022 09:19:24 +0600
Subject: [PATCH 24/29] fix 5-arg MergeSort and add tests for 5-arg sort

---
 base/sort.jl    |  9 ++++-----
 test/sorting.jl | 23 +++++++++++++++++++++++
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index a9e656720cc3f..837af5856d638 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -1887,7 +1887,6 @@ function _sort!(v::AbstractVector, a::MergeSortAlg, o::Ordering, kw; t=nothing,
 
         if t === nothing
             scratch, t = make_scratch(scratch, eltype(v), m-lo+1)
-            offset = 1-lo
         end
 
         _sort!(v, a, o, (;kw..., hi=m, scratch); t, offset)
@@ -1895,24 +1894,24 @@ function _sort!(v::AbstractVector, a::MergeSortAlg, o::Ordering, kw; t=nothing,
 
         i, j = 1, lo
         while j <= m
-            t[i+offset] = v[j]
+            t[i] = v[j]
             i += 1
             j += 1
         end
 
         i, k = 1, lo
         while k < j <= hi
-            if lt(o, v[j], t[i+offset])
+            if lt(o, v[j], t[i])
                 v[k] = v[j]
                 j += 1
             else
-                v[k] = t[i+offset]
+                v[k] = t[i]
                 i += 1
             end
             k += 1
         end
         while k < j
-            v[k] = t[i+offset]
+            v[k] = t[i]
             k += 1
             i += 1
         end
diff --git a/test/sorting.jl b/test/sorting.jl
index bcee3245df0cb..8bad942fb9c81 100644
--- a/test/sorting.jl
+++ b/test/sorting.jl
@@ -864,6 +864,29 @@ end
     @test all(sort(v, alg=Base.Sort.InitialOptimizations(MySecondAlg())) .=== vcat(fill(9, 100), fill(missing, 10)))
 end
 
+@testset "sort!(v, lo, hi, alg, order)" begin
+    v = Vector{Float64}(undef, 4000)
+    for alg in [MergeSort, QuickSort, InsertionSort, Base.DEFAULT_STABLE, Base.DEFAULT_UNSTABLE]
+        rand!(v)
+        sort!(v, 1, 2000, alg, Base.Forward)
+        @test issorted(v[1:2000])
+        @test !issorted(v)
+
+        sort!(v, 2001, 4000, alg, Base.Forward)
+        @test issorted(v[1:2000])
+        @test issorted(v[2001:4000])
+        @test !issorted(v)
+
+        sort!(v, 1001, 3000, alg, Base.Forward)
+        @test issorted(v[1:1000])
+        @test issorted(v[1001:3000])
+        @test issorted(v[3001:4000])
+        @test !issorted(v[1:2000])
+        @test !issorted(v[2001:4000])
+        @test !issorted(v)
+    end
+end
+
 # This testset is at the end of the file because it is slow.
 @testset "searchsorted" begin
     numTypes = [ Int8,  Int16,  Int32,  Int64,  Int128,

From a73825afa120b08a04d7018e32858ada624b5917 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Mon, 21 Nov 2022 10:03:06 +0600
Subject: [PATCH 25/29] cleanup interpolation to make JET.jl happy

---
 base/sort.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index 837af5856d638..3e2c8aca1953d 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -421,10 +421,11 @@ for (sym, deps, exp, type) in [
         (:mx, (), :(throw(ArgumentError("mx is needed but has not been computed"))), :(eltype(v))),
         (:scratch, (), nothing, :(Union{Nothing, Vector})), # could have different eltype
         (:allow_legacy_dispatch, (), true, Bool)]
-    str = string(sym)
     usym = Symbol(:_, sym)
     @eval function $usym(v, o, kw)
-        Symbol($str) ∈ keys(kw) && return kw, kw[Symbol($str)]::$type # TODO this interpolation feels too complicated
+        # use missing (not nothing) as the not-found sentinel: scratch may itself be nothing.
+        res = get(kw, $(Expr(:quote, sym)), missing)
+        res !== missing && return kw, res::$type
         @getkw $(deps...)
         $sym = $exp
         (;kw..., $sym), $sym::$type

From fef85c011853331659af2d946222c81f030e1282 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Mon, 21 Nov 2022 10:29:19 +0600
Subject: [PATCH 26/29] fix and test handling -0.0 in IEEEFloatOptimization

---
 base/sort.jl    | 4 ++--
 test/sorting.jl | 7 +++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index 3e2c8aca1953d..7f8ea621be80a 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -592,8 +592,8 @@ end
 UIntType(::Type{Float16}) = UInt16
 UIntType(::Type{Float32}) = UInt32
 UIntType(::Type{Float64}) = UInt64
-after_zero(::ForwardOrdering, x) = 0 <= x
-after_zero(::ReverseOrdering, x) = x < 0
+after_zero(::ForwardOrdering, x) = !signbit(x)
+after_zero(::ReverseOrdering, x) = signbit(x)
 is_concrete_IEEEFloat(T::Type) = T <: Base.IEEEFloat && isconcretetype(T)
 function _sort!(v::AbstractVector, a::IEEEFloatOptimization, o::Ordering, kw)
     @getkw lo hi
diff --git a/test/sorting.jl b/test/sorting.jl
index 8bad942fb9c81..d8e422ff52aae 100644
--- a/test/sorting.jl
+++ b/test/sorting.jl
@@ -887,6 +887,13 @@ end
     end
 end
 
+@testset "IEEEFloatOptimization with -0.0" begin
+    x = vcat(round.(100 .* randn(1000)) ./ 100) # Also test lots of duplicates
+    x[rand(1:1000, 5)] .= 0.0
+    x[rand(1:1000, 5)] .= -0.0  # To be sure that -0.0 is present
+    @test issorted(sort!(x))
+end
+
 # This testset is at the end of the file because it is slow.
 @testset "searchsorted" begin
     numTypes = [ Int8,  Int16,  Int32,  Int64,  Int128,

From 9df25d4cae65129d6ea7ed97c7837a39b377570d Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Mon, 21 Nov 2022 17:10:09 +0600
Subject: [PATCH 27/29] fix and test bug where countsort's correct overflow
 behavior triggers error due to unexpected promotion to UInt

---
 base/sort.jl    | 2 +-
 test/sorting.jl | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/base/sort.jl b/base/sort.jl
index 7f8ea621be80a..313786cee8fb3 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -830,7 +830,7 @@ maybe_reverse(o::ForwardOrdering, x) = x
 maybe_reverse(o::ReverseOrdering, x) = reverse(x)
 function _sort!(v::AbstractVector{<:Integer}, ::CountingSort, o::DirectOrdering, kw)
     @getkw lo hi mn mx scratch
-    range = maybe_unsigned(o === Reverse ? mn-mx : mx-mn)
+    range = o === Reverse ? mn-mx : mx-mn
     offs = 1 - (o === Reverse ? mx : mn)
 
     counts = fill(0, range+1) # TODO use scratch (but be aware of type stability)
diff --git a/test/sorting.jl b/test/sorting.jl
index d8e422ff52aae..37bad7d23c94b 100644
--- a/test/sorting.jl
+++ b/test/sorting.jl
@@ -894,6 +894,11 @@ end
     @test issorted(sort!(x))
 end
 
+@testset "Count sort near the edge of its range" begin
+    @test issorted(sort(rand(typemin(Int):typemin(Int)+100, 1000)))
+    @test issorted(sort(rand(typemax(Int)-100:typemax(Int), 1000)))
+end
+
 # This testset is at the end of the file because it is slow.
 @testset "searchsorted" begin
     numTypes = [ Int8,  Int16,  Int32,  Int64,  Int128,

From 964e58fd6d9887c9ae2de5ba9f7f2e89ac9ec9d4 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Mon, 21 Nov 2022 17:22:30 +0600
Subject: [PATCH 28/29] add type signature to reduce possible method
 ambiguities (e.g. with AbstractTrees v0.3.4's ImplicitRootState)

---
 base/sort.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/sort.jl b/base/sort.jl
index 313786cee8fb3..a397ff49a2c24 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -509,7 +509,7 @@ struct WithoutMissingVector{T, U} <: AbstractVector{T}
         new{nonmissingtype(eltype(data)), typeof(data)}(data)
     end
 end
-Base.@propagate_inbounds function Base.getindex(v::WithoutMissingVector, i)
+Base.@propagate_inbounds function Base.getindex(v::WithoutMissingVector, i::Integer)
     out = v.data[i]
     @assert !(out isa Missing)
     out::eltype(v)

From 037ae71042c576da3ee7078a474e2c761c70b571 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Mon, 21 Nov 2022 17:28:54 +0600
Subject: [PATCH 29/29] support 6-argument sort! because people do actually use
 it.

---
 base/sort.jl | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/base/sort.jl b/base/sort.jl
index a397ff49a2c24..086cbb859f641 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -1921,12 +1921,17 @@ function _sort!(v::AbstractVector, a::MergeSortAlg, o::Ordering, kw; t=nothing,
     scratch
 end
 
-# Support 3- and 5-argument versions of sort! for calling into the internals in the old way
+# Support 3-, 5-, and 6-argument versions of sort! for calling into the internals in the old way
 sort!(v::AbstractVector, a::Algorithm, o::Ordering) = sort!(v, firstindex(v), lastindex(v), a, o)
 function sort!(v::AbstractVector, lo::Integer, hi::Integer, a::Algorithm, o::Ordering)
     _sort!(v, a, o, (; lo, hi, allow_legacy_dispatch=false))
     v
 end
+sort!(v::AbstractVector, lo::Integer, hi::Integer, a::Algorithm, o::Ordering, _) = sort!(v, lo, hi, a, o)
+function sort!(v::AbstractVector, lo::Integer, hi::Integer, a::Algorithm, o::Ordering, scratch::Vector)
+    _sort!(v, a, o, (; lo, hi, scratch, allow_legacy_dispatch=false))
+    v
+end
 
 # Support dispatch on custom algorithms in the old way
 # sort!(::AbstractVector, ::Integer, ::Integer, ::MyCustomAlgorithm, ::Ordering) = ...