Skip to content

Add optimize_for_size variants for stable and unstable sort as well as select_nth_unstable #129587

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Sep 24, 2024
1 change: 1 addition & 0 deletions library/core/src/slice/sort/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ pub mod stable;
pub mod unstable;

pub(crate) mod select;
#[cfg(not(feature = "optimize_for_size"))]
pub(crate) mod shared;
43 changes: 41 additions & 2 deletions library/core/src/slice/sort/select.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
//! better performance than one would get using heapsort as fallback.

use crate::mem::{self, SizedTypeProperties};
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::shared::pivot::choose_pivot;
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
use crate::slice::sort::unstable::quicksort::partition;

Expand Down Expand Up @@ -40,7 +42,13 @@ where
let min_idx = min_index(v, &mut is_less).unwrap();
v.swap(min_idx, index);
} else {
partition_at_index_loop(v, index, None, &mut is_less);
cfg_if! {
if #[cfg(feature = "optimize_for_size")] {
median_of_medians(v, &mut is_less, index);
} else {
partition_at_index_loop(v, index, None, &mut is_less);
}
}
}

let (left, right) = v.split_at_mut(index);
Expand All @@ -53,6 +61,7 @@ where
// most once, it doesn't make sense to use something more sophisticated than insertion-sort.
const INSERTION_SORT_THRESHOLD: usize = 16;

#[cfg(not(feature = "optimize_for_size"))]
fn partition_at_index_loop<'a, T, F>(
mut v: &'a mut [T],
mut index: usize,
Expand Down Expand Up @@ -167,8 +176,15 @@ fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut
loop {
if v.len() <= INSERTION_SORT_THRESHOLD {
if v.len() >= 2 {
insertion_sort_shift_left(v, 1, is_less);
cfg_if! {
if #[cfg(feature = "optimize_for_size")] {
bubble_sort(v, is_less);
} else {
insertion_sort_shift_left(v, 1, is_less);
}
}
}

return;
}

Expand Down Expand Up @@ -298,3 +314,26 @@ fn median_idx<T, F: FnMut(&T, &T) -> bool>(
}
b
}

// The insertion sort from the smallsort module could be re-used here, but supporting
// optimize_for_size there would clutter that module with cfg statements and make it generally
// harder to read and develop. So to decouple things and keep them simple, we use an even smaller
// bubble sort.
#[cfg(feature = "optimize_for_size")]
fn bubble_sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    // Length of the prefix that may still contain out-of-order neighbours. Every full pass
    // moves the largest element of that prefix to its end, so the prefix shrinks by one.
    let mut unsorted_len = v.len();

    while unsorted_len > 1 {
        let mut swapped_any = false;

        for hi in 1..unsorted_len {
            let lo = hi - 1;

            // SAFETY: `1 <= hi < unsorted_len <= v.len()`, so both `lo` and `hi` are in-bounds.
            unsafe {
                if is_less(v.get_unchecked(hi), v.get_unchecked(lo)) {
                    v.swap_unchecked(lo, hi);
                    swapped_any = true;
                }
            }
        }

        // A pass without a single swap means the prefix is already in order.
        if !swapped_any {
            break;
        }

        unsorted_len -= 1;
    }
}
2 changes: 1 addition & 1 deletion library/core/src/slice/sort/shared/smallsort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ where

/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
/// value at position `b_pos` is less than the one at position `a_pos`.
pub unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
Expand Down
58 changes: 44 additions & 14 deletions library/core/src/slice/sort/stable/mod.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
//! This module contains the entry points for `slice::sort`.

#[cfg(not(feature = "optimize_for_size"))]
use crate::cmp;
use crate::intrinsics;
use crate::mem::{self, MaybeUninit, SizedTypeProperties};
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::shared::smallsort::{
insertion_sort_shift_left, StableSmallSortTypeImpl, SMALL_SORT_GENERAL_SCRATCH_LEN,
};
use crate::{cmp, intrinsics};

pub(crate) mod drift;
pub(crate) mod merge;

#[cfg(not(feature = "optimize_for_size"))]
pub(crate) mod drift;
#[cfg(not(feature = "optimize_for_size"))]
pub(crate) mod quicksort;

#[cfg(feature = "optimize_for_size")]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This likely needs to be #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))], see #130818

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch.

pub(crate) mod tiny;

/// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
/// Design document:
/// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md>
Expand All @@ -30,25 +39,46 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less
return;
}

// More advanced sorting methods than insertion sort are faster if called in
// a hot loop for small inputs, but for general-purpose code the small
// binary size of insertion sort is more important. The instruction cache in
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by i-cache
// misses during the sort, and thrashing the i-cache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
insertion_sort_shift_left(v, 1, is_less);
return;
}
cfg_if! {
if #[cfg(feature = "optimize_for_size")] {
let alloc_len = len / 2;

// For small inputs 4KiB of stack storage suffices, which allows us to avoid
// calling the (de-)allocator. Benchmarks showed this was quite beneficial.
let mut stack_buf = AlignedStorage::<T, 4096>::new();
let stack_scratch = stack_buf.as_uninit_slice_mut();
let mut heap_buf;
let scratch = if stack_scratch.len() >= alloc_len {
stack_scratch
} else {
heap_buf = BufT::with_capacity(alloc_len);
heap_buf.as_uninit_slice_mut()
};

driftsort_main::<T, F, BufT>(v, is_less);
tiny::mergesort(v, scratch, is_less);
} else {
// More advanced sorting methods than insertion sort are faster if called in
// a hot loop for small inputs, but for general-purpose code the small
// binary size of insertion sort is more important. The instruction cache in
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by i-cache
// misses during the sort, and thrashing the i-cache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
insertion_sort_shift_left(v, 1, is_less);
return;
}

driftsort_main::<T, F, BufT>(v, is_less);
}
}
}

/// See [`sort`]
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[cfg(not(feature = "optimize_for_size"))]
#[inline(never)]
fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
// By allocating n elements of memory we can ensure the entire input can
Expand Down
75 changes: 75 additions & 0 deletions library/core/src/slice/sort/stable/tiny.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
//! Binary-size optimized mergesort inspired by <https://github.com/voultapher/tiny-sort-rs>.
use crate::mem::{ManuallyDrop, MaybeUninit};
use crate::ptr;
use crate::slice::sort::stable::merge;

/// Tiny recursive top-down merge sort optimized for binary size. It has no adaptiveness
/// whatsoever, no run detection, etc.
///
/// `v` is split in the middle, both halves are sorted recursively and then combined with
/// [`merge::merge`]. `scratch` is handed unchanged to every recursive call and every merge.
///
/// NOTE(review): `scratch` presumably has to provide room for at least `v.len() / 2` elements;
/// confirm against the requirements of `merge::merge`.
#[inline(always)]
pub fn mergesort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    is_less: &mut F,
) {
    match v.len() {
        // Nothing to do, zero or one element is always sorted.
        0 | 1 => {}
        2 => {
            // Handle pairs directly with a branchless conditional swap instead of recursing
            // further. This reduces the recursion depth and improves perf significantly at a
            // small binary-size cost. Trades ~10% perf boost for integers for ~50 bytes in the
            // binary.

            // SAFETY: This arm is only reached for `len == 2`, so the offsets 0 and 1 yield
            // valid pointers to two distinct elements of `v`.
            unsafe {
                swap_if_less(v.as_mut_ptr(), 0, 1, is_less);
            }
        }
        len => {
            let mid = len / 2;

            // SAFETY: `mid <= len`, so both `..mid` and `mid..` are in-bounds ranges of `v`.
            unsafe {
                // Left half first, ...
                mergesort(v.get_unchecked_mut(..mid), scratch, is_less);
                // ... then the right half.
                mergesort(v.get_unchecked_mut(mid..), scratch, is_less);
            }

            // Combine the two sorted halves `v[..mid]` and `v[mid..]`.
            merge::merge(v, scratch, mid, is_less);
        }
    }
}

/// Swaps the values at offsets `a_pos` and `b_pos` from `v_base` if, and only if, the value at
/// `b_pos` is less than the one at `a_pos`. Values that compare equal are left where they are.
///
/// # Safety
///
/// `v_base.add(a_pos)` and `v_base.add(b_pos)` must both yield pointers that are properly
/// aligned, part of the same allocation as `v_base`, and valid for reading and writing an
/// initialized `T`.
unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    // SAFETY: The caller guarantees that `a_pos` and `b_pos` each added to `v_base` yield valid,
    // properly aligned pointers into the same allocation.
    unsafe {
        let first = v_base.add(a_pos);
        let second = v_base.add(b_pos);

        // PANIC SAFETY: The comparison runs before any value is moved. If `is_less` panics, no
        // temporary copy exists yet and the slice is still in a well defined state, without
        // duplicates.

        // Only strictly-less triggers the swap. `is_less` should return false for equal
        // elements, so those are never exchanged.
        let out_of_order = is_less(&*second, &*first);

        // Branchless formulation of:
        //
        // if out_of_order {
        //     ptr::swap(first, second);
        // }
        //
        // Both slots are always written, only the *source* of each write is selected by the
        // comparison result. The goal is to generate cmov instructions here.
        let first_src = if out_of_order { second } else { first };
        let second_src = if out_of_order { first } else { second };

        // Save the future content of `second` before `first` gets overwritten. `ManuallyDrop`
        // ensures this bitwise copy is never dropped.
        let second_val = ManuallyDrop::new(ptr::read(second_src));
        // `first_src` may be `first` itself, hence the overlap-tolerant copy.
        ptr::copy(first_src, first, 1);
        ptr::copy_nonoverlapping(&*second_val, second, 1);
    }
}
36 changes: 24 additions & 12 deletions library/core/src/slice/sort/unstable/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

use crate::intrinsics;
use crate::mem::SizedTypeProperties;
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::shared::find_existing_run;
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;

pub(crate) mod heapsort;
Expand All @@ -28,25 +30,35 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
return;
}

// More advanced sorting methods than insertion sort are faster if called in
// a hot loop for small inputs, but for general-purpose code the small
// binary size of insertion sort is more important. The instruction cache in
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by i-cache
// misses during the sort, and thrashing the i-cache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
insertion_sort_shift_left(v, 1, is_less);
return;
}
cfg_if! {
if #[cfg(feature = "optimize_for_size")] {
// SAFETY: We checked that `len >= 2`.
unsafe {
heapsort::heapsort(v, is_less);
}
} else {
// More advanced sorting methods than insertion sort are faster if called in
// a hot loop for small inputs, but for general-purpose code the small
// binary size of insertion sort is more important. The instruction cache in
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by i-cache
// misses during the sort, and thrashing the i-cache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
insertion_sort_shift_left(v, 1, is_less);
return;
}

ipnsort(v, is_less);
ipnsort(v, is_less);
}
}
}

/// See [`sort`]
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[cfg(not(feature = "optimize_for_size"))]
#[inline(never)]
fn ipnsort<T, F>(v: &mut [T], is_less: &mut F)
where
Expand Down
Loading
Loading