Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 43 additions & 18 deletions src/uu/sort/src/chunks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,17 +53,36 @@ pub struct LineData<'a> {
pub num_infos: Vec<NumInfo>,
pub parsed_floats: Vec<GeneralBigDecimalParseResult>,
pub line_num_floats: Vec<Option<f64>>,
/// Arena buffer holding all collation sort keys concatenated.
pub collation_key_buffer: Vec<u8>,
/// End offsets into `collation_key_buffer` for each line's sort key.
pub collation_key_ends: Vec<usize>,
}

impl LineData<'_> {
    /// Returns the collation sort key stored for the line at `index`.
    ///
    /// Keys live back to back in `collation_key_buffer`. Entry `i` of
    /// `collation_key_ends` is the offset just past key `i`, so each key starts
    /// where its predecessor ended (offset 0 for the very first key).
    ///
    /// # Panics
    ///
    /// Panics if `index` has no entry in `collation_key_ends`, or if the
    /// recorded offsets do not form a valid range of `collation_key_buffer`.
    pub fn collation_key(&self, index: usize) -> &[u8] {
        let ends = &self.collation_key_ends;
        // `checked_sub` yields `None` only for index 0, which has no predecessor.
        let start = index.checked_sub(1).map_or(0, |prev| ends[prev]);
        let end = ends[index];
        &self.collation_key_buffer[start..end]
    }
}

impl Chunk {
/// Destroy this chunk and return its components to be reused.
pub fn recycle(mut self) -> RecycledChunk {
let recycled_contents = self.with_dependent_mut(|_, contents| {
let mut recycled_contents = self.with_dependent_mut(|_, contents| {
contents.lines.clear();
contents.line_data.selections.clear();
contents.line_data.num_infos.clear();
contents.line_data.parsed_floats.clear();
contents.line_data.line_num_floats.clear();
contents.line_data.collation_key_buffer.clear();
contents.line_data.collation_key_ends.clear();
contents.token_buffer.clear();
let lines = unsafe {
// SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
Expand All @@ -81,26 +100,22 @@ impl Chunk {
&mut contents.line_data.selections,
))
};
(
RecycledChunk {
lines,
selections,
std::mem::take(&mut contents.line_data.num_infos),
std::mem::take(&mut contents.line_data.parsed_floats),
std::mem::take(&mut contents.line_data.line_num_floats),
std::mem::take(&mut contents.token_buffer),
contents.line_count_hint,
)
num_infos: std::mem::take(&mut contents.line_data.num_infos),
parsed_floats: std::mem::take(&mut contents.line_data.parsed_floats),
line_num_floats: std::mem::take(&mut contents.line_data.line_num_floats),
collation_key_buffer: std::mem::take(&mut contents.line_data.collation_key_buffer),
collation_key_ends: std::mem::take(&mut contents.line_data.collation_key_ends),
token_buffer: std::mem::take(&mut contents.token_buffer),
line_count_hint: contents.line_count_hint,
// buffer is set below after we consume `self`
buffer: Vec::new(),
}
});
RecycledChunk {
lines: recycled_contents.0,
selections: recycled_contents.1,
num_infos: recycled_contents.2,
parsed_floats: recycled_contents.3,
line_num_floats: recycled_contents.4,
token_buffer: recycled_contents.5,
line_count_hint: recycled_contents.6,
buffer: self.into_owner(),
}
recycled_contents.buffer = self.into_owner();
recycled_contents
}

pub fn lines(&self) -> &Vec<Line<'_>> {
Expand All @@ -118,6 +133,8 @@ pub struct RecycledChunk {
num_infos: Vec<NumInfo>,
parsed_floats: Vec<GeneralBigDecimalParseResult>,
line_num_floats: Vec<Option<f64>>,
collation_key_buffer: Vec<u8>,
collation_key_ends: Vec<usize>,
token_buffer: Vec<Range<usize>>,
line_count_hint: usize,
buffer: Vec<u8>,
Expand All @@ -131,6 +148,8 @@ impl RecycledChunk {
num_infos: Vec::new(),
parsed_floats: Vec::new(),
line_num_floats: Vec::new(),
collation_key_buffer: Vec::new(),
collation_key_ends: Vec::new(),
token_buffer: Vec::new(),
line_count_hint: 0,
buffer: vec![0; capacity],
Expand Down Expand Up @@ -176,6 +195,8 @@ pub fn read<T: Read>(
num_infos,
parsed_floats,
line_num_floats,
collation_key_buffer,
collation_key_ends,
mut token_buffer,
mut line_count_hint,
mut buffer,
Expand Down Expand Up @@ -214,6 +235,8 @@ pub fn read<T: Read>(
num_infos,
parsed_floats,
line_num_floats,
collation_key_buffer,
collation_key_ends,
};
parse_lines(
read,
Expand Down Expand Up @@ -253,6 +276,8 @@ fn parse_lines<'a>(
assert!(line_data.num_infos.is_empty());
assert!(line_data.parsed_floats.is_empty());
assert!(line_data.line_num_floats.is_empty());
assert!(line_data.collation_key_buffer.is_empty());
assert!(line_data.collation_key_ends.is_empty());
token_buffer.clear();
const SMALL_CHUNK_BYTES: usize = 64 * 1024;
let mut estimated = (*line_count_hint).max(1);
Expand Down
26 changes: 25 additions & 1 deletion src/uu/sort/src/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ use uucore::error::{FromIo, strip_errno};
use uucore::error::{UError, UResult, USimpleError, UUsageError};
use uucore::extendedbigdecimal::ExtendedBigDecimal;
#[cfg(feature = "i18n-collator")]
use uucore::i18n::collator::locale_cmp;
use uucore::i18n::collator::{compute_sort_key_utf8, locale_cmp};
use uucore::i18n::decimal::locale_decimal_separator;
use uucore::line_ending::LineEnding;
use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError};
Expand Down Expand Up @@ -324,6 +324,7 @@ struct Precomputed {
floats_per_line: usize,
selections_per_line: usize,
fast_lexicographic: bool,
fast_locale_collation: bool,
fast_ascii_insensitive: bool,
tokenize_blank_thousands_sep: bool,
tokenize_allow_unit_after_blank: bool,
Expand Down Expand Up @@ -387,6 +388,8 @@ impl GlobalSettings {

self.precomputed.fast_lexicographic =
!disable_fast_lexicographic && self.can_use_fast_lexicographic();
self.precomputed.fast_locale_collation =
disable_fast_lexicographic && self.can_use_fast_lexicographic();
self.precomputed.fast_ascii_insensitive = self.can_use_fast_ascii_insensitive();
}

Expand Down Expand Up @@ -632,6 +635,15 @@ impl<'a> Line<'a> {
token_buffer: &mut Vec<Field>,
settings: &GlobalSettings,
) -> Self {
#[cfg(feature = "i18n-collator")]
if settings.precomputed.fast_locale_collation {
compute_sort_key_utf8(line, &mut line_data.collation_key_buffer);
line_data
.collation_key_ends
.push(line_data.collation_key_buffer.len());
return Self { line, index };
}

let needs_line_data = settings.precomputed.needs_tokens
|| settings.precomputed.selections_per_line > 0
|| settings.precomputed.num_infos_per_line > 0
Expand Down Expand Up @@ -2614,6 +2626,18 @@ fn compare_by<'a>(
};
}

#[cfg(feature = "i18n-collator")]
if global_settings.precomputed.fast_locale_collation {
let a_key = a_line_data.collation_key(a.index);
let b_key = b_line_data.collation_key(b.index);
let cmp = a_key.cmp(b_key);
return if global_settings.reverse {
cmp.reverse()
} else {
cmp
};
}

if global_settings.precomputed.fast_ascii_insensitive {
let cmp = ascii_case_insensitive_cmp(a.line, b.line);
if cmp != Ordering::Equal || a.line == b.line {
Expand Down
11 changes: 11 additions & 0 deletions src/uucore/src/lib/features/i18n/collator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,17 @@ pub fn init_locale_collation() -> bool {
try_init_collator(opts)
}

/// Appends the collation sort key of `input` to the end of `buf`.
///
/// Computing the key once per line lets the sort compare lines by plain byte
/// comparison of their keys instead of invoking the collator on every comparison.
///
/// # Panics
///
/// Panics if `COLLATOR` has not been set yet, or if writing the sort key fails.
pub fn compute_sort_key_utf8(input: &[u8], buf: &mut Vec<u8>) {
    let collator = COLLATOR.get();
    let collator = collator.expect("compute_sort_key_utf8 called before collator initialization");
    let written = collator.write_sort_key_utf8_to(input, buf);
    written.expect("ICU write_sort_key_utf8_to failed");
}

/// Compare both strings with regard to the current locale.
pub fn locale_cmp(left: &[u8], right: &[u8]) -> Ordering {
// If the detected locale is 'C', just do byte-wise comparison
Expand Down
Loading