diff --git a/datafusion/physical-plan/src/sorts/cursor.rs b/datafusion/physical-plan/src/sorts/cursor.rs index 8ab603e04961e..54dc2414e4f08 100644 --- a/datafusion/physical-plan/src/sorts/cursor.rs +++ b/datafusion/physical-plan/src/sorts/cursor.rs @@ -289,120 +289,6 @@ impl CursorArray for StringViewArray { } } -/// Todo use arrow-rs side api after: and released -/// Builds a 128-bit composite key for an inline value: -/// -/// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting). -/// - Low 32 bits: the length in big-endian byte order, acting as a tiebreaker so shorter strings -/// (or those with fewer meaningful bytes) always numerically sort before longer ones. -/// -/// This function extracts the length and the 12-byte inline string data from the raw -/// little-endian `u128` representation, converts them to big-endian ordering, and packs them -/// into a single `u128` value suitable for fast, branchless comparisons. -/// -/// # Why include length? -/// -/// A pure 96-bit content comparison can’t distinguish between two values whose inline bytes -/// compare equal—either because one is a true prefix of the other or because zero-padding -/// hides extra bytes. By tucking the 32-bit length into the lower bits, a single `u128` compare -/// handles both content and length in one go. -/// -/// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes) -/// -/// | String | Bytes 0–4 (length LE) | Bytes 4–16 (data + padding) | -/// |------------|-----------------------|---------------------------------| -/// | `"bar"` | `03 00 00 00` | `62 61 72` + 9 × `00` | -/// | `"bar\0"`| `04 00 00 00` | `62 61 72 00` + 8 × `00` | -/// -/// Both inline parts become `62 61 72 00…00`, so they tie on content. The length field -/// then differentiates: -/// -/// ```text -/// key("bar") = 0x0000000000000000000062617200000003 -/// key("bar\0") = 0x0000000000000000000062617200000004 -/// ⇒ key("bar") < key("bar\0") -/// ``` -/// # Inlining and Endianness -/// -/// - We start by calling `.to_le_bytes()` on the `raw` `u128`, because Rust’s native in‑memory -/// representation is little‑endian on x86/ARM. -/// - We extract the low 32 bits numerically (`raw as u32`)—this step is endianness‑free. -/// - We copy the 12 bytes of inline data (original order) into `buf[0..12]`. -/// - We serialize `length` as big‑endian into `buf[12..16]`. -/// - Finally, `u128::from_be_bytes(buf)` treats `buf[0]` as the most significant byte -/// and `buf[15]` as the least significant, producing a `u128` whose integer value -/// directly encodes “inline data then length” in big‑endian form. -/// -/// This ensures that a simple `u128` comparison is equivalent to the desired -/// lexicographical comparison of the inline bytes followed by length. -#[inline(always)] -pub fn inline_key_fast(raw: u128) -> u128 { - // 1. Decompose `raw` into little‑endian bytes: - // - raw_bytes[0..4] = length in LE - // - raw_bytes[4..16] = inline string data - let raw_bytes = raw.to_le_bytes(); - - // 2. Numerically truncate to get the low 32‑bit length (endianness‑free). - let length = raw as u32; - - // 3. Build a 16‑byte buffer in big‑endian order: - // - buf[0..12] = inline string bytes (in original order) - // - buf[12..16] = length.to_be_bytes() (BE) - let mut buf = [0u8; 16]; - buf[0..12].copy_from_slice(&raw_bytes[4..16]); // inline data - - // Why convert length to big-endian for comparison? - // - // Rust (on most platforms) stores integers in little-endian format, - // meaning the least significant byte is at the lowest memory address. - // For example, an u32 value like 0x22345677 is stored in memory as: - // - // [0x77, 0x56, 0x34, 0x22] // little-endian layout - // ^ ^ ^ ^ - // LSB ↑↑↑ MSB - // - // This layout is efficient for arithmetic but *not* suitable for - // lexicographic (dictionary-style) comparison of byte arrays. - // - // To compare values by byte order—e.g., for sorted keys or binary trees— - // we must convert them to **big-endian**, where: - // - // - The most significant byte (MSB) comes first (index 0) - // - The least significant byte (LSB) comes last (index N-1) - // - // In big-endian, the same u32 = 0x22345677 would be represented as: - // - // [0x22, 0x34, 0x56, 0x77] - // - // This ordering aligns with natural string/byte sorting, so calling - // `.to_be_bytes()` allows us to construct - // keys where standard numeric comparison (e.g., `<`, `>`) behaves - // like lexicographic byte comparison. - buf[12..16].copy_from_slice(&length.to_be_bytes()); // length in BE - - // 4. Deserialize the buffer as a big‑endian u128: - // buf[0] is MSB, buf[15] is LSB. - // Details: - // Note on endianness and layout: - // - // Although `buf[0]` is stored at the lowest memory address, - // calling `u128::from_be_bytes(buf)` interprets it as the **most significant byte (MSB)**, - // and `buf[15]` as the **least significant byte (LSB)**. - // - // This is the core principle of **big-endian decoding**: - // - Byte at index 0 maps to bits 127..120 (highest) - // - Byte at index 1 maps to bits 119..112 - // - ... - // - Byte at index 15 maps to bits 7..0 (lowest) - // - // So even though memory layout goes from low to high (left to right), - // big-endian treats the **first byte** as highest in value. - // - // This guarantees that comparing two `u128` keys is equivalent to lexicographically - // comparing the original inline bytes, followed by length. - u128::from_be_bytes(buf) -} - impl CursorValues for StringViewArray { fn len(&self) -> usize { self.views().len() @@ -460,7 +346,8 @@ impl CursorValues for StringViewArray { if l.data_buffers().is_empty() && r.data_buffers().is_empty() { let l_view = unsafe { l.views().get_unchecked(l_idx) }; let r_view = unsafe { r.views().get_unchecked(r_idx) }; - return inline_key_fast(*l_view).cmp(&inline_key_fast(*r_view)); + return StringViewArray::inline_key_fast(*l_view) + .cmp(&StringViewArray::inline_key_fast(*r_view)); } unsafe { GenericByteViewArray::compare_unchecked(l, l_idx, r, r_idx) } @@ -555,7 +442,6 @@ impl CursorValues for ArrayValues { #[cfg(test)] mod tests { - use arrow::array::GenericBinaryArray; use datafusion_execution::memory_pool::{ GreedyMemoryPool, MemoryConsumer, MemoryPool, }; @@ -720,100 +606,4 @@ mod tests { b.advance(); assert_eq!(a.cmp(&b), Ordering::Less); } - - /// Integration tests for `inline_key_fast` covering: - /// - /// 1. Monotonic ordering across increasing lengths and lexical variations. - /// 2. Cross-check against `GenericBinaryArray` comparison to ensure semantic equivalence. - /// - /// This also includes a specific test for the “bar” vs. “bar\0” case, demonstrating why - /// the length field is required even when all inline bytes fit in 12 bytes. - /// - /// The test includes strings that verify correct byte order (prevent reversal bugs), - /// and length-based tie-breaking in the composite key. - /// - /// The test confirms that `inline_key_fast` produces keys which sort consistently - /// with the expected lexicographical order of the raw byte arrays. - #[test] - fn test_inline_key_fast_various_lengths_and_lexical() { - /// Helper to create a raw u128 value representing an inline ByteView: - /// - `length`: number of meaningful bytes (must be ≤ 12) - /// - `data`: the actual inline data bytes - /// - /// The first 4 bytes encode length in little-endian, - /// the following 12 bytes contain the inline string data (unpadded). - fn make_raw_inline(length: u32, data: &[u8]) -> u128 { - assert!(length as usize <= 12, "Inline length must be ≤ 12"); - assert!( - data.len() == length as usize, - "Data length must match `length`" - ); - - let mut raw_bytes = [0u8; 16]; - raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // length stored little-endian - raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data - u128::from_le_bytes(raw_bytes) - } - - // Test inputs: various lengths and lexical orders, - // plus special cases for byte order and length tie-breaking - let test_inputs: Vec<&[u8]> = vec![ - b"a", - b"aa", - b"aaa", - b"aab", - b"abcd", - b"abcde", - b"abcdef", - b"abcdefg", - b"abcdefgh", - b"abcdefghi", - b"abcdefghij", - b"abcdefghijk", - b"abcdefghijkl", - // Tests for byte-order reversal bug: - // Without the fix, "backend one" would compare as "eno dnekcab", - // causing incorrect sort order relative to "backend two". - b"backend one", - b"backend two", - // Tests length-tiebreaker logic: - // "bar" (3 bytes) and "bar\0" (4 bytes) have identical inline data, - // so only the length differentiates their ordering. - b"bar", - b"bar\0", - // Additional lexical and length tie-breaking cases with same prefix, in correct lex order: - b"than12Byt", - b"than12Bytes", - b"than12Bytes\0", - b"than12Bytesx", - b"than12Bytex", - b"than12Bytez", - // Additional lexical tests - b"xyy", - b"xyz", - b"xza", - ]; - - // Create a GenericBinaryArray for cross-comparison of lex order - let array: GenericBinaryArray = GenericBinaryArray::from( - test_inputs.iter().map(|s| Some(*s)).collect::>(), - ); - - for i in 0..array.len() - 1 { - let v1 = array.value(i); - let v2 = array.value(i + 1); - - // Assert the array's natural lexical ordering is correct - assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}"); - - // Assert the keys produced by inline_key_fast reflect the same ordering - let key1 = inline_key_fast(make_raw_inline(v1.len() as u32, v1)); - let key2 = inline_key_fast(make_raw_inline(v2.len() as u32, v2)); - - assert!( - key1 < key2, - "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}", - ); - } - } }