From 2ee80dbd5bf65697b5b82cf072f4b1ea261897fa Mon Sep 17 00:00:00 2001
From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com>
Date: Thu, 18 Sep 2025 23:16:31 +0300
Subject: [PATCH 1/2] perf: improve GenericByteBuilder::append_array to use
 SIMD for extending the offsets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Changing from:

```rust
let mut intermediate = Vec::<T::Offset>::with_capacity(offsets.len() - 1);

for &offset in &offsets[1..] {
    intermediate.push(offset + shift)
}
```

to:

```rust
let mut intermediate = vec![T::Offset::zero(); offsets.len() - 1];

for (index, &offset) in offsets[1..].iter().enumerate() {
    intermediate[index] = offset + shift;
}
```

improves the performance of concatenating byte arrays by 8% to 50% on my local machine. (The criterion runs below appear to have been measured with the new code as the baseline, which is why the comparison shows up as a positive percentage and criterion labels it a regression.)

```bash
concat str 1024         time:   [7.2598 µs 7.2772 µs 7.2957 µs]
                        change: [+12.545% +13.070% +13.571%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 6 outliers among 100 measurements (6.00%)
  4 (4.00%) high mild
  2 (2.00%) high severe

concat str nulls 1024   time:   [4.6791 µs 4.6895 µs 4.7010 µs]
                        change: [+23.206% +23.792% +24.425%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 13 outliers among 100 measurements (13.00%)
  5 (5.00%) high mild
  8 (8.00%) high severe

concat 1024 arrays str 4
                        time:   [45.018 µs 45.213 µs 45.442 µs]
                        change: [+6.4195% +8.7377% +11.279%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 13 outliers among 100 measurements (13.00%)
  6 (6.00%) high mild
  7 (7.00%) high severe

concat str 8192 over 100 arrays
                        time:   [3.7561 ms 3.7814 ms 3.8086 ms]
                        change: [+25.394% +26.833% +28.370%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 4 outliers among 100 measurements (4.00%)
  4 (4.00%) high mild

concat str nulls 8192 over 100 arrays
                        time:   [2.3144 ms 2.3269 ms 2.3403 ms]
                        change: [+51.533% +52.826% +54.109%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 8 outliers among 100 measurements (8.00%)
  6 (6.00%) high mild
  2 (2.00%) high severe
```

When looking at the assembly

> Used rustc 1.89.0 and the compiler flags `-C opt-level=2 -C target-feature=+avx2 -C codegen-units=1` on [Godbolt](https://godbolt.org/)

you can see that for the old code:

```rust
let mut intermediate = Vec::<T::Offset>::with_capacity(offsets.len() - 1);

for &offset in &offsets[1..] {
    intermediate.push(offset + shift)
}
```

the assembly for the loop is:

```asm
.LBB3_22:
        mov     rbx, qword ptr [r13 + 8*rbp + 8]
        add     rbx, r15
        cmp     rbp, qword ptr [rsp]
        jne     .LBB3_25
        mov     rdi, rsp
        lea     rsi, [rip + .Lanon.da681cffc384a5add117668a344b291b.6]
        call    qword ptr [rip + alloc::raw_vec::RawVec::grow_one::ha1b398ade64b0727@GOTPCREL]
        mov     r14, qword ptr [rsp + 8]
        jmp     .LBB3_25
.LBB3_25:
        mov     qword ptr [r14 + 8*rbp], rbx
        inc     rbp
        mov     qword ptr [rsp + 16], rbp
        add     r12, -8
        je      .LBB3_9
```

and for the new code:

```rust
let mut intermediate = vec![T::Offset::zero(); offsets.len() - 1];

for (index, &offset) in offsets[1..].iter().enumerate() {
    intermediate[index] = offset + shift;
}
```

the assembly for the loop is:

```asm
.LBB2_21:
        vpaddq  ymm1, ymm0, ymmword ptr [r12 + 8*rdx + 8]
        vpaddq  ymm2, ymm0, ymmword ptr [r12 + 8*rdx + 40]
        vpaddq  ymm3, ymm0, ymmword ptr [r12 + 8*rdx + 72]
        vpaddq  ymm4, ymm0, ymmword ptr [r12 + 8*rdx + 104]
        vmovdqu ymmword ptr [rbx + 8*rdx], ymm1
        vmovdqu ymmword ptr [rbx + 8*rdx + 32], ymm2
        vmovdqu ymmword ptr [rbx + 8*rdx + 64], ymm3
        vmovdqu ymmword ptr [rbx + 8*rdx + 96], ymm4
        add     rdx, 16
        cmp     rax, rdx
        jne     .LBB2_21
```

which uses SIMD instructions. The per-iteration capacity check (`cmp`/`jne`) and the possible call to `RawVec::grow_one` in the push-based loop appear to be what blocks auto-vectorization; with a pre-sized buffer the loop body has no such branch, so the compiler emits packed `vpaddq` additions instead.
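For reference, the sketch below shows one way numbers like `concat str 1024` could be reproduced with criterion; the bench name, data shape, and crate wiring (`arrow-array`, `arrow-select`, and `criterion` as a dev-dependency with a `[[bench]]` entry) are assumptions for illustration, not arrow's actual benchmark file.

```rust
use std::sync::Arc;

use arrow_array::{ArrayRef, StringArray};
use arrow_select::concat::concat;
use criterion::{criterion_group, criterion_main, Criterion};

fn bench_concat_str(c: &mut Criterion) {
    // Two 1024-element string arrays; concat's byte-array path is what the
    // append_array change above speeds up.
    let a: ArrayRef = Arc::new(StringArray::from_iter_values(
        (0..1024).map(|i| format!("value-{i}")),
    ));
    let b: ArrayRef = Arc::new(StringArray::from_iter_values(
        (0..1024).map(|i| format!("other-{i}")),
    ));

    c.bench_function("concat str 1024", |bench| {
        bench.iter(|| concat(&[a.as_ref(), b.as_ref()]).unwrap())
    });
}

criterion_group!(benches, bench_concat_str);
criterion_main!(benches);
```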
The code that I wrote on Godbolt:

For the old code:

```rust
#[inline(always)]
fn extend_offsets<T: std::ops::Add<Output = T> + Copy + Default>(
    output: &mut Vec<T>,
    offsets: &[T],
    next_offset: T,
) {
    assert_ne!(offsets.len(), 0);

    let shift: T = next_offset + offsets[0];

    let mut intermediate = Vec::<T>::with_capacity(offsets.len() - 1);

    // Make it easier to find the loop in the assembly
    let mut dummy = 0u64;
    unsafe {
        std::arch::asm!(
            "# VECTORIZED_START
            mov {}, 1",
            out(reg) dummy,
            options(nostack)
        );
    }

    for &offset in &offsets[1..] {
        intermediate.push(offset + shift)
    }

    // Make it easier to find the loop in the assembly
    unsafe {
        std::arch::asm!(
            "# VECTORIZED_END
            mov {}, 2",
            out(reg) dummy,
            options(nostack)
        );
    }

    std::hint::black_box(dummy);

    output.extend_from_slice(&intermediate);
}

#[no_mangle]
pub fn extend_offsets_usize(output: &mut Vec<usize>, offsets: &[usize], next_offset: usize) {
    extend_offsets(output, offsets, next_offset);
}
```

And for the new code:

```rust
#[inline(always)]
fn extend_offsets<T: std::ops::Add<Output = T> + Copy + Default>(
    output: &mut Vec<T>,
    offsets: &[T],
    next_offset: T,
) {
    assert_ne!(offsets.len(), 0);

    let shift: T = next_offset + offsets[0];

    let mut intermediate = vec![T::default(); offsets.len() - 1];

    // Make it easier to find the loop in the assembly
    let mut dummy = 0u64;
    unsafe {
        std::arch::asm!(
            "# VECTORIZED_START
            mov {}, 1",
            out(reg) dummy,
            options(nostack)
        );
    }

    for (index, &offset) in offsets[1..].iter().enumerate() {
        intermediate[index] = offset + shift
    }

    // Make it easier to find the loop in the assembly
    unsafe {
        std::arch::asm!(
            "# VECTORIZED_END
            mov {}, 2",
            out(reg) dummy,
            options(nostack)
        );
    }

    std::hint::black_box(dummy);

    output.extend_from_slice(&intermediate);
}

#[no_mangle]
pub fn extend_offsets_usize(output: &mut Vec<usize>, offsets: &[usize], next_offset: usize) {
    extend_offsets(output, offsets, next_offset);
}
```
---
 arrow-array/src/builder/generic_bytes_builder.rs | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs
index c2c743e3ab27..c2eeecfcab73 100644
--- a/arrow-array/src/builder/generic_bytes_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_builder.rs
@@ -157,13 +157,15 @@ impl<T: ByteArrayType> GenericByteBuilder<T> {
         // Shifting all the offsets
         let shift: T::Offset = self.next_offset() - offsets[0];
 
-        // Creating intermediate offsets instead of pushing each offset is faster
-        // (even if we make MutableBuffer to avoid updating length on each push
-        // and reserve the necessary capacity, it's still slower)
-        let mut intermediate = Vec::with_capacity(offsets.len() - 1);
+        use num::Zero;
 
-        for &offset in &offsets[1..] {
-            intermediate.push(offset + shift)
+        // Creating intermediate offsets instead of pushing each offsets is faster.
+        //
+        // Not using Vec::with_capacity and push as it will not use SIMD for some reason.
+        let mut intermediate = vec![T::Offset::zero(); offsets.len() - 1];
+
+        for (index, &offset) in offsets[1..].iter().enumerate() {
+            intermediate[index] = offset + shift;
         }
 
         self.offsets_builder.extend_from_slice(&intermediate);

From fd5a012acdb486547d765d8b835bce4787a28840 Mon Sep 17 00:00:00 2001
From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com>
Date: Sun, 21 Sep 2025 02:35:46 +0300
Subject: [PATCH 2/2] change to iterator that will also use SIMD without
 intermediate buffer

---
 arrow-array/src/builder/generic_bytes_builder.rs | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs
index c2eeecfcab73..ffaf9ff351da 100644
--- a/arrow-array/src/builder/generic_bytes_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_builder.rs
@@ -157,18 +157,8 @@ impl<T: ByteArrayType> GenericByteBuilder<T> {
         // Shifting all the offsets
         let shift: T::Offset = self.next_offset() - offsets[0];
 
-        use num::Zero;
-
-        // Creating intermediate offsets instead of pushing each offsets is faster.
-        //
-        // Not using Vec::with_capacity and push as it will not use SIMD for some reason.
-        let mut intermediate = vec![T::Offset::zero(); offsets.len() - 1];
-
-        for (index, &offset) in offsets[1..].iter().enumerate() {
-            intermediate[index] = offset + shift;
-        }
-
-        self.offsets_builder.extend_from_slice(&intermediate);
+        self.offsets_builder
+            .extend(offsets[1..].iter().map(|&offset| offset + shift));
     }
 
     // Append underlying values, starting from the first offset and ending at the last offset
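The iterator-based form in the second patch is not arrow-specific. The sketch below is a minimal std-only illustration, assuming a hypothetical `shift_offsets` helper and plain `i64` offsets rather than the arrow-array API, of extending a buffer straight from a mapped slice iterator so the shift loop can auto-vectorize without an intermediate allocation.

```rust
// Std-only sketch of the pattern used in the second patch: no intermediate Vec,
// the shifted offsets are written directly into the output buffer.
fn shift_offsets(output: &mut Vec<i64>, offsets: &[i64], shift: i64) {
    // The iterator over a slice has an exact length, so `extend` reserves the
    // space up front; the loop body then contains no capacity check or
    // reallocation, leaving the compiler free to auto-vectorize the addition.
    output.extend(offsets[1..].iter().map(|&offset| offset + shift));
}

fn main() {
    // Offsets of an incoming array, and a builder whose offsets currently end at 5.
    let incoming = vec![0i64, 3, 7, 12];
    let mut builder_offsets = vec![0i64, 5];
    let shift = *builder_offsets.last().unwrap() - incoming[0];

    shift_offsets(&mut builder_offsets, &incoming, shift);
    assert_eq!(builder_offsets, vec![0, 5, 8, 12, 17]);
    println!("{builder_offsets:?}");
}
```

This mirrors what the second patch does with `offsets_builder.extend(...)`, just without arrow's builder machinery.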