Skip to content
Merged
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
e0728ed
Perf: implement fast gc for string view
zhuqi-lucas Jul 4, 2025
02f2870
Merge remote-tracking branch 'upstream/main' into fast_gc
zhuqi-lucas Jul 5, 2025
5815519
polish
zhuqi-lucas Jul 5, 2025
f5488cc
format
zhuqi-lucas Jul 5, 2025
4c3c7ee
polish code
zhuqi-lucas Jul 5, 2025
1601ab6
Address comments
zhuqi-lucas Jul 7, 2025
a809c98
remove unused code
zhuqi-lucas Jul 7, 2025
2bb5b93
Merge remote-tracking branch 'upstream/main' into fast_gc
zhuqi-lucas Jul 7, 2025
5b5a05c
don't use value_unchecked which is duplicating check len, etc
zhuqi-lucas Jul 7, 2025
3cb9431
polish code
zhuqi-lucas Jul 7, 2025
de6a199
fix comments
zhuqi-lucas Jul 7, 2025
55fb826
Merge remote-tracking branch 'upstream/main' into fast_gc
zhuqi-lucas Jul 7, 2025
d4b7243
benchmark: Add StringViewArray gc benchmark with not null cases
zhuqi-lucas Jul 7, 2025
a774986
fmt and comments
zhuqi-lucas Jul 7, 2025
75eb0d0
Merge branch 'add_not_null_gc_benchmark' into fast_gc
zhuqi-lucas Jul 7, 2025
fbd47d6
Use unchecked
zhuqi-lucas Jul 7, 2025
b214ab7
fmt
zhuqi-lucas Jul 7, 2025
adc2b1f
address comments
zhuqi-lucas Jul 8, 2025
5310dd1
Merge branch 'add_not_null_gc_benchmark' into fast_gc
zhuqi-lucas Jul 8, 2025
1a01926
optimize null check
zhuqi-lucas Jul 8, 2025
39cf32c
fix clippy
zhuqi-lucas Jul 8, 2025
4786066
address comments and polish code
zhuqi-lucas Jul 8, 2025
3350638
address comments
zhuqi-lucas Jul 8, 2025
7a08ce0
Merge remote-tracking branch 'upstream/main' into fast_gc
zhuqi-lucas Jul 8, 2025
32f27cb
add rich tests and add fast path for inline data and fix empty buffer
zhuqi-lucas Jul 8, 2025
adb8605
fmt
zhuqi-lucas Jul 8, 2025
219aabf
Address comments
zhuqi-lucas Jul 10, 2025
c16d236
Don't need null caculation
zhuqi-lucas Jul 10, 2025
6e6387d
Merge remote-tracking branch 'upstream/main' into fast_gc
zhuqi-lucas Jul 10, 2025
135cbeb
fast path for no data buffer
zhuqi-lucas Jul 10, 2025
625c421
Merge remote-tracking branch 'apache/main' into fast_gc
alamb Jul 10, 2025
2097747
Update arrow-array/src/array/byte_view_array.rs
zhuqi-lucas Jul 11, 2025
c1a1065
Update arrow-array/src/array/byte_view_array.rs
zhuqi-lucas Jul 11, 2025
797b63e
address comments
zhuqi-lucas Jul 11, 2025
5e30749
fmt
zhuqi-lucas Jul 11, 2025
0110ef9
Merge remote-tracking branch 'upstream/main' into fast_gc
zhuqi-lucas Jul 11, 2025
214f844
Merge remote-tracking branch 'apache/main' into fast_gc
alamb Jul 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 209 additions & 5 deletions arrow-array/src/array/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -473,13 +473,89 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
/// Note: this function does not attempt to canonicalize / deduplicate values. For this
/// feature see [`GenericByteViewBuilder::with_deduplicate_strings`].
pub fn gc(&self) -> Self {
let mut builder = GenericByteViewBuilder::<T>::with_capacity(self.len());
// 1) Read basic properties once
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are some good tricks here that maybe we can apply to coalese -- in

fn append_views_and_update_buffer_index(&mut self, views: &[u128], buffers: &[Buffer]) {

The only difference is the ability to extend exsiting views/buffers rather than allocate entirely new buffers 🤔

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point @alamb , add the sub-task, i will start the investigation after this PR, thanks!

Improve the performance for coalese with StringView

In epic:
#7802

let len = self.len(); // number of elements
let nulls = self.nulls().cloned(); // reuse & clone existing null bitmap

// 1.5) Fast path: if there are no buffers, just reuse original views and no data blocks
if self.data_buffers().is_empty() {
return unsafe {
GenericByteViewArray::new_unchecked(
self.views().clone(),
vec![], // empty data blocks
nulls,
)
};
}

for v in self.iter() {
builder.append_option(v);
// 2) Calculate total size of all non-inline data and detect if any exists
let total_large = self.total_buffer_bytes_used();

// 2.5) Fast path: if there is no non-inline data, avoid buffer allocation & processing
if total_large == 0 {
// Views are inline-only or all null; just reuse original views and no data blocks
return unsafe {
GenericByteViewArray::new_unchecked(
self.views().clone(),
vec![], // empty data blocks
nulls,
)
};
}

builder.finish()
// 3) Allocate exactly capacity for all non-inline data
let mut data_buf = Vec::with_capacity(total_large);

// 4) Iterate over views and process each inline/non-inline view
let views_buf: Vec<u128> = (0..len)
.map(|i| unsafe { self.copy_view_to_buffer(i, &mut data_buf) })
.collect();

// 5) Wrap up buffers
let data_block = Buffer::from_vec(data_buf);
let views_scalar = ScalarBuffer::from(views_buf);
let data_blocks = vec![data_block];

// SAFETY: views_scalar, data_blocks, and nulls are correctly aligned and sized
unsafe { GenericByteViewArray::new_unchecked(views_scalar, data_blocks, nulls) }
}

/// Copy the i‑th view into `data_buf` if it refers to an out‑of‑line buffer.
///
/// # Safety
///
/// - `i < self.len()`.
/// - Every element in `self.views()` must currently refer to a valid slice
/// inside one of `self.buffers`.
/// - `data_buf` must be ready to have additional bytes appended.
/// - After this call, the returned view will have its
/// `buffer_index` reset to `0` and its `offset` updated so that it points
/// into the bytes just appended at the end of `data_buf`.
#[inline(always)]
unsafe fn copy_view_to_buffer(&self, i: usize, data_buf: &mut Vec<u8>) -> u128 {
// SAFETY: `i < self.len()` ensures this is in‑bounds.
let raw_view = *self.views().get_unchecked(i);
let mut bv = ByteView::from(raw_view);

// Inline‑small views stay as‑is.
if bv.length <= MAX_INLINE_VIEW_LEN {
raw_view
} else {
// SAFETY: `bv.buffer_index` and `bv.offset..bv.offset+bv.length`
// must both lie within valid ranges for `self.buffers`.
let buffer = self.buffers.get_unchecked(bv.buffer_index as usize);
let start = bv.offset as usize;
let end = start + bv.length as usize;
let slice = buffer.get_unchecked(start..end);

// Copy out‑of‑line data into our single “0” buffer.
let new_offset = data_buf.len() as u32;
data_buf.extend_from_slice(slice);

bv.buffer_index = 0;
bv.offset = new_offset;
bv.into()
}
}

/// Returns the total number of bytes used by all non inlined views in all
Expand Down Expand Up @@ -998,7 +1074,11 @@ mod tests {
Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray,
};
use arrow_buffer::{Buffer, ScalarBuffer};
use arrow_data::ByteView;
use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
use rand::prelude::StdRng;
use rand::{Rng, SeedableRng};

const BLOCK_SIZE: u32 = 8;

#[test]
fn try_new_string() {
Expand Down Expand Up @@ -1188,6 +1268,130 @@ mod tests {
check_gc(&array.slice(3, 1));
}

/// 1) Empty array: no elements, expect gc to return empty with no data buffers
#[test]
fn test_gc_empty_array() {
let array = StringViewBuilder::new()
.with_fixed_block_size(BLOCK_SIZE)
.finish();
let gced = array.gc();
// length and null count remain zero
assert_eq!(gced.len(), 0);
assert_eq!(gced.null_count(), 0);
// no underlying data buffers should be allocated
assert!(
gced.data_buffers().is_empty(),
"Expected no data buffers for empty array"
);
}

/// 2) All inline values (<= INLINE_LEN): capacity-only data buffer, same values
#[test]
fn test_gc_all_inline() {
let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
// append many short strings, each exactly INLINE_LEN long
for _ in 0..100 {
let s = "A".repeat(MAX_INLINE_VIEW_LEN as usize);
builder.append_option(Some(&s));
}
let array = builder.finish();
let gced = array.gc();
// Since all views fit inline, data buffer is empty
assert_eq!(
gced.data_buffers().len(),
0,
"Should have no data buffers for inline values"
);
assert_eq!(gced.len(), 100);
// verify element-wise equality
array.iter().zip(gced.iter()).for_each(|(orig, got)| {
assert_eq!(orig, got, "Inline value mismatch after gc");
});
}

/// 3) All large values (> INLINE_LEN): each must be copied into the new data buffer
#[test]
fn test_gc_all_large() {
let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
let large_str = "X".repeat(MAX_INLINE_VIEW_LEN as usize + 5);
// append multiple large strings
for _ in 0..50 {
builder.append_option(Some(&large_str));
}
let array = builder.finish();
let gced = array.gc();
// New data buffers should be populated (one or more blocks)
assert!(
!gced.data_buffers().is_empty(),
"Expected data buffers for large values"
);
assert_eq!(gced.len(), 50);
// verify that every large string emerges unchanged
array.iter().zip(gced.iter()).for_each(|(orig, got)| {
assert_eq!(orig, got, "Large view mismatch after gc");
});
}

/// 4) All null elements: ensure null bitmap handling path is correct
#[test]
fn test_gc_all_nulls() {
let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
for _ in 0..20 {
builder.append_null();
}
let array = builder.finish();
let gced = array.gc();
// length and null count match
assert_eq!(gced.len(), 20);
assert_eq!(gced.null_count(), 20);
// data buffers remain empty for null-only array
assert!(
gced.data_buffers().is_empty(),
"No data should be stored for nulls"
);
}

/// 5) Random mix of inline, large, and null values with slicing tests
#[test]
fn test_gc_random_mixed_and_slices() {
let mut rng = StdRng::seed_from_u64(42);
let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
// Keep a Vec of original Option<String> for later comparison
let mut original: Vec<Option<String>> = Vec::new();

for _ in 0..200 {
if rng.random_bool(0.1) {
// 10% nulls
builder.append_null();
original.push(None);
} else {
// random length between 0 and twice the inline limit
let len = rng.random_range(0..(MAX_INLINE_VIEW_LEN * 2));
let s: String = "A".repeat(len as usize);
builder.append_option(Some(&s));
original.push(Some(s));
}
}

let array = builder.finish();
// Test multiple slice ranges to ensure offset logic is correct
for (offset, slice_len) in &[(0, 50), (10, 100), (150, 30)] {
let sliced = array.slice(*offset, *slice_len);
let gced = sliced.gc();
// Build expected slice of Option<&str>
let expected: Vec<Option<&str>> = original[*offset..(*offset + *slice_len)]
.iter()
.map(|opt| opt.as_deref())
.collect();

assert_eq!(gced.len(), *slice_len, "Slice length mismatch");
// Compare element-wise
gced.iter().zip(expected.iter()).for_each(|(got, expect)| {
assert_eq!(got, *expect, "Value mismatch in mixed slice after gc");
});
}
}

#[test]
fn test_eq() {
let test_data = [
Expand Down
Loading