/// A container specialised for holding zero or one element without keeping a `Vec`.
///
/// `Zero` and `One` store their contents directly in the enum; `Many` wraps a
/// regular `Vec` and is what `from_vec` produces for two or more elements.
#[derive(Debug, Clone)]
enum SmallContainer<T> {
    /// No elements.
    Zero,
    /// Exactly one element, stored inline.
    One(T),
    /// Elements backed by a `Vec`.
    Many(Vec<T>),
}

impl<T> SmallContainer<T> {
    /// Builds a container from `v`, choosing the variant from `v.len()`:
    /// `0 => Zero`, `1 => One`, anything else `=> Many`.
    fn from_vec(mut v: Vec<T>) -> Self {
        match v.len() {
            0 => Self::Zero,
            1 => Self::One(v.pop().expect("a Vec of length 1 always pops an element")),
            _ => Self::Many(v),
        }
    }

    /// Returns a reference to the element at `index`.
    ///
    /// # Panics
    ///
    /// Panics with `"index out of bounds"` when `index` is not a valid position.
    fn get(&self, index: usize) -> &T {
        // Going through the slice view keeps the bounds logic in one place for
        // all three variants.
        match self.as_slice().get(index) {
            Some(item) => item,
            None => panic!("index out of bounds"),
        }
    }

    /// Returns the contents as a slice, regardless of the variant.
    fn as_slice(&self) -> &[T] {
        match self {
            Self::Zero => &[],
            Self::One(one) => std::slice::from_ref(one),
            Self::Many(v) => v.as_slice(),
        }
    }

    /// Returns an iterator over references to the elements, in order.
    fn iter(&self) -> impl Iterator<Item = &T> {
        self.as_slice().iter()
    }
}

impl<T> Index<usize> for SmallContainer<T> {
    type Output = T;

    /// Same as [`SmallContainer::get`]; panics when `index` is out of bounds.
    fn index(&self, index: usize) -> &Self::Output {
        self.get(index)
    }
}

impl<T> Default for SmallContainer<T> {
    /// The default container is empty (`Zero`).
    fn default() -> Self {
        Self::Zero
    }
}
@@ -258,8 +310,33 @@ impl ArrayData { len, null_count, offset, - buffers, - child_data, + buffers: SmallContainer::from_vec(buffers), + child_data: SmallContainer::from_vec(child_data), + null_bitmap, + } + } + + pub fn new_smallvec( + data_type: DataType, + len: usize, + null_count: Option, + null_bit_buffer: Option, + offset: usize, + buffer: Buffer, + child_data: Option, + ) -> Self { + let null_count = match null_count { + None => count_nulls(null_bit_buffer.as_ref(), offset, len), + Some(null_count) => null_count, + }; + let null_bitmap = null_bit_buffer.map(Bitmap::from); + Self { + data_type, + len, + null_count, + offset, + buffers: SmallContainer::One(buffer), + child_data: child_data.map(SmallContainer::One).unwrap_or_default(), null_bitmap, } } @@ -278,12 +355,12 @@ impl ArrayData { /// Returns a slice of buffers for this array data pub fn buffers(&self) -> &[Buffer] { - &self.buffers[..] + &self.buffers.as_slice() } /// Returns a slice of children data arrays pub fn child_data(&self) -> &[ArrayDataRef] { - &self.child_data[..] + &self.child_data.as_slice() } /// Returns whether the element at index `i` is null @@ -340,15 +417,16 @@ impl ArrayData { /// Returns the total number of bytes of memory occupied by the buffers owned by this [ArrayData]. 
pub fn get_buffer_memory_size(&self) -> usize { let mut size = 0; - for buffer in &self.buffers { + for buffer in self.buffers.iter() { size += buffer.capacity(); } if let Some(bitmap) = &self.null_bitmap { size += bitmap.get_buffer_memory_size() } - for child in &self.child_data { + for child in self.child_data.iter() { size += child.get_buffer_memory_size(); } + size } @@ -362,14 +440,14 @@ impl ArrayData { - mem::size_of_val(&self.child_data); // Calculate rest of the fields top down which contain actual data - for buffer in &self.buffers { + for buffer in self.buffers.iter() { size += mem::size_of_val(&buffer); size += buffer.capacity(); } if let Some(bitmap) = &self.null_bitmap { size += bitmap.get_array_memory_size() } - for child in &self.child_data { + for child in self.child_data.iter() { size += child.get_array_memory_size(); }