Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions rust/arrow/benches/array_slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,23 @@ fn create_array_with_nulls(size: usize) -> ArrayRef {
Arc::new(array)
}

/// Builds the nested (list of dictionary-encoded strings) array used by the
/// slice benchmarks.
///
/// For every index in `0..size`: an even index first appends one string,
/// picked from `["foo", "bar", "baz"]` by `i % 3`, to the values builder and
/// then appends `true` to the list builder; an odd index only appends `false`.
fn create_nested_array(size: usize) -> ArrayRef {
    let keys = Int16Builder::new(size);
    let values = StringBuilder::new(size);
    let mut builder = ListBuilder::new(StringDictionaryBuilder::new(keys, values));
    let strings = &["foo", "bar", "baz"];

    for i in 0..size {
        let is_even = i % 2 == 0;
        if is_even {
            // Only even slots contribute a value before the slot is closed.
            builder
                .values()
                .append(&strings[i % strings.len()])
                .unwrap();
        }
        builder.append(is_even).unwrap();
    }

    Arc::new(builder.finish())
}


fn array_slice_benchmark(c: &mut Criterion) {
let array = create_array_with_nulls(4096);
c.bench_function("array_slice 128", |b| {
Expand All @@ -46,6 +63,17 @@ fn array_slice_benchmark(c: &mut Criterion) {
c.bench_function("array_slice 2048", |b| {
b.iter(|| create_array_slice(&array, 2048))
});

let nested_array = create_nested_array(4096);
c.bench_function("array_slice nested type 128", |b| {
b.iter(|| create_array_slice(&nested_array, 128))
});
c.bench_function("array_slice nested type 512", |b| {
b.iter(|| create_array_slice(&nested_array, 512))
});
c.bench_function("array_slice nested type 2048", |b| {
b.iter(|| create_array_slice(&nested_array, 2048))
});
}

criterion_group!(benches, array_slice_benchmark);
Expand Down
98 changes: 88 additions & 10 deletions rust/arrow/src/array/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ use crate::{
};

use super::equal::equal;
use std::ops::Index;

#[inline]
pub(crate) fn count_nulls(
Expand Down Expand Up @@ -205,6 +206,57 @@ pub(crate) fn into_buffers(
}
}

/// A container that distinguishes between holding no element, exactly one
/// element, or a `Vec` of elements.
#[derive(Debug, Clone)]
enum SmallContainer<T> {
    /// Holds nothing.
    Zero,
    /// Holds a single value directly, without a `Vec`.
    One(T),
    /// Holds the values in a `Vec`.
    Many(Vec<T>),
}

impl <T> SmallContainer<T> {
fn from_vec(mut v: Vec<T>) -> Self {
match v.len() {
0 => Self::Zero,
1 => Self::One(v.pop().unwrap()),
_ => Self::Many(v)
}
}

fn get(&self, index: usize) -> &T {
match self {
SmallContainer::One(one) if index == 0 => one,
SmallContainer::Many(v) if index < v.len() => &v[index],
_ => panic!("index out of bounds")
}
}

fn as_slice(&self) -> &[T] {
match self {
SmallContainer::Zero => &[],
SmallContainer::One(one) => std::slice::from_ref(one),
SmallContainer::Many(v) => v.as_slice()
}
}

fn iter(&self) -> impl Iterator<Item=&T> {
self.as_slice().iter()
}
}

/// Allows `container[i]` syntax on a `SmallContainer`.
impl<T> Index<usize> for SmallContainer<T> {
    type Output = T;

    /// Returns the element at `index` by delegating to `SmallContainer::get`,
    /// and therefore panics whenever `get` does.
    fn index(&self, index: usize) -> &T {
        Self::get(self, index)
    }
}

impl <T> Default for SmallContainer<T> {
fn default() -> Self {
Self::Zero
}
}

/// A generic representation of Arrow array data which encapsulates common attributes and
/// operations for Arrow arrays. Specific operations for different array types (e.g.,
/// primitive, list, struct) are implemented in `Array`.
Expand All @@ -225,11 +277,11 @@ pub struct ArrayData {
/// The buffers for this array data. Note that depending on the array types, this
/// could hold different kinds of buffers (e.g., value buffer, value offset buffer)
/// at different positions.
buffers: Vec<Buffer>,
buffers: SmallContainer<Buffer>,

/// The child(ren) of this array. Only non-empty for nested types, currently
/// `ListArray` and `StructArray`.
child_data: Vec<ArrayDataRef>,
child_data: SmallContainer<ArrayDataRef>,

/// The null bitmap. A `None` value for this indicates all values are non-null in
/// this array.
Expand Down Expand Up @@ -258,8 +310,33 @@ impl ArrayData {
len,
null_count,
offset,
buffers,
child_data,
buffers: SmallContainer::from_vec(buffers),
child_data: SmallContainer::from_vec(child_data),
null_bitmap,
}
}

/// Creates an `ArrayData` that holds exactly one buffer and at most one child.
///
/// When `null_count` is `None`, the count is computed with `count_nulls` from
/// `null_bit_buffer`, `offset` and `len`. The optional `null_bit_buffer` is
/// converted into the stored null bitmap.
pub fn new_smallvec(
    data_type: DataType,
    len: usize,
    null_count: Option<usize>,
    null_bit_buffer: Option<Buffer>,
    offset: usize,
    buffer: Buffer,
    child_data: Option<ArrayDataRef>,
) -> Self {
    // Count nulls while the bit buffer is still borrowed; it is moved into
    // the bitmap below.
    let null_count =
        null_count.unwrap_or_else(|| count_nulls(null_bit_buffer.as_ref(), offset, len));
    let children = match child_data {
        Some(child) => SmallContainer::One(child),
        None => SmallContainer::Zero,
    };

    Self {
        data_type,
        len,
        null_count,
        offset,
        buffers: SmallContainer::One(buffer),
        child_data: children,
        null_bitmap: null_bit_buffer.map(Bitmap::from),
    }
}
Expand All @@ -278,12 +355,12 @@ impl ArrayData {

/// Returns a slice of buffers for this array data.
///
/// The slice is borrowed from `self.buffers` via `as_slice`.
pub fn buffers(&self) -> &[Buffer] {
    // `as_slice` already yields `&[Buffer]`; no extra borrow or range index
    // is needed.
    self.buffers.as_slice()
}

/// Returns a slice of children data arrays.
///
/// The slice is borrowed from `self.child_data` via `as_slice`.
pub fn child_data(&self) -> &[ArrayDataRef] {
    // `as_slice` already yields `&[ArrayDataRef]`; no extra borrow or range
    // index is needed.
    self.child_data.as_slice()
}

/// Returns whether the element at index `i` is null
Expand Down Expand Up @@ -340,15 +417,16 @@ impl ArrayData {
/// Returns the total number of bytes of memory occupied by the buffers owned by this [ArrayData].
pub fn get_buffer_memory_size(&self) -> usize {
let mut size = 0;
for buffer in &self.buffers {
for buffer in self.buffers.iter() {
size += buffer.capacity();
}
if let Some(bitmap) = &self.null_bitmap {
size += bitmap.get_buffer_memory_size()
}
for child in &self.child_data {
for child in self.child_data.iter() {
size += child.get_buffer_memory_size();
}

size
}

Expand All @@ -362,14 +440,14 @@ impl ArrayData {
- mem::size_of_val(&self.child_data);

// Calculate rest of the fields top down which contain actual data
for buffer in &self.buffers {
for buffer in self.buffers.iter() {
size += mem::size_of_val(&buffer);
size += buffer.capacity();
}
if let Some(bitmap) = &self.null_bitmap {
size += bitmap.get_array_memory_size()
}
for child in &self.child_data {
for child in self.child_data.iter() {
size += child.get_array_memory_size();
}

Expand Down