Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

arrow2 estimated_bytes_size benchmarks #1743

Merged
merged 2 commits into from
Apr 3, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 170 additions & 56 deletions crates/re_arrow_store/benches/arrow2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,22 @@ static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;

use std::sync::Arc;

use arrow2::array::{Array, PrimitiveArray, StructArray};
use arrow2::{
array::{Array, PrimitiveArray, StructArray, UnionArray},
compute::aggregate::estimated_bytes_size,
};
use criterion::{criterion_group, criterion_main, Criterion};
use itertools::Itertools;
use re_log_types::{
component_types::{InstanceKey, Point2D},
datagen::{build_some_instances, build_some_point2d},
DataCell,
component_types::{InstanceKey, Point2D, Rect2D},
datagen::{build_some_instances, build_some_point2d, build_some_rects},
external::arrow2_convert::serialize::TryIntoArrow,
DataCell, SerializableComponent,
};

// ---

criterion_group!(benches, estimated_size_bytes);
criterion_group!(benches, erased_clone, estimated_size_bytes);
criterion_main!(benches);

// ---
Expand All @@ -41,19 +45,138 @@ enum ArrayKind {

/// E.g. an array of `Point2D`.
Struct,

/// E.g. an array of `Rect2D`.
StructLarge,
}

impl std::fmt::Display for ArrayKind {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(match self {
ArrayKind::Primitive => "primitive",
ArrayKind::Struct => "struct",
ArrayKind::StructLarge => "struct_large",
})
}
}

fn erased_clone(c: &mut Criterion) {
let kind = [
ArrayKind::Primitive,
ArrayKind::Struct,
ArrayKind::StructLarge,
];

for kind in kind {
let mut group = c.benchmark_group(format!(
"arrow2/size_bytes/{kind}/rows={NUM_ROWS}/instances={NUM_INSTANCES}"
));
group.throughput(criterion::Throughput::Elements(NUM_ROWS as _));

match kind {
ArrayKind::Primitive => {
let data = build_some_instances(NUM_INSTANCES);
bench_arrow(&mut group, data.as_slice());
bench_native(&mut group, data.as_slice());
}
ArrayKind::Struct => {
let data = build_some_point2d(NUM_INSTANCES);
bench_arrow(&mut group, data.as_slice());
bench_native(&mut group, data.as_slice());
}
ArrayKind::StructLarge => {
let data = build_some_rects(NUM_INSTANCES);
bench_arrow(&mut group, data.as_slice());
bench_native(&mut group, data.as_slice());
}
}
}

// TODO(cmc): Use cells once `cell.size_bytes()` has landed (#1727)
fn bench_arrow<T: SerializableComponent>(
group: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>,
data: &[T],
) {
let arrays: Vec<Box<dyn Array>> = (0..NUM_ROWS)
.map(|_| TryIntoArrow::try_into_arrow(data).unwrap())
.collect_vec();

let total_size_bytes = arrays
.iter()
.map(|array| estimated_bytes_size(&**array) as u64)
.sum::<u64>();
assert!(total_size_bytes as usize >= NUM_ROWS * NUM_INSTANCES * std::mem::size_of::<T>());

group.bench_function("array", |b| {
b.iter(|| {
let sz = arrays
.iter()
.map(|array| estimated_bytes_size(&**array) as u64)
.sum::<u64>();
assert_eq!(total_size_bytes, sz);
sz
});
});
}

fn bench_native<T: Clone>(
group: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>,
data: &[T],
) {
let vecs = (0..NUM_ROWS).map(|_| data.to_vec()).collect_vec();

let total_size_bytes = vecs
.iter()
.map(|vec| std::mem::size_of_val(vec.as_slice()) as u64)
.sum::<u64>();
assert!(total_size_bytes as usize >= NUM_ROWS * NUM_INSTANCES * std::mem::size_of::<T>());

{
let vecs = (0..NUM_ROWS).map(|_| data.to_vec()).collect_vec();
group.bench_function("vec", |b| {
b.iter(|| {
let sz = vecs
.iter()
.map(|vec| std::mem::size_of_val(vec.as_slice()) as u64)
.sum::<u64>();
assert_eq!(total_size_bytes, sz);
sz
});
});
}

trait SizeOf {
fn size_of(&self) -> usize;
}

impl<T> SizeOf for Vec<T> {
fn size_of(&self) -> usize {
std::mem::size_of_val(self.as_slice())
}
}

{
let vecs: Vec<Box<dyn SizeOf>> = (0..NUM_ROWS)
.map(|_| Box::new(data.to_vec()) as Box<dyn SizeOf>)
.collect_vec();

group.bench_function("vec/erased", |b| {
b.iter(|| {
let sz = vecs.iter().map(|vec| vec.size_of() as u64).sum::<u64>();
assert_eq!(total_size_bytes, sz);
sz
});
});
}
}
}

fn estimated_size_bytes(c: &mut Criterion) {
let kind = [ArrayKind::Primitive, ArrayKind::Struct];
let kind = [
ArrayKind::Primitive,
ArrayKind::Struct,
ArrayKind::StructLarge,
];

for kind in kind {
let mut group = c.benchmark_group(format!(
Expand All @@ -69,6 +192,9 @@ fn estimated_size_bytes(c: &mut Criterion) {
ArrayKind::Struct => (0..NUM_ROWS)
.map(|_| DataCell::from_native(build_some_point2d(NUM_INSTANCES).as_slice()))
.collect(),
ArrayKind::StructLarge => (0..NUM_ROWS)
.map(|_| DataCell::from_native(build_some_rects(NUM_INSTANCES).as_slice()))
.collect(),
}
}

Expand Down Expand Up @@ -127,59 +253,40 @@ fn estimated_size_bytes(c: &mut Criterion) {

match kind {
ArrayKind::Primitive => {
let cells = generate_cells(kind);
let arrays = cells
.iter()
.map(|cell| {
cell.as_arrow_ref()
.as_any()
.downcast_ref::<PrimitiveArray<u64>>()
.unwrap()
.clone()
})
.collect_vec();
let total_instances =
arrays.iter().map(|array| array.len() as u32).sum::<u32>();
assert_eq!(total_instances, (NUM_ROWS * NUM_INSTANCES) as u32);

group.bench_function("array/downcast_first", |b| {
b.iter(|| {
let arrays = arrays.clone();
assert_eq!(
total_instances,
arrays.iter().map(|array| array.len() as u32).sum::<u32>()
);
arrays
});
});
bench_downcast_first::<PrimitiveArray<u64>>(&mut group, kind);
}
ArrayKind::Struct => {
let cells = generate_cells(kind);
let arrays = cells
.iter()
.map(|cell| {
cell.as_arrow_ref()
.as_any()
.downcast_ref::<StructArray>()
.unwrap()
.clone()
})
.collect_vec();
let total_instances =
arrays.iter().map(|array| array.len() as u32).sum::<u32>();
assert_eq!(total_instances, (NUM_ROWS * NUM_INSTANCES) as u32);
ArrayKind::Struct => bench_downcast_first::<StructArray>(&mut group, kind),
ArrayKind::StructLarge => bench_downcast_first::<UnionArray>(&mut group, kind),
}

group.bench_function("array/downcast_first", |b| {
b.iter(|| {
let arrays = arrays.clone();
assert_eq!(
total_instances,
arrays.iter().map(|array| array.len() as u32).sum::<u32>()
);
arrays
});
fn bench_downcast_first<T: arrow2::array::Array + Clone>(
group: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>,
kind: ArrayKind,
) {
let cells = generate_cells(kind);
let arrays = cells
.iter()
.map(|cell| {
cell.as_arrow_ref()
.as_any()
.downcast_ref::<T>()
.unwrap()
.clone()
})
.collect_vec();
let total_instances = arrays.iter().map(|array| array.len() as u32).sum::<u32>();
assert_eq!(total_instances, (NUM_ROWS * NUM_INSTANCES) as u32);

group.bench_function("array/downcast_first", |b| {
b.iter(|| {
let arrays = arrays.clone();
assert_eq!(
total_instances,
arrays.iter().map(|array| array.len() as u32).sum::<u32>()
);
arrays
});
}
});
}
}

Expand All @@ -196,9 +303,16 @@ fn estimated_size_bytes(c: &mut Criterion) {
.collect()
}

/// Builds `NUM_ROWS` rows, each one a freshly generated `Vec` of `NUM_INSTANCES` `Rect2D`s.
fn generate_rects() -> Vec<Vec<Rect2D>> {
    let mut rows = Vec::with_capacity(NUM_ROWS);
    for _ in 0..NUM_ROWS {
        rows.push(build_some_rects(NUM_INSTANCES));
    }
    rows
}

match kind {
ArrayKind::Primitive => bench_std(&mut group, generate_keys()),
ArrayKind::Struct => bench_std(&mut group, generate_points()),
ArrayKind::StructLarge => bench_std(&mut group, generate_rects()),
}

fn bench_std<T: Clone>(
Expand Down