diff --git a/arrow2_convert/Cargo.toml b/arrow2_convert/Cargo.toml index cdb771d..14675cf 100644 --- a/arrow2_convert/Cargo.toml +++ b/arrow2_convert/Cargo.toml @@ -20,7 +20,15 @@ trybuild = "1.0" [dev-dependencies] arrow2_convert_derive = { version = "0.4.0", path = "../arrow2_convert_derive" } +criterion = "0.4" [features] default = ["derive"] derive = ["arrow2_convert_derive"] + +[lib] +bench = false + +[[bench]] +name = "bench" +harness = false diff --git a/arrow2_convert/benches/bench.rs b/arrow2_convert/benches/bench.rs new file mode 100644 index 0000000..3aded7e --- /dev/null +++ b/arrow2_convert/benches/bench.rs @@ -0,0 +1,68 @@ +use arrow2::{array::Array, buffer::Buffer}; +use arrow2_convert::{ + deserialize::TryIntoCollection, serialize::TryIntoArrow, ArrowDeserialize, ArrowField, + ArrowSerialize, +}; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; + +#[derive(ArrowField, ArrowSerialize, ArrowDeserialize)] +#[arrow_field(transparent)] +pub struct BufStruct(Buffer); + +#[derive(ArrowField, ArrowSerialize, ArrowDeserialize)] +#[arrow_field(transparent)] +pub struct VecStruct(Vec); + +pub fn bench_buffer_serialize(c: &mut Criterion) { + let mut group = c.benchmark_group("serialize"); + for size in [1, 10, 100, 1000, 10000].iter() { + group.throughput(Throughput::Elements(*size as u64)); + group.bench_with_input(BenchmarkId::new("Buffer", size), size, |b, &size| { + let data = [BufStruct((0..size as u16).into_iter().collect())]; + b.iter(|| { + let _: Box = TryIntoArrow::try_into_arrow(black_box(&data)).unwrap(); + }); + }); + group.bench_with_input(BenchmarkId::new("Vec", size), size, |b, &size| { + let data = [VecStruct((0..size as u16).into_iter().collect())]; + b.iter(|| { + let _: Box = TryIntoArrow::try_into_arrow(black_box(&data)).unwrap(); + }); + }); + } +} +pub fn bench_buffer_deserialize(c: &mut Criterion) { + let mut group = c.benchmark_group("deserialize"); + for size in [1, 10, 100, 1000, 10000].iter() { + group.throughput(Throughput::Elements(*size as u64)); + group.bench_with_input(BenchmarkId::new("Buffer", size), size, |b, &size| { + let data: Box = [BufStruct((0..size as u16).into_iter().collect())] + .try_into_arrow() + .unwrap(); + b.iter_batched( + || data.clone(), + |data| { + let _: Vec = + TryIntoCollection::try_into_collection(black_box(data)).unwrap(); + }, + criterion::BatchSize::SmallInput, + ) + }); + group.bench_with_input(BenchmarkId::new("Vec", size), size, |b, &size| { + let data: Box = [VecStruct((0..size as u16).into_iter().collect())] + .try_into_arrow() + .unwrap(); + b.iter_batched( + || data.clone(), + |data| { + let _: Vec = + TryIntoCollection::try_into_collection(black_box(data)).unwrap(); + }, + criterion::BatchSize::SmallInput, + ); + }); + } +} + +criterion_group!(benches, bench_buffer_serialize, bench_buffer_deserialize); +criterion_main!(benches); diff --git a/arrow2_convert/src/deserialize.rs b/arrow2_convert/src/deserialize.rs index c3a83f8..00fc5b8 100644 --- a/arrow2_convert/src/deserialize.rs +++ b/arrow2_convert/src/deserialize.rs @@ -1,6 +1,6 @@ //! Implementation and traits for deserializing from Arrow. -use arrow2::array::*; +use arrow2::{array::*, buffer::Buffer, types::NativeType}; use chrono::{NaiveDate, NaiveDateTime}; use crate::field::*; @@ -71,6 +71,7 @@ macro_rules! impl_arrow_array { impl ArrowArray for $array { type BaseArrayType = Self; + #[inline] fn iter_from_array_ref(b: &dyn Array) -> <&Self as IntoIterator>::IntoIter { b.as_any() .downcast_ref::() @@ -213,6 +214,28 @@ where }) } +// Blanket implementation for Buffer +impl ArrowDeserialize for Buffer +where + T: ArrowDeserialize + NativeType + ArrowEnableVecForType, + for<'b> &'b ::ArrayType: IntoIterator, +{ + type ArrayType = ListArray; + + #[inline] + fn arrow_deserialize( + v: <&Self::ArrayType as IntoIterator>::Item, + ) -> Option<::Type> { + v.map(|t| { + t.as_any() + .downcast_ref::>() + .unwrap() + .values() + .clone() + }) + } +} + // Blanket implementation for Vec impl ArrowDeserialize for Vec where diff --git a/arrow2_convert/src/field.rs b/arrow2_convert/src/field.rs index 3f144cb..f581ddc 100644 --- a/arrow2_convert/src/field.rs +++ b/arrow2_convert/src/field.rs @@ -1,6 +1,10 @@ //! Implementation and traits for mapping rust types to Arrow types -use arrow2::datatypes::{DataType, Field}; +use arrow2::{ + buffer::Buffer, + datatypes::{DataType, Field}, + types::NativeType, +}; use chrono::{NaiveDate, NaiveDateTime}; /// Trait implemented by all types that can be used as an Arrow field. @@ -169,6 +173,15 @@ impl ArrowField for NaiveDate { } } +impl ArrowField for Buffer { + type Type = Self; + + #[inline] + fn data_type() -> arrow2::datatypes::DataType { + arrow2::datatypes::DataType::Binary + } +} + impl ArrowField for Vec { type Type = Self; @@ -202,6 +215,19 @@ impl ArrowField for FixedSizeBinary { } } +// Blanket implementation for Buffer +impl ArrowField for Buffer +where + T: ArrowField + NativeType + ArrowEnableVecForType, +{ + type Type = Self; + + #[inline] + fn data_type() -> DataType { + DataType::List(Box::new(::field("item"))) + } +} + // Blanket implementation for Vec. impl ArrowField for Vec where diff --git a/arrow2_convert/src/lib.rs b/arrow2_convert/src/lib.rs index 23d7d42..01169e9 100644 --- a/arrow2_convert/src/lib.rs +++ b/arrow2_convert/src/lib.rs @@ -1,4 +1,4 @@ -#![doc = include_str!("../README.md")] +#![cfg_attr(not(target_os = "windows"), doc = include_str!("../README.md"))] #![deny(missing_docs)] #![forbid(unsafe_code)] @@ -14,6 +14,6 @@ pub mod serialize; pub use arrow2_convert_derive::{ArrowDeserialize, ArrowField, ArrowSerialize}; // Test README with doctests -#[doc = include_str!("../README.md")] +#[cfg_attr(not(target_os = "windows"), doc = include_str!("../README.md"))] #[cfg(doctest)] struct ReadmeDoctests; diff --git a/arrow2_convert/src/serialize.rs b/arrow2_convert/src/serialize.rs index dab1da8..bf0e2ea 100644 --- a/arrow2_convert/src/serialize.rs +++ b/arrow2_convert/src/serialize.rs @@ -1,8 +1,9 @@ //! Implementation and traits for serializing to Arrow. -use arrow2::array::Array; use arrow2::array::*; use arrow2::chunk::Chunk; +use arrow2::types::NativeType; +use arrow2::{array::Array, buffer::Buffer}; use chrono::{NaiveDate, NaiveDateTime}; use std::sync::Arc; @@ -178,6 +179,20 @@ impl ArrowSerialize for NaiveDate { } } +impl ArrowSerialize for Buffer { + type MutableArrayType = MutableBinaryArray; + + #[inline] + fn new_array() -> Self::MutableArrayType { + Self::MutableArrayType::default() + } + + #[inline] + fn arrow_serialize(v: &Self, array: &mut Self::MutableArrayType) -> arrow2::error::Result<()> { + array.try_push(Some(v.as_slice())) + } +} + impl ArrowSerialize for Vec { type MutableArrayType = MutableBinaryArray; @@ -226,6 +241,34 @@ impl ArrowSerialize for FixedSizeBinary { } } +// Blanket implementation for Buffer +impl ArrowSerialize for Buffer +where + T: NativeType + ArrowSerialize + ArrowEnableVecForType, +{ + type MutableArrayType = MutableListArray>; + + #[inline] + fn new_array() -> Self::MutableArrayType { + Self::MutableArrayType::new_with_field( + MutablePrimitiveArray::new(), + "item", + ::is_nullable(), + ) + } + + #[inline] + fn arrow_serialize( + v: &::Type, + array: &mut Self::MutableArrayType, + ) -> arrow2::error::Result<()> { + let values = array.mut_values(); + values.reserve(v.len()); + values.extend_from_slice(v.as_slice()); + array.try_push_valid() + } +} + // Blanket implementation for Vec impl ArrowSerialize for Vec where diff --git a/arrow2_convert/tests/test_deserialize.rs b/arrow2_convert/tests/test_deserialize.rs index b9acc4a..e335c38 100644 --- a/arrow2_convert/tests/test_deserialize.rs +++ b/arrow2_convert/tests/test_deserialize.rs @@ -1,5 +1,5 @@ -use arrow2::array::*; use arrow2::error::Result; +use arrow2::{array::*, buffer::Buffer}; use arrow2_convert::{deserialize::*, serialize::*, ArrowDeserialize, ArrowField, ArrowSerialize}; #[test] @@ -77,3 +77,13 @@ fn test_deserialize_large_types_schema_mismatch_error() { let result: Result> = arr1.try_into_collection(); assert!(result.is_err()); } + +#[test] +fn test_deserialize_buffer() { + let original_array = [Buffer::from_iter(0u16..5), Buffer::from_iter(7..9)]; + let b: Box = original_array.try_into_arrow().unwrap(); + let iter = arrow_array_deserialize_iterator::>(b.as_ref()).unwrap(); + for (i, k) in iter.zip(original_array.iter()) { + assert_eq!(&i, k); + } +} diff --git a/arrow2_convert/tests/test_serialize.rs b/arrow2_convert/tests/test_serialize.rs index c257863..70ecd9e 100644 --- a/arrow2_convert/tests/test_serialize.rs +++ b/arrow2_convert/tests/test_serialize.rs @@ -1,4 +1,5 @@ use arrow2::array::Array; +use arrow2::buffer::Buffer; use arrow2::chunk::Chunk; use arrow2_convert::field::{ArrowField, FixedSizeBinary}; use arrow2_convert::serialize::*; @@ -69,6 +70,23 @@ fn test_array() { assert_eq!(r.data_type(), & as ArrowField>::data_type()); } +#[test] +fn test_buffer() { + // Buffer and Vec should serialize into BinaryArray + let dat: Vec> = vec![(0..10).into_iter().collect()]; + let r: Box = dat.try_into_arrow().unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.data_type(), & as ArrowField>::data_type()); + assert_eq!(r.data_type(), & as ArrowField>::data_type()); + + // Buffer and Vec should serialize into ListArray + let dat: Vec> = vec![(0..10).into_iter().collect()]; + let r: Box = dat.try_into_arrow().unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.data_type(), & as ArrowField>::data_type()); + assert_eq!(r.data_type(), & as ArrowField>::data_type()); +} + #[test] fn test_field_serialize_error() { pub struct CustomType(u64);