Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions rust/arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,7 @@ harness = false
[[bench]]
name = "array_slice"
harness = false

[[bench]]
name = "concatenate_kernel"
harness = false
102 changes: 102 additions & 0 deletions rust/arrow/benches/concatenate_kernel.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#[macro_use]
extern crate criterion;
use criterion::Criterion;

use rand::distributions::{Alphanumeric, Distribution, Standard};
use rand::Rng;

use std::sync::Arc;

extern crate arrow;

use arrow::array::*;
use arrow::compute::concat;
use arrow::datatypes::*;
use arrow::util::test_util::seedable_rng;

// cast array from specified primitive array type to desired data type
fn create_primitive<T>(size: usize, null_density: f32) -> ArrayRef
where
T: ArrowPrimitiveType,
Standard: Distribution<T::Native>,
PrimitiveArray<T>: std::convert::From<Vec<T::Native>>,
{
let mut rng = seedable_rng();

let array: PrimitiveArray<T> = seedable_rng()
.sample_iter(&Standard)
.take(size)
.map(|value| {
let x = rng.gen::<f32>();
if x < null_density {
Some(value)
} else {
None
}
})
.collect();

Arc::new(array) as ArrayRef
}

fn create_strings(size: usize, null_density: f32) -> ArrayRef {
let rng = &mut seedable_rng();

let mut builder = StringBuilder::new(size);
for _ in 0..size {
let x = rng.gen::<f32>();
if x < null_density {
let value = rng.sample_iter(&Alphanumeric).take(4).collect::<String>();
builder.append_value(&value).unwrap();
} else {
builder.append_null().unwrap()
}
}
Arc::new(builder.finish())
}

fn bench_concat(v1: &ArrayRef, v2: &ArrayRef) {
criterion::black_box(concat(&[v1.as_ref(), v2.as_ref()]).unwrap());
}

fn add_benchmark(c: &mut Criterion) {
let v1 = create_primitive::<Int32Type>(1024, 0.0);
let v2 = create_primitive::<Int32Type>(1024, 0.0);
c.bench_function("concat i32 1024", |b| b.iter(|| bench_concat(&v1, &v2)));

let v1 = create_primitive::<Int32Type>(1024, 0.5);
let v2 = create_primitive::<Int32Type>(1024, 0.5);
c.bench_function("concat i32 nulls 1024", |b| {
b.iter(|| bench_concat(&v1, &v2))
});

let v1 = create_strings(1024, 0.0);
let v2 = create_strings(1024, 0.0);
c.bench_function("concat str 1024", |b| b.iter(|| bench_concat(&v1, &v2)));

let v1 = create_strings(1024, 0.5);
let v2 = create_strings(1024, 0.5);
c.bench_function("concat str nulls 1024", |b| {
b.iter(|| bench_concat(&v1, &v2))
});
}

criterion_group!(benches, add_benchmark);
criterion_main!(benches);
44 changes: 43 additions & 1 deletion rust/arrow/src/array/array_binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,13 @@
// specific language governing permissions and limitations
// under the License.

use std::convert::{From, TryInto};
use std::fmt;
use std::mem;
use std::{any::Any, iter::FromIterator};
use std::{
convert::{From, TryInto},
sync::Arc,
};

use super::{
array::print_long_array, raw_pointer::as_aligned_pointer, raw_pointer::RawPtrBox,
Expand Down Expand Up @@ -373,6 +376,45 @@ impl From<Vec<Vec<u8>>> for FixedSizeBinaryArray {
}
}

impl From<Vec<Option<Vec<u8>>>> for FixedSizeBinaryArray {
fn from(data: Vec<Option<Vec<u8>>>) -> Self {
let len = data.len();
assert!(len > 0);
// try to estimate the size. This may not be possible no entry is valid => panic
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This behavior (panic'ing') doesn't seem ideal, though I realize there isn't much useful to do when converting a Vec of entirely None -- maybe we could just return a zero length array.

Could definitely be done as a follow on PR

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that in general we should avoid using these because they require two allocations (rust's Vec and arrow buffers). This function is mostly useful for testing.

I would be ok with replacing them by the FromIter constructor, which is more performance, more general, and has the same ergonomics (from(vec![].into_iter()) instead of from(vec![...]) for a vector). This way we do not need to worry about these.

The challenge with fixed sized items is that they require knowledge of the size. This would be nicely solved by accepting Option<[T; T: usize]>, but Rust's support for constant generics is slim atm.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let size = data.iter().filter_map(|e| e.as_ref()).next().unwrap().len();
assert!(data
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given this operation can fail (if all the elements are not the same length) perhaps we should implement TryFrom instead of From and panic -- again, this would be an excellent follow on PR (or maybe file a ticket and it could be an excellent first contribution for someone who wanted to contribute)

.iter()
.filter_map(|e| e.as_ref())
.all(|item| item.len() == size));

let num_bytes = bit_util::ceil(len, 8);
let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false);
let null_slice = null_buf.data_mut();

data.iter().enumerate().for_each(|(i, entry)| {
if entry.is_some() {
bit_util::set_bit(null_slice, i);
}
});

let data = data
.into_iter()
.map(|e| e.unwrap_or_else(|| vec![0; size]))
.flatten()
.collect::<Vec<_>>();
let data = ArrayData::new(
DataType::FixedSizeBinary(size as i32),
len,
None,
Some(null_buf.freeze()),
0,
vec![Buffer::from(&data)],
vec![],
);
FixedSizeBinaryArray::from(Arc::new(data))
}
}

impl From<ArrayDataRef> for FixedSizeBinaryArray {
fn from(data: ArrayDataRef) -> Self {
assert_eq!(
Expand Down
Loading