-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-10827: [Rust] Move concat from builders to a compute kernel and make it faster (2-6x) #8853
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
724cb1a
fda61f4
ba670c5
0193e0f
a9f8c6f
608fff6
1f3c773
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,102 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #[macro_use] | ||
| extern crate criterion; | ||
| use criterion::Criterion; | ||
|
|
||
| use rand::distributions::{Alphanumeric, Distribution, Standard}; | ||
| use rand::Rng; | ||
|
|
||
| use std::sync::Arc; | ||
|
|
||
| extern crate arrow; | ||
|
|
||
| use arrow::array::*; | ||
| use arrow::compute::concat; | ||
| use arrow::datatypes::*; | ||
| use arrow::util::test_util::seedable_rng; | ||
|
|
||
| // cast array from specified primitive array type to desired data type | ||
| fn create_primitive<T>(size: usize, null_density: f32) -> ArrayRef | ||
| where | ||
| T: ArrowPrimitiveType, | ||
| Standard: Distribution<T::Native>, | ||
| PrimitiveArray<T>: std::convert::From<Vec<T::Native>>, | ||
| { | ||
| let mut rng = seedable_rng(); | ||
|
|
||
| let array: PrimitiveArray<T> = seedable_rng() | ||
| .sample_iter(&Standard) | ||
| .take(size) | ||
| .map(|value| { | ||
| let x = rng.gen::<f32>(); | ||
| if x < null_density { | ||
| Some(value) | ||
| } else { | ||
| None | ||
| } | ||
| }) | ||
| .collect(); | ||
|
|
||
| Arc::new(array) as ArrayRef | ||
| } | ||
|
|
||
| fn create_strings(size: usize, null_density: f32) -> ArrayRef { | ||
| let rng = &mut seedable_rng(); | ||
|
|
||
| let mut builder = StringBuilder::new(size); | ||
| for _ in 0..size { | ||
| let x = rng.gen::<f32>(); | ||
| if x < null_density { | ||
| let value = rng.sample_iter(&Alphanumeric).take(4).collect::<String>(); | ||
| builder.append_value(&value).unwrap(); | ||
| } else { | ||
| builder.append_null().unwrap() | ||
| } | ||
| } | ||
| Arc::new(builder.finish()) | ||
| } | ||
|
|
||
| fn bench_concat(v1: &ArrayRef, v2: &ArrayRef) { | ||
| criterion::black_box(concat(&[v1.as_ref(), v2.as_ref()]).unwrap()); | ||
| } | ||
|
|
||
| fn add_benchmark(c: &mut Criterion) { | ||
| let v1 = create_primitive::<Int32Type>(1024, 0.0); | ||
| let v2 = create_primitive::<Int32Type>(1024, 0.0); | ||
| c.bench_function("concat i32 1024", |b| b.iter(|| bench_concat(&v1, &v2))); | ||
|
|
||
| let v1 = create_primitive::<Int32Type>(1024, 0.5); | ||
| let v2 = create_primitive::<Int32Type>(1024, 0.5); | ||
| c.bench_function("concat i32 nulls 1024", |b| { | ||
| b.iter(|| bench_concat(&v1, &v2)) | ||
| }); | ||
|
|
||
| let v1 = create_strings(1024, 0.0); | ||
| let v2 = create_strings(1024, 0.0); | ||
| c.bench_function("concat str 1024", |b| b.iter(|| bench_concat(&v1, &v2))); | ||
|
|
||
| let v1 = create_strings(1024, 0.5); | ||
| let v2 = create_strings(1024, 0.5); | ||
| c.bench_function("concat str nulls 1024", |b| { | ||
| b.iter(|| bench_concat(&v1, &v2)) | ||
| }); | ||
| } | ||
|
|
||
| criterion_group!(benches, add_benchmark); | ||
| criterion_main!(benches); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,10 +15,13 @@ | |
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| use std::convert::{From, TryInto}; | ||
| use std::fmt; | ||
| use std::mem; | ||
| use std::{any::Any, iter::FromIterator}; | ||
| use std::{ | ||
| convert::{From, TryInto}, | ||
| sync::Arc, | ||
| }; | ||
|
|
||
| use super::{ | ||
| array::print_long_array, raw_pointer::as_aligned_pointer, raw_pointer::RawPtrBox, | ||
|
|
@@ -373,6 +376,45 @@ impl From<Vec<Vec<u8>>> for FixedSizeBinaryArray { | |
| } | ||
| } | ||
|
|
||
| impl From<Vec<Option<Vec<u8>>>> for FixedSizeBinaryArray { | ||
| fn from(data: Vec<Option<Vec<u8>>>) -> Self { | ||
| let len = data.len(); | ||
| assert!(len > 0); | ||
| // try to estimate the size. This may not be possible no entry is valid => panic | ||
| let size = data.iter().filter_map(|e| e.as_ref()).next().unwrap().len(); | ||
| assert!(data | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given this operation can fail (if all the elements are not the same length) perhaps we should implement |
||
| .iter() | ||
| .filter_map(|e| e.as_ref()) | ||
| .all(|item| item.len() == size)); | ||
|
|
||
| let num_bytes = bit_util::ceil(len, 8); | ||
| let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false); | ||
| let null_slice = null_buf.data_mut(); | ||
|
|
||
| data.iter().enumerate().for_each(|(i, entry)| { | ||
| if entry.is_some() { | ||
| bit_util::set_bit(null_slice, i); | ||
| } | ||
| }); | ||
|
|
||
| let data = data | ||
| .into_iter() | ||
| .map(|e| e.unwrap_or_else(|| vec![0; size])) | ||
| .flatten() | ||
| .collect::<Vec<_>>(); | ||
| let data = ArrayData::new( | ||
| DataType::FixedSizeBinary(size as i32), | ||
| len, | ||
| None, | ||
| Some(null_buf.freeze()), | ||
| 0, | ||
| vec![Buffer::from(&data)], | ||
| vec![], | ||
| ); | ||
| FixedSizeBinaryArray::from(Arc::new(data)) | ||
| } | ||
| } | ||
|
|
||
| impl From<ArrayDataRef> for FixedSizeBinaryArray { | ||
| fn from(data: ArrayDataRef) -> Self { | ||
| assert_eq!( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This behavior (panic'ing') doesn't seem ideal, though I realize there isn't much useful to do when converting a Vec of entirely
None-- maybe we could just return a zero length array.Could definitely be done as a follow on PR
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that in general we should avoid using these because they require two allocations (rust's
Vecand arrow buffers). This function is mostly useful for testing.I would be ok with replacing them by the
FromIterconstructor, which is more performance, more general, and has the same ergonomics (from(vec![].into_iter())instead offrom(vec![...])for a vector). This way we do not need to worry about these.The challenge with fixed sized items is that they require knowledge of the size. This would be nicely solved by accepting
Option<[T; T: usize]>, but Rust's support for constant generics is slim atm.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Filed https://issues.apache.org/jira/browse/ARROW-10903