diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml
index 0dc43e075e0..39963777348 100644
--- a/rust/arrow/Cargo.toml
+++ b/rust/arrow/Cargo.toml
@@ -126,3 +126,7 @@ harness = false
 [[bench]]
 name = "array_slice"
 harness = false
+
+[[bench]]
+name = "concatenate_kernel"
+harness = false
diff --git a/rust/arrow/benches/concatenate_kernel.rs b/rust/arrow/benches/concatenate_kernel.rs
new file mode 100644
index 00000000000..84ee13fbc4e
--- /dev/null
+++ b/rust/arrow/benches/concatenate_kernel.rs
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#[macro_use]
+extern crate criterion;
+use criterion::Criterion;
+
+use rand::distributions::{Alphanumeric, Distribution, Standard};
+use rand::Rng;
+
+use std::sync::Arc;
+
+extern crate arrow;
+
+use arrow::array::*;
+use arrow::compute::concat;
+use arrow::datatypes::*;
+use arrow::util::test_util::seedable_rng;
+
+/// Creates a primitive array of `size` random values in which each slot is null
+/// with probability `null_density` (`0.0` yields no nulls).
+fn create_primitive<T>(size: usize, null_density: f32) -> ArrayRef
+where
+    T: ArrowPrimitiveType,
+    Standard: Distribution<T::Native>,
+    PrimitiveArray<T>: std::convert::From<Vec<T::Native>>,
+{
+    let mut rng = seedable_rng();
+
+    let array: PrimitiveArray<T> = seedable_rng()
+        .sample_iter(&Standard)
+        .take(size)
+        .map(|value| {
+            let x = rng.gen::<f32>();
+            // draws below `null_density` become nulls, everything else keeps its value
+            if x < null_density {
+                None
+            } else {
+                Some(value)
+            }
+        })
+        .collect();
+
+    Arc::new(array) as ArrayRef
+}
+
+/// Creates a string array of `size` slots holding random 4-character alphanumeric
+/// values, in which each slot is null with probability `null_density`.
+fn create_strings(size: usize, null_density: f32) -> ArrayRef {
+    let rng = &mut seedable_rng();
+
+    let mut builder = StringBuilder::new(size);
+    for _ in 0..size {
+        let x = rng.gen::<f32>();
+        if x < null_density {
+            builder.append_null().unwrap();
+        } else {
+            let value = rng.sample_iter(&Alphanumeric).take(4).collect::<String>();
+            builder.append_value(&value).unwrap();
+        }
+    }
+    Arc::new(builder.finish())
+}
+
+/// Concatenates `v1` and `v2`, passing the result through `black_box`.
+fn bench_concat(v1: &ArrayRef, v2: &ArrayRef) {
+    criterion::black_box(concat(&[v1.as_ref(), v2.as_ref()]).unwrap());
+}
+
+/// Registers the concat benchmarks: i32 and string arrays, with and without nulls.
+fn add_benchmark(c: &mut Criterion) {
+    let v1 = create_primitive::<Int32Type>(1024, 0.0);
+    let v2 = create_primitive::<Int32Type>(1024, 0.0);
+    c.bench_function("concat i32 1024", |b| b.iter(|| bench_concat(&v1, &v2)));
+
+    let v1 = create_primitive::<Int32Type>(1024, 0.5);
+    let v2 = create_primitive::<Int32Type>(1024, 0.5);
+    c.bench_function("concat i32 nulls 1024", |b| {
+        b.iter(|| bench_concat(&v1, &v2))
+    });
+
+    let v1 = create_strings(1024, 0.0);
+    let v2 = create_strings(1024, 0.0);
+    c.bench_function("concat str 1024", |b| b.iter(|| bench_concat(&v1, &v2)));
+
+    let v1 = create_strings(1024, 0.5);
+    let v2 = create_strings(1024, 0.5);
+    c.bench_function("concat str nulls 1024", |b| {
+        b.iter(|| bench_concat(&v1, &v2))
+    });
+}
+
+criterion_group!(benches, add_benchmark);
+criterion_main!(benches);
diff --git a/rust/arrow/src/array/array_binary.rs b/rust/arrow/src/array/array_binary.rs
index a32bcc075c3..da640723f12 100644
--- a/rust/arrow/src/array/array_binary.rs
+++ b/rust/arrow/src/array/array_binary.rs
@@ -15,10 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::convert::{From, TryInto};
 use std::fmt;
 use std::mem;
 use std::{any::Any, iter::FromIterator};
+use std::{
+    convert::{From, TryInto},
+    sync::Arc,
+};
 
 use super::{
     array::print_long_array, raw_pointer::as_aligned_pointer, raw_pointer::RawPtrBox,
@@ -373,6 +376,56 @@ impl From<Vec<Vec<u8>>> for FixedSizeBinaryArray {
     }
 }
 
+/// Builds a `FixedSizeBinaryArray` from optional byte vectors; `None` marks a null.
+///
+/// The value size is taken from the first non-null entry, and each null slot is
+/// written as `size` zero bytes in the values buffer.
+///
+/// # Panics
+///
+/// Panics if `data` is empty, if every entry is `None` (the value size cannot be
+/// inferred), or if the non-null entries do not all have the same length.
+impl From<Vec<Option<Vec<u8>>>> for FixedSizeBinaryArray {
+    fn from(data: Vec<Option<Vec<u8>>>) -> Self {
+        let len = data.len();
+        assert!(len > 0);
+        // Infer the value size from the first valid entry. This is not possible if
+        // no entry is valid, in which case the `unwrap` below panics.
+        let size = data.iter().filter_map(|e| e.as_ref()).next().unwrap().len();
+        assert!(data
+            .iter()
+            .filter_map(|e| e.as_ref())
+            .all(|item| item.len() == size));
+
+        // Null bitmap: initialized with `false`, then the bit of each `Some` is set.
+        let num_bytes = bit_util::ceil(len, 8);
+        let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false);
+        let null_slice = null_buf.data_mut();
+
+        data.iter().enumerate().for_each(|(i, entry)| {
+            if entry.is_some() {
+                bit_util::set_bit(null_slice, i);
+            }
+        });
+
+        // Null entries contribute `size` zero bytes, so every slot has the same width.
+        let data = data
+            .into_iter()
+            .flat_map(|e| e.unwrap_or_else(|| vec![0; size]))
+            .collect::<Vec<_>>();
+        let data = ArrayData::new(
+            DataType::FixedSizeBinary(size as i32),
+            len,
+            None,
+            Some(null_buf.freeze()),
+            0,
+            vec![Buffer::from(&data)],
+            vec![],
+        );
+        FixedSizeBinaryArray::from(Arc::new(data))
+    }
+}
+
 impl From<ArrayDataRef> for FixedSizeBinaryArray {
     fn from(data: ArrayDataRef) -> Self {
         assert_eq!(
diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs
index fdf6353170b..168d6e0b53f 100644
--- 
a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -25,7 +25,7 @@ use std::collections::HashMap; use std::fmt; use std::marker::PhantomData; use std::mem; -use std::{convert::TryInto, sync::Arc}; +use std::sync::Arc; use crate::array::*; use crate::buffer::{Buffer, MutableBuffer}; @@ -454,16 +454,6 @@ pub trait ArrayBuilder: Any { /// Returns whether number of array slots is zero fn is_empty(&self) -> bool; - /// Appends data from other arrays into the builder - /// - /// This is most useful when concatenating arrays of the same type into a builder. - fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()>; - - /// Returns the data type of the builder - /// - /// This is used for validating array data types in `append_data` - fn data_type(&self) -> DataType; - /// Builds the array fn finish(&mut self) -> ArrayRef; @@ -591,57 +581,6 @@ impl ArrayBuilder for BooleanBuilder { self.values_builder.is_empty() } - /// Appends data from other arrays into the builder - /// - /// This is most useful when concatenating arrays of the same type into a builder. 
- fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> { - // validate arraydata and reserve memory - let mut total_len = 0; - for array in data { - if array.data_type() != &self.data_type() { - return Err(ArrowError::InvalidArgumentError( - "Cannot append data to builder if data types are different" - .to_string(), - )); - } - if array.buffers().len() != 1 { - return Err(ArrowError::InvalidArgumentError( - "Primitive arrays should have 1 buffer".to_string(), - )); - } - total_len += array.len(); - } - // reserve memory - self.values_builder.reserve(total_len); - self.bitmap_builder.reserve(total_len); - - for array in data { - let len = array.len(); - if len == 0 { - continue; - } - - // booleans are bit-packed, thus we iterate through the array - let array = BooleanArray::from(array.clone()); - for i in 0..len { - self.values_builder.append(array.value(i))?; - } - - for i in 0..len { - // account for offset as `ArrayData` does not - self.bitmap_builder.append(array.is_valid(i))?; - } - } - Ok(()) - } - - /// Returns the data type of the builder - /// - /// This is used for validating array data types in `append_data` - fn data_type(&self) -> DataType { - DataType::Boolean - } - /// Builds the array and reset this builder. fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) @@ -681,65 +620,6 @@ impl ArrayBuilder for PrimitiveBuilder { self.values_builder.is_empty() } - /// Appends data from other arrays into the builder - /// - /// This is most useful when concatenating arrays of the same type into a builder. 
- fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> { - // validate arraydata and reserve memory - let mut total_len = 0; - for array in data { - if array.data_type() != &self.data_type() { - return Err(ArrowError::InvalidArgumentError( - "Cannot append data to builder if data types are different" - .to_string(), - )); - } - if array.buffers().len() != 1 { - return Err(ArrowError::InvalidArgumentError( - "Primitive arrays should have 1 buffer".to_string(), - )); - } - total_len += array.len(); - } - // reserve memory - self.values_builder.reserve(total_len); - self.bitmap_builder.reserve(total_len); - - let mul = T::get_byte_width(); - for array in data { - let len = array.len(); - if len == 0 { - continue; - } - let offset = array.offset(); - if array.data_type() == &DataType::Boolean { - // booleans are bit-packed, thus we iterate through the array - let array = PrimitiveArray::::from(array.clone()); - for i in 0..len { - self.values_builder.append(array.value(i))?; - } - } else { - let sliced = array.buffers()[0].data(); - // slice into data by factoring (offset and length) * byte width - self.values_builder - .write_bytes(&sliced[(offset * mul)..((len + offset) * mul)], len); - } - - for i in 0..len { - // account for offset as `ArrayData` does not - self.bitmap_builder.append(array.is_valid(i))?; - } - } - Ok(()) - } - - /// Returns the data type of the builder - /// - /// This is used for validating array data types in `append_data` - fn data_type(&self) -> DataType { - T::DATA_TYPE - } - /// Builds the array and reset this builder. fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) @@ -883,100 +763,6 @@ where self } - /// Appends data from other arrays into the builder - /// - /// This is most useful when concatenating arrays of the same type into a builder. 
- fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> { - // validate arraydata and reserve memory - let mut total_len = 0; - for array in data { - if array.data_type() != &self.data_type() { - return Err(ArrowError::InvalidArgumentError( - "Cannot append data to builder if data types are different" - .to_string(), - )); - } - if array.buffers().len() != 1 { - return Err(ArrowError::InvalidArgumentError( - "List arrays should have 1 buffer".to_string(), - )); - } - if array.child_data().len() != 1 { - return Err(ArrowError::InvalidArgumentError( - "List arrays should have 1 child_data element".to_string(), - )); - } - total_len += array.len(); - } - // reserve memory - self.offsets_builder.reserve(total_len); - self.bitmap_builder.reserve(total_len); - // values_builder is allocated by the relevant builder, and is not allocated here - - // determine the latest offset on the builder - let mut cum_offset = if self.offsets_builder.len() == 0 { - 0 - } else { - // peek into buffer to get last appended offset - let buffer = self.offsets_builder.buffer.data(); - let len = self.offsets_builder.len(); - let (start, end) = ((len - 1) * 4, len * 4); - let slice = &buffer[start..end]; - i32::from_le_bytes(slice.try_into().unwrap()) - }; - for array in data { - let len = array.len(); - if len == 0 { - continue; - } - let offset = array.offset(); - - // `typed_data` is unsafe, however this call is safe as `ListArray` has i32 offsets - let offsets = unsafe { - &array.buffers()[0].typed_data::()[offset..(len + offset) + 1] - }; - // the offsets of the child array determine its length - // this could be obtained by getting the concrete ListArray and getting value_offsets - let offset_at_len = offsets[offsets.len() - 1] as usize; - let first_offset = offsets[0] as usize; - // create the child array and offset it - let child_data = &array.child_data()[0]; - let child_array = make_array(child_data.clone()); - // slice the child array to account for offsets - let sliced = 
child_array.slice(first_offset, offset_at_len - first_offset); - self.values().append_data(&[sliced.data()])?; - let adjusted_offsets: Vec = offsets - .windows(2) - .map(|w| { - let curr_offset = w[1] - w[0] + cum_offset; - cum_offset = curr_offset; - curr_offset - }) - .collect(); - self.offsets_builder - .append_slice(adjusted_offsets.as_slice())?; - - for i in 0..len { - self.bitmap_builder.append(array.is_valid(i))?; - } - } - - // append array length - self.len += total_len; - Ok(()) - } - - /// Returns the data type of the builder - /// - /// This is used for validating array data types in `append_data` - fn data_type(&self) -> DataType { - DataType::List(Box::new(Field::new( - "item", - self.values_builder.data_type(), - true, - ))) - } - /// Returns the builder as a mutable `Any` reference. fn as_any_mut(&mut self) -> &mut Any { self @@ -1095,100 +881,6 @@ where self } - /// Appends data from other arrays into the builder - /// - /// This is most useful when concatenating arrays of the same type into a builder. 
- fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> { - // validate arraydata and reserve memory - let mut total_len = 0; - for array in data { - if array.data_type() != &self.data_type() { - return Err(ArrowError::InvalidArgumentError( - "Cannot append data to builder if data types are different" - .to_string(), - )); - } - if array.buffers().len() != 1 { - return Err(ArrowError::InvalidArgumentError( - "List arrays should have 1 buffer".to_string(), - )); - } - if array.child_data().len() != 1 { - return Err(ArrowError::InvalidArgumentError( - "List arrays should have 1 child_data element".to_string(), - )); - } - total_len += array.len(); - } - // reserve memory - self.offsets_builder.reserve(total_len); - self.bitmap_builder.reserve(total_len); - // values_builder is allocated by the relevant builder, and is not allocated here - - // determine the latest offset on the builder - let mut cum_offset = if self.offsets_builder.len() == 0 { - 0 - } else { - // peek into buffer to get last appended offset - let buffer = self.offsets_builder.buffer.data(); - let len = self.offsets_builder.len(); - let (start, end) = ((len - 1) * 8, len * 8); - let slice = &buffer[start..end]; - i64::from_le_bytes(slice.try_into().unwrap()) - }; - for array in data { - let len = array.len(); - if len == 0 { - continue; - } - let offset = array.offset(); - - // `typed_data` is unsafe, however this call is safe as `LargeListArray` has i64 offsets - let offsets = unsafe { - &array.buffers()[0].typed_data::()[offset..(len + offset) + 1] - }; - // the offsets of the child array determine its length - // this could be obtained by getting the concrete ListArray and getting value_offsets - let offset_at_len = offsets[offsets.len() - 1] as usize; - let first_offset = offsets[0] as usize; - // create the child array and offset it - let child_data = &array.child_data()[0]; - let child_array = make_array(child_data.clone()); - // slice the child array to account for offsets - let 
sliced = child_array.slice(first_offset, offset_at_len - first_offset); - self.values().append_data(&[sliced.data()])?; - let adjusted_offsets: Vec = offsets - .windows(2) - .map(|w| { - let curr_offset = w[1] - w[0] + cum_offset; - cum_offset = curr_offset; - curr_offset - }) - .collect(); - self.offsets_builder - .append_slice(adjusted_offsets.as_slice())?; - - for i in 0..len { - self.bitmap_builder.append(array.is_valid(i))?; - } - } - - // append array length - self.len += total_len; - Ok(()) - } - - /// Returns the data type of the builder - /// - /// This is used for validating array data types in `append_data` - fn data_type(&self) -> DataType { - DataType::LargeList(Box::new(Field::new( - "item", - self.values_builder.data_type(), - true, - ))) - } - /// Returns the builder as a mutable `Any` reference. fn as_any_mut(&mut self) -> &mut Any { self @@ -1309,66 +1001,6 @@ where self } - /// Appends data from other arrays into the builder - /// - /// This is most useful when concatenating arrays of the same type into a builder. 
- fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> { - // validate arraydata and reserve memory - let mut total_len = 0; - for array in data { - if array.data_type() != &self.data_type() { - return Err(ArrowError::InvalidArgumentError( - "Cannot append data to builder if data types are different" - .to_string(), - )); - } - if array.child_data().len() != 1 { - return Err(ArrowError::InvalidArgumentError( - "FixedSizeList arrays should have 1 child_data element".to_string(), - )); - } - total_len += array.len(); - } - // reserve memory - self.bitmap_builder.reserve(total_len); - - // determine the latest offset on the builder - for array in data { - let len = array.len(); - if len == 0 { - continue; - } - let offset = array.offset(); - - // the offsets of the child array determine its length - let first_offset = self.list_len as usize * offset; - let offset_at_len = first_offset + len * self.list_len as usize; - // create the child array and offset it - let child_data = &array.child_data()[0]; - let child_array = make_array(child_data.clone()); - // slice the child array to account for offsets - let sliced = child_array.slice(first_offset, offset_at_len - first_offset); - self.values().append_data(&[sliced.data()])?; - for i in 0..len { - self.bitmap_builder.append(array.is_valid(i))?; - } - } - - // append array length - self.len += total_len; - Ok(()) - } - - /// Returns the data type of the builder - /// - /// This is used for validating array data types in `append_data` - fn data_type(&self) -> DataType { - DataType::FixedSizeList( - Box::new(Field::new("item", self.values_builder.data_type(), true)), - self.list_len, - ) - } - /// Returns the builder as a mutable `Any` reference. fn as_any_mut(&mut self) -> &mut Any { self @@ -1495,20 +1127,6 @@ impl ArrayBuilder for BinaryBuilder { self } - /// Appends data from other arrays into the builder - /// - /// This is most useful when concatenating arrays of the same type into a builder. 
- fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> { - append_binary_data(&mut self.builder, &DataType::Binary, data) - } - - /// Returns the data type of the builder - /// - /// This is used for validating array data types in `append_data` - fn data_type(&self) -> DataType { - DataType::Binary - } - /// Returns the builder as a mutable `Any` reference. fn as_any_mut(&mut self) -> &mut Any { self @@ -1546,20 +1164,6 @@ impl ArrayBuilder for LargeBinaryBuilder { self } - /// Appends data from other arrays into the builder - /// - /// This is most useful when concatenating arrays of the same type into a builder. - fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> { - append_large_binary_data(&mut self.builder, &DataType::LargeBinary, data) - } - - /// Returns the data type of the builder - /// - /// This is used for validating array data types in `append_data` - fn data_type(&self) -> DataType { - DataType::LargeBinary - } - /// Returns the boxed builder as a box of `Any`. fn into_box_any(self: Box) -> Box { self @@ -1587,20 +1191,6 @@ impl ArrayBuilder for StringBuilder { self } - /// Appends data from other arrays into the builder - /// - /// This is most useful when concatenating arrays of the same type into a builder. - fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> { - append_binary_data(&mut self.builder, &DataType::Utf8, data) - } - - /// Returns the data type of the builder - /// - /// This is used for validating array data types in `append_data` - fn data_type(&self) -> DataType { - DataType::Utf8 - } - /// Returns the builder as a mutable `Any` reference. 
fn as_any_mut(&mut self) -> &mut Any { self @@ -1627,114 +1217,6 @@ impl ArrayBuilder for StringBuilder { } } -// Helper function for appending Binary and Utf8 data -fn append_binary_data( - builder: &mut ListBuilder, - data_type: &DataType, - data: &[ArrayDataRef], -) -> Result<()> { - // validate arraydata and reserve memory - for array in data { - if array.data_type() != data_type { - return Err(ArrowError::InvalidArgumentError( - "Cannot append data to builder if data types are different".to_string(), - )); - } - if array.buffers().len() != 2 { - return Err(ArrowError::InvalidArgumentError( - "Binary/String arrays should have 2 buffers".to_string(), - )); - } - } - - builder.append_data( - &data - .iter() - .map(|array| { - // convert string to List to reuse list's cast - let int_data = &array.buffers()[1]; - let int_data = Arc::new(ArrayData::new( - DataType::UInt8, - int_data.len(), - None, - None, - 0, - vec![int_data.clone()], - vec![], - )) as ArrayDataRef; - - Arc::new(ArrayData::new( - DataType::List(Box::new(Field::new("item", DataType::UInt8, true))), - array.len(), - None, - array.null_buffer().cloned(), - array.offset(), - vec![(&array.buffers()[0]).clone()], - vec![int_data], - )) - }) - .collect::>(), - )?; - - Ok(()) -} - -// Helper function for appending LargeBinary and LargeUtf8 data -fn append_large_binary_data( - builder: &mut LargeListBuilder, - data_type: &DataType, - data: &[ArrayDataRef], -) -> Result<()> { - // validate arraydata and reserve memory - for array in data { - if array.data_type() != data_type { - return Err(ArrowError::InvalidArgumentError( - "Cannot append data to builder if data types are different".to_string(), - )); - } - if array.buffers().len() != 2 { - return Err(ArrowError::InvalidArgumentError( - "Binary/String arrays should have 2 buffers".to_string(), - )); - } - } - - builder.append_data( - &data - .iter() - .map(|array| { - // convert string to List to reuse list's cast - let int_data = &array.buffers()[1]; - let 
int_data = Arc::new(ArrayData::new( - DataType::UInt8, - int_data.len(), - None, - None, - 0, - vec![int_data.clone()], - vec![], - )) as ArrayDataRef; - - Arc::new(ArrayData::new( - DataType::LargeList(Box::new(Field::new( - "item", - DataType::UInt8, - true, - ))), - array.len(), - None, - array.null_buffer().cloned(), - array.offset(), - vec![(&array.buffers()[0]).clone()], - vec![int_data], - )) - }) - .collect::>(), - )?; - - Ok(()) -} - impl ArrayBuilder for LargeStringBuilder { /// Returns the builder as a non-mutable `Any` reference. fn as_any(&self) -> &Any { @@ -1746,20 +1228,6 @@ impl ArrayBuilder for LargeStringBuilder { self } - /// Appends data from other arrays into the builder - /// - /// This is most useful when concatenating arrays of the same type into a builder. - fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> { - append_large_binary_data(&mut self.builder, &DataType::LargeUtf8, data) - } - - /// Returns the data type of the builder - /// - /// This is used for validating array data types in `append_data` - fn data_type(&self) -> DataType { - DataType::LargeUtf8 - } - /// Returns the boxed builder as a box of `Any`. fn into_box_any(self: Box) -> Box { self @@ -1787,60 +1255,6 @@ impl ArrayBuilder for FixedSizeBinaryBuilder { self } - /// Appends data from other arrays into the builder - /// - /// This is most useful when concatenating arrays of the same type into a builder. 
- fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> { - // validate arraydata and reserve memory - for array in data { - if array.data_type() != &self.data_type() { - return Err(ArrowError::InvalidArgumentError( - "Cannot append data to builder if data types are different" - .to_string(), - )); - } - if array.buffers().len() != 1 { - return Err(ArrowError::InvalidArgumentError( - "FixedSizeBinary arrays should have 1 buffer".to_string(), - )); - } - } - for array in data { - // convert string to FixedSizeList to reuse list's append - let int_data = &array.buffers()[0]; - let int_data = Arc::new(ArrayData::new( - DataType::UInt8, - int_data.len(), - None, - None, - 0, - vec![int_data.clone()], - vec![], - )) as ArrayDataRef; - let list_data = Arc::new(ArrayData::new( - DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt8, true)), - self.builder.list_len, - ), - array.len(), - None, - array.null_buffer().cloned(), - array.offset(), - vec![], - vec![int_data], - )); - self.builder.append_data(&[list_data])?; - } - Ok(()) - } - - /// Returns the data type of the builder - /// - /// This is used for validating array data types in `append_data` - fn data_type(&self) -> DataType { - DataType::FixedSizeBinary(self.builder.list_len) - } - /// Returns the builder as a mutable `Any` reference. fn as_any_mut(&mut self) -> &mut Any { self @@ -1873,60 +1287,6 @@ impl ArrayBuilder for DecimalBuilder { self } - /// Appends data from other arrays into the builder - /// - /// This is most useful when concatenating arrays of the same type into a builder. 
- fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> { - // validate arraydata and reserve memory - for array in data { - if array.data_type() != &self.data_type() { - return Err(ArrowError::InvalidArgumentError( - "Cannot append data to builder if data types are different" - .to_string(), - )); - } - if array.buffers().len() != 1 { - return Err(ArrowError::InvalidArgumentError( - "Decimal arrays should have 1 buffer".to_string(), - )); - } - } - for array in data { - // convert string to FixedSizeList to reuse list's append - let int_data = &array.buffers()[0]; - let int_data = Arc::new(ArrayData::new( - DataType::UInt8, - int_data.len(), - None, - None, - 0, - vec![int_data.clone()], - vec![], - )) as ArrayDataRef; - let list_data = Arc::new(ArrayData::new( - DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt8, true)), - self.builder.list_len, - ), - array.len(), - None, - array.null_buffer().cloned(), - array.offset(), - vec![], - vec![int_data], - )); - self.builder.append_data(&[list_data])?; - } - Ok(()) - } - - /// Returns the data type of the builder - /// - /// This is used for validating array data types in `append_data` - fn data_type(&self) -> DataType { - DataType::Decimal(self.precision, self.scale) - } - /// Returns the builder as a mutable `Any` reference. fn as_any_mut(&mut self) -> &mut Any { self @@ -2271,60 +1631,6 @@ impl ArrayBuilder for StructBuilder { self.len == 0 } - /// Appends data from other arrays into the builder - /// - /// This is most useful when concatenating arrays of the same type into a builder. 
- fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> { - // validate arraydata and reserve memory - let mut total_len = 0; - for array in data { - if array.data_type() != &self.data_type() { - return Err(ArrowError::InvalidArgumentError( - "Cannot append data to builder if data types are different" - .to_string(), - )); - } - if array.child_data().len() != self.num_fields() { - return Err(ArrowError::InvalidArgumentError( - "Struct should have the same child_data length as fields".to_string(), - )); - } - total_len += array.len(); - } - self.bitmap_builder.reserve(total_len); - - for array in data { - let len = array.len(); - if len == 0 { - continue; - } - let offset = array.offset(); - for (builder, child_data) in self - .field_builders - .iter_mut() - .zip(array.child_data().iter()) - { - // slice child_data to account for offsets - let child_array = make_array(child_data.clone()); - let sliced = child_array.slice(offset, len); - builder.append_data(&[sliced.data()])?; - } - for i in 0..len { - self.bitmap_builder.append(array.is_valid(i))?; - } - } - - self.len += total_len; - Ok(()) - } - - /// Returns the data type of the builder - /// - /// This is used for validating array data types in `append_data` - fn data_type(&self) -> DataType { - DataType::Struct(self.fields.clone()) - } - /// Builds the array. fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) @@ -2857,21 +2163,6 @@ where self.keys_builder.is_empty() } - /// Appends data from other arrays into the builder - /// - /// This is most useful when concatenating arrays of the same type into a builder. 
- fn append_data(&mut self, _data: &[ArrayDataRef]) -> Result<()> { - // TODO: This will require an implementation that doesn't just append keys - unimplemented!("Appending data for dictionary arrays not yet implemented") - } - - /// Returns the data type of the builder - /// - /// This is used for validating array data types in `append_data` - fn data_type(&self) -> DataType { - DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(V::DATA_TYPE)) - } - /// Builds the array and reset this builder. fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) @@ -3022,21 +2313,6 @@ where self.keys_builder.is_empty() } - /// Appends data from other arrays into the builder - /// - /// This is most useful when concatenating arrays of the same type into a builder. - fn append_data(&mut self, _data: &[ArrayDataRef]) -> Result<()> { - // TODO: This will require an implementation that doesn't just append keys - unimplemented!("Appending data for dictionary arrays not yet implemented") - } - - /// Returns the data type of the builder - /// - /// This is used for validating array data types in `append_data` - fn data_type(&self) -> DataType { - DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(DataType::Utf8)) - } - /// Builds the array and reset this builder. 
fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) @@ -4174,579 +3450,4 @@ mod tests { // Special error if the key overflows (256th entry) builder.append(1257).unwrap(); } - - #[test] - fn test_primitive_append() -> Result<()> { - let mut builder = Int32Builder::new(2); - builder.append_null()?; - builder.append_value(1)?; - // create an array to append - let array = Int32Array::from(vec![None, Some(3), None, None, Some(6), Some(7)]); - builder.append_data(&[ - array.data(), - array.slice(1, 4).data(), - array.slice(2, 0).data(), - ])?; - let finished = builder.finish(); - let expected = Int32Array::from(vec![ - None, - Some(1), - None, - Some(3), - None, - None, - Some(6), - Some(7), - // array.data() end - Some(3), - None, - None, - Some(6), - ]); - assert_eq!(finished.len(), expected.len()); - assert_eq!(finished.null_count(), expected.null_count()); - assert_eq!(finished, expected); - - let mut builder = Float64Builder::new(64); - builder.append_null()?; - builder.append_value(1.0)?; - // create an array to append - let array = - Float64Array::from(vec![None, Some(3.0), None, None, Some(6.0), Some(7.0)]); - builder.append_data(&[ - array.data(), - array.slice(1, 5).data(), - array.slice(2, 1).data(), - ])?; - let finished = builder.finish(); - let expected = Float64Array::from(vec![ - None, - Some(1.0), - None, - Some(3.0), - None, - None, - Some(6.0), - Some(7.0), - Some(3.0), - None, - None, - Some(6.0), - Some(7.0), - None, - ]); - assert_eq!(finished.len(), expected.len()); - assert_eq!(finished.null_count(), expected.null_count()); - assert_eq!(finished, expected); - Ok(()) - } - - #[test] - fn test_boolean_append() -> Result<()> { - let mut builder = BooleanBuilder::new(2); - builder.append_null()?; - builder.append_value(true)?; - // create an array to append - let array = BooleanArray::from(vec![ - None, - Some(true), - None, - None, - Some(false), - Some(true), - ]); - builder.append_data(&[ - array.data(), - array.slice(1, 4).data(), - 
array.slice(2, 0).data(), - ])?; - let finished = builder.finish(); - let expected = BooleanArray::from(vec![ - None, - Some(true), - None, - Some(true), - None, - None, - Some(false), - Some(true), - Some(true), - None, - None, - Some(false), - ]); - assert_eq!(finished.len(), expected.len()); - assert_eq!(finished.null_count(), expected.null_count()); - assert_eq!(finished, expected); - Ok(()) - } - - #[test] - fn test_list_append() -> Result<()> { - let int_builder = Int64Builder::new(24); - let mut builder = ListBuilder::::new(int_builder); - builder.values().append_slice(&[1, 2, 3])?; - builder.append(true)?; - builder.values().append_slice(&[4, 5])?; - builder.append(true)?; - builder.values().append_slice(&[6, 7, 8])?; - builder.values().append_slice(&[9, 10, 11])?; - builder.append(true)?; - - let a_builder = Int64Builder::new(24); - let mut a_builder = ListBuilder::::new(a_builder); - a_builder.values().append_slice(&[12, 13])?; - a_builder.append(true)?; - a_builder.append(true)?; - a_builder.values().append_slice(&[14, 15])?; - a_builder.append(true)?; - let a = a_builder.finish(); - - // append array - builder.append_data(&[a.data(), a.slice(1, 2).data()])?; - let finished = builder.finish(); - - let expected_int_array = Int64Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - Some(9), - Some(10), - Some(11), - // append first array - Some(12), - Some(13), - Some(14), - Some(15), - // append second array - Some(14), - Some(15), - ]); - let list_value_offsets = - Buffer::from(&[0, 3, 5, 11, 13, 13, 15, 15, 17].to_byte_slice()); - let expected_list_data = ArrayData::new( - DataType::List(Box::new(Field::new("item", DataType::Int64, true))), - 8, - None, - None, - 0, - vec![list_value_offsets], - vec![expected_int_array.data()], - ); - let expected_list = ListArray::from(Arc::new(expected_list_data) as ArrayDataRef); - assert_eq!( - finished.data().buffers()[0].data(), - 
expected_list.data().buffers()[0].data() - ); - assert_eq!(&expected_list.values(), &finished.values()); - assert_eq!(expected_list.len(), finished.len()); - - Ok(()) - } - - #[test] - fn test_list_nulls_append() -> Result<()> { - let int_builder = Int64Builder::new(32); - let mut builder = ListBuilder::::new(int_builder); - builder.values().append_slice(&[1, 2, 3])?; - builder.append(true)?; - builder.values().append_slice(&[4, 5])?; - builder.append(true)?; - builder.append(false)?; - builder.values().append_slice(&[6, 7, 8])?; - builder.values().append_null()?; - builder.values().append_null()?; - builder.values().append_slice(&[9, 10, 11])?; - builder.append(true)?; - - let a_builder = Int64Builder::new(32); - let mut a_builder = ListBuilder::::new(a_builder); - a_builder.values().append_slice(&[12, 13])?; - a_builder.append(true)?; - a_builder.append(false)?; - a_builder.append(true)?; - a_builder.values().append_null()?; - a_builder.values().append_null()?; - a_builder.values().append_slice(&[14, 15])?; - a_builder.append(true)?; - let a = a_builder.finish(); - - // append array - builder.append_data(&[ - a.data(), - a.slice(1, 2).data(), - a.slice(2, 2).data(), - a.slice(4, 0).data(), - ])?; - let finished = builder.finish(); - - let expected_int_array = Int64Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - None, - None, - Some(9), - Some(10), - Some(11), - // second array - Some(12), - Some(13), - None, - None, - Some(14), - Some(15), - // slice(1, 2) results in no values added - None, - None, - Some(14), - Some(15), - ]); - let list_value_offsets = Buffer::from( - &[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23].to_byte_slice(), - ); - let expected_list_data = ArrayData::new( - DataType::List(Box::new(Field::new("item", DataType::Int64, true))), - 12, - None, - None, - 0, - vec![list_value_offsets], - vec![expected_int_array.data()], - ); - let expected_list = 
ListArray::from(Arc::new(expected_list_data) as ArrayDataRef); - assert_eq!( - finished.data().buffers()[0].data(), - expected_list.data().buffers()[0].data() - ); - assert_eq!( - finished.data().child_data()[0].buffers()[0].data(), - expected_list.data().child_data()[0].buffers()[0].data() - ); - assert_eq!(&expected_list.values(), &finished.values()); - assert_eq!(expected_list.len(), finished.len()); - - Ok(()) - } - - #[test] - fn test_list_of_strings_append() -> Result<()> { - let string_builder = StringBuilder::new(32); - let mut builder = ListBuilder::::new(string_builder); - builder.values().append_value("Hello")?; - builder.values().append_value("Arrow")?; - builder.append(true)?; - builder.append(false)?; - - let string_array = StringArray::from(vec![ - Some("alpha"), - Some("beta"), - None, - Some("gamma"), - Some("delta"), - None, - ]); - let list_value_offsets = Buffer::from(&[0, 2, 3, 6].to_byte_slice()); - let list_data = ArrayData::new( - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), - 3, - None, - None, - 0, - vec![list_value_offsets], - vec![string_array.data()], - ); - let list_array = ListArray::from(Arc::new(list_data) as ArrayDataRef); - builder.append_data(&[ - list_array.data(), - list_array.slice(1, 2).data(), - list_array.slice(0, 0).data(), - ])?; - let finished = builder.finish(); - - let expected_string_array = StringArray::from(vec![ - Some("Hello"), - Some("Arrow"), - // list_array - Some("alpha"), - Some("beta"), - None, - Some("gamma"), - Some("delta"), - None, - // slice(1, 2) - None, - Some("gamma"), - Some("delta"), - None, - // slice(0, 0) returns nothing - ]); - let list_value_offsets = Buffer::from(&[0, 2, 2, 4, 5, 8, 9, 12].to_byte_slice()); - let expected_list_data = ArrayData::new( - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), - 7, - None, - None, // is this correct? 
- 0, - vec![list_value_offsets], - vec![expected_string_array.data()], - ); - let expected_list = ListArray::from(Arc::new(expected_list_data) as ArrayDataRef); - assert_eq!( - finished.data().buffers()[0].data(), - expected_list.data().buffers()[0].data() - ); - assert_eq!( - finished.data().child_data()[0].buffers()[0].data(), - expected_list.data().child_data()[0].buffers()[0].data() - ); - assert_eq!(&expected_list.values(), &finished.values()); - assert_eq!(expected_list.len(), finished.len()); - - Ok(()) - } - - #[test] - fn test_fixed_size_list_append() -> Result<()> { - let int_builder = UInt16Builder::new(64); - let mut builder = FixedSizeListBuilder::::new(int_builder, 2); - builder.values().append_slice(&[1, 2])?; - builder.append(true)?; - builder.values().append_slice(&[3, 4])?; - builder.append(false)?; - builder.values().append_slice(&[5, 6])?; - builder.append(true)?; - - let a_builder = UInt16Builder::new(64); - let mut a_builder = FixedSizeListBuilder::::new(a_builder, 2); - a_builder.values().append_slice(&[7, 8])?; - a_builder.append(true)?; - a_builder.values().append_slice(&[9, 10])?; - a_builder.append(true)?; - a_builder.values().append_slice(&[11, 12])?; - a_builder.append(false)?; - a_builder.values().append_slice(&[13, 14])?; - a_builder.append(true)?; - a_builder.values().append_null()?; - a_builder.values().append_null()?; - a_builder.append(true)?; - let a = a_builder.finish(); - - // append array - builder.append_data(&[ - a.data(), - a.slice(1, 3).data(), - a.slice(2, 1).data(), - a.slice(5, 0).data(), - ])?; - let finished = builder.finish(); - - let expected_int_array = UInt16Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - // append first array - Some(7), - Some(8), - Some(9), - Some(10), - Some(11), - Some(12), - Some(13), - Some(14), - None, - None, - // append slice(1, 3) - Some(9), - Some(10), - Some(11), - Some(12), - Some(13), - Some(14), - // append slice(2, 1) - Some(11), - Some(12), - 
]); - let expected_list_data = ArrayData::new( - DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt16, true)), - 2, - ), - 12, - None, - None, - 0, - vec![], - vec![expected_int_array.data()], - ); - let expected_list = - FixedSizeListArray::from(Arc::new(expected_list_data) as ArrayDataRef); - assert_eq!(&expected_list.values(), &finished.values()); - assert_eq!(expected_list.len(), finished.len()); - - Ok(()) - } - - #[test] - fn test_fixed_size_binary_append() -> Result<()> { - let mut builder = FixedSizeBinaryBuilder::new(64, 2); - builder.append_value(&[1, 2])?; - builder.append_value(&[3, 4])?; - builder.append_value(&[5, 6])?; - - let mut a_builder = FixedSizeBinaryBuilder::new(64, 2); - a_builder.append_value(&[7, 8])?; - a_builder.append_value(&[9, 10])?; - a_builder.append_null()?; - a_builder.append_value(&[13, 14])?; - a_builder.append_null()?; - let a = a_builder.finish(); - - // append array - builder.append_data(&[ - a.data(), - a.slice(1, 3).data(), - a.slice(2, 1).data(), - a.slice(5, 0).data(), - ])?; - let finished = builder.finish(); - - let expected_int_array = UInt8Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - // append first array - Some(7), - Some(8), - Some(9), - Some(10), - None, - None, - Some(13), - Some(14), - None, - None, - // append slice(1, 3) - Some(9), - Some(10), - None, - None, - Some(13), - Some(14), - // append slice(2, 1) - None, - None, - ]); - let expected_list_data = ArrayData::new( - DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt8, true)), - 2, - ), - 12, - None, - None, - 0, - vec![], - vec![expected_int_array.data()], - ); - let expected_list = - FixedSizeListArray::from(Arc::new(expected_list_data) as ArrayDataRef); - let expected_list = FixedSizeBinaryArray::from(expected_list); - // assert_eq!(expected_list.values(), finished.values()); - assert_eq!(expected_list.len(), finished.len()); - - Ok(()) - } - - #[test] - fn 
test_struct_append() -> Result<()> { - let int_builder = Int32Builder::new(64); - let bool_builder = BooleanBuilder::new(64); - - let field1 = Field::new("f1", DataType::Int32, false); - let field2 = Field::new("f2", DataType::Boolean, false); - let mut fields = Vec::new(); - let mut field_builders = Vec::new(); - fields.push(field1.clone()); - field_builders.push(Box::new(int_builder) as Box); - fields.push(field2.clone()); - field_builders.push(Box::new(bool_builder) as Box); - - let mut builder = StructBuilder::new(fields, field_builders); - builder - .field_builder::(0) - .unwrap() - .append_slice(&[0, 1, 2, 3, 4])?; - builder - .field_builder::(1) - .unwrap() - .append_slice(&[false, true, false, true, false])?; - - // Append slot values - all are valid. - for _ in 0..5 { - assert!(builder.append(true).is_ok()) - } - - let arr = builder.finish(); - - assert_eq!(5, arr.len()); - assert_eq!(0, builder.len()); - - builder - .field_builder::(0) - .unwrap() - .append_slice(&[1, 3, 5, 7, 9]) - .unwrap(); - builder - .field_builder::(1) - .unwrap() - .append_slice(&[true, true, true, false, true]) - .unwrap(); - - // Append slot values - all are valid. 
- for _ in 0..5 { - assert!(builder.append(true).is_ok()) - } - - assert_eq!(5, builder.len()); - - // append array to builder - builder.append_data(&[ - arr.data(), - arr.slice(1, 4).data(), - arr.slice(4, 0).data(), - ])?; - // finish builder - let arr2 = builder.finish(); - - let f1 = Arc::new(Int32Array::from(vec![ - 1, 3, 5, 7, 9, 0, 1, 2, 3, 4, 1, 2, 3, 4, - ])) as ArrayRef; - let f2 = Arc::new(BooleanArray::from(vec![ - true, true, true, false, true, false, true, false, true, false, true, false, - true, false, - ])) as ArrayRef; - let expected = StructArray::from(vec![(field1, f1), (field2, f2)]); - assert_eq!(arr2.data().child_data()[0], expected.data().child_data()[0]); - assert_eq!(arr2.data().child_data()[1], expected.data().child_data()[1]); - assert_eq!(arr2, expected); - - Ok(()) - } } diff --git a/rust/arrow/src/array/transform/fixed_binary.rs b/rust/arrow/src/array/transform/fixed_binary.rs index d287c0f7364..84cef62ef95 100644 --- a/rust/arrow/src/array/transform/fixed_binary.rs +++ b/rust/arrow/src/array/transform/fixed_binary.rs @@ -43,7 +43,7 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend { (start..start + len).for_each(|i| { if array.is_valid(i) { // append value - let bytes = &values[start * size..(start + len) * size]; + let bytes = &values[i * size..(i + 1) * size]; values_buffer.extend_from_slice(bytes); } else { values_buffer.extend(size); diff --git a/rust/arrow/src/array/transform/mod.rs b/rust/arrow/src/array/transform/mod.rs index 3930d241dcd..3fe76d63f54 100644 --- a/rust/arrow/src/array/transform/mod.rs +++ b/rust/arrow/src/array/transform/mod.rs @@ -486,13 +486,21 @@ mod tests { use super::*; - use crate::array::{ - Array, ArrayDataRef, ArrayRef, BooleanArray, DictionaryArray, - FixedSizeBinaryArray, Int16Array, Int16Type, Int32Array, Int64Builder, - ListBuilder, PrimitiveBuilder, StringArray, StringDictionaryBuilder, StructArray, - UInt8Array, + use crate::{ + array::{ + Array, ArrayDataRef, ArrayRef, BooleanArray, 
DictionaryArray, + FixedSizeBinaryArray, Int16Array, Int16Type, Int32Array, Int64Array, + Int64Builder, ListBuilder, PrimitiveBuilder, StringArray, + StringDictionaryBuilder, StructArray, UInt8Array, + }, + buffer::Buffer, + datatypes::Field, + }; + use crate::{ + array::{ListArray, StringBuilder}, + datatypes::ToByteSlice, + error::Result, }; - use crate::{array::ListArray, error::Result}; /// tests extending from a primitive array w/ offset nor nulls #[test] @@ -867,4 +875,361 @@ mod tests { let expected = FixedSizeBinaryArray::from(vec![vec![0, 2], vec![0, 1]]); assert_eq!(result, expected); } + + #[test] + fn test_list_append() -> Result<()> { + let mut builder = ListBuilder::::new(Int64Builder::new(24)); + builder.values().append_slice(&[1, 2, 3])?; + builder.append(true)?; + builder.values().append_slice(&[4, 5])?; + builder.append(true)?; + builder.values().append_slice(&[6, 7, 8])?; + builder.values().append_slice(&[9, 10, 11])?; + builder.append(true)?; + let a = builder.finish().data(); + + let a_builder = Int64Builder::new(24); + let mut a_builder = ListBuilder::::new(a_builder); + a_builder.values().append_slice(&[12, 13])?; + a_builder.append(true)?; + a_builder.append(true)?; + a_builder.values().append_slice(&[14, 15])?; + a_builder.append(true)?; + let b = a_builder.finish(); + + let b = b.data(); + let c = b.slice(1, 2); + + let mut mutable = + MutableArrayData::new(vec![a.as_ref(), b.as_ref(), &c], false, 1); + mutable.extend(0, 0, a.len()); + mutable.extend(1, 0, b.len()); + mutable.extend(2, 0, c.len()); + + let finished = mutable.freeze(); + + let expected_int_array = Int64Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + Some(9), + Some(10), + Some(11), + // append first array + Some(12), + Some(13), + Some(14), + Some(15), + // append second array + Some(14), + Some(15), + ]); + let list_value_offsets = + Buffer::from(&[0i32, 3, 5, 11, 13, 13, 15, 15, 17].to_byte_slice()); + let 
expected_list_data = ArrayData::new( + DataType::List(Box::new(Field::new("item", DataType::Int64, true))), + 8, + None, + None, + 0, + vec![list_value_offsets], + vec![expected_int_array.data()], + ); + assert_eq!(finished, expected_list_data); + + Ok(()) + } + + #[test] + fn test_list_nulls_append() -> Result<()> { + let mut builder = ListBuilder::::new(Int64Builder::new(32)); + builder.values().append_slice(&[1, 2, 3])?; + builder.append(true)?; + builder.values().append_slice(&[4, 5])?; + builder.append(true)?; + builder.append(false)?; + builder.values().append_slice(&[6, 7, 8])?; + builder.values().append_null()?; + builder.values().append_null()?; + builder.values().append_slice(&[9, 10, 11])?; + builder.append(true)?; + let a = builder.finish(); + let a = a.data(); + + let mut builder = ListBuilder::::new(Int64Builder::new(32)); + builder.values().append_slice(&[12, 13])?; + builder.append(true)?; + builder.append(false)?; + builder.append(true)?; + builder.values().append_null()?; + builder.values().append_null()?; + builder.values().append_slice(&[14, 15])?; + builder.append(true)?; + let b = builder.finish(); + let b = b.data(); + let c = b.slice(1, 2); + let d = b.slice(2, 2); + + let mut mutable = + MutableArrayData::new(vec![a.as_ref(), b.as_ref(), &c, &d], false, 10); + + mutable.extend(0, 0, a.len()); + mutable.extend(1, 0, b.len()); + mutable.extend(2, 0, c.len()); + mutable.extend(3, 0, d.len()); + let result = mutable.freeze(); + + let expected_int_array = Int64Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + None, + None, + Some(9), + Some(10), + Some(11), + // second array + Some(12), + Some(13), + None, + None, + Some(14), + Some(15), + // slice(1, 2) results in no values added + None, + None, + Some(14), + Some(15), + ]); + let list_value_offsets = Buffer::from( + &[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23].to_byte_slice(), + ); + let expected_list_data = ArrayData::new( + 
DataType::List(Box::new(Field::new("item", DataType::Int64, true))), + 12, + None, + Some(Buffer::from(&[0b11011011, 0b1110])), + 0, + vec![list_value_offsets], + vec![expected_int_array.data()], + ); + assert_eq!(result, expected_list_data); + + Ok(()) + } + + #[test] + fn test_list_of_strings_append() -> Result<()> { + // [["alpha", "beta", None]] + let mut builder = ListBuilder::new(StringBuilder::new(32)); + builder.values().append_value("Hello")?; + builder.values().append_value("Arrow")?; + builder.values().append_null()?; + builder.append(true)?; + let a = builder.finish().data(); + + // [["alpha", "beta"], [None], ["gamma", "delta", None]] + let mut builder = ListBuilder::new(StringBuilder::new(32)); + builder.values().append_value("alpha")?; + builder.values().append_value("beta")?; + builder.append(true)?; + builder.values().append_null()?; + builder.append(true)?; + builder.values().append_value("gamma")?; + builder.values().append_value("delta")?; + builder.values().append_null()?; + builder.append(true)?; + let b = builder.finish().data(); + + let mut mutable = MutableArrayData::new(vec![a.as_ref(), b.as_ref()], false, 10); + + mutable.extend(0, 0, a.len()); + mutable.extend(1, 0, b.len()); + mutable.extend(1, 1, 3); + mutable.extend(1, 0, 0); + let result = mutable.freeze(); + + let expected_string_array = StringArray::from(vec![ + // extend a[0..a.len()] + // a[0] + Some("Hello"), + Some("Arrow"), + None, + // extend b[0..b.len()] + // b[0] + Some("alpha"), + Some("beta"), + // b[1] + None, + // b[2] + Some("gamma"), + Some("delta"), + None, + // extend b[1..3] + // b[1] + None, + // b[2] + Some("gamma"), + Some("delta"), + None, + // extend b[0..0] + ]); + let list_value_offsets = Buffer::from(&[0, 3, 5, 6, 9, 10, 13].to_byte_slice()); + let expected_list_data = ArrayData::new( + DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + 6, + None, + None, + 0, + vec![list_value_offsets], + vec![expected_string_array.data()], + ); + 
assert_eq!(result, expected_list_data); + Ok(()) + } + + #[test] + fn test_fixed_size_binary_append() -> Result<()> { + let a = vec![Some(vec![1, 2]), Some(vec![3, 4]), Some(vec![5, 6])]; + let a = FixedSizeBinaryArray::from(a).data(); + + let b = vec![ + Some(vec![7, 8]), + Some(vec![9, 10]), + None, + Some(vec![13, 14]), + None, + ]; + let b = FixedSizeBinaryArray::from(b).data(); + + let mut mutable = MutableArrayData::new(vec![a.as_ref(), b.as_ref()], false, 10); + + mutable.extend(0, 0, a.len()); + mutable.extend(1, 0, b.len()); + mutable.extend(1, 1, 4); + mutable.extend(1, 2, 3); + mutable.extend(1, 5, 5); + let result = mutable.freeze(); + + let expected = vec![ + // a + Some(vec![1, 2]), + Some(vec![3, 4]), + Some(vec![5, 6]), + // b + Some(vec![7, 8]), + Some(vec![9, 10]), + None, + Some(vec![13, 14]), + None, + // b[1..4] + Some(vec![9, 10]), + None, + Some(vec![13, 14]), + // b[2..3] + None, + // b[4..4] + ]; + let expected = FixedSizeBinaryArray::from(expected).data(); + assert_eq!(&result, expected.as_ref()); + Ok(()) + } + + /* + // this is an old test used on a meanwhile removed dead code + // that is still useful when `MutableArrayData` supports fixed-size lists. 
+ #[test] + fn test_fixed_size_list_append() -> Result<()> { + let int_builder = UInt16Builder::new(64); + let mut builder = FixedSizeListBuilder::::new(int_builder, 2); + builder.values().append_slice(&[1, 2])?; + builder.append(true)?; + builder.values().append_slice(&[3, 4])?; + builder.append(false)?; + builder.values().append_slice(&[5, 6])?; + builder.append(true)?; + + let a_builder = UInt16Builder::new(64); + let mut a_builder = FixedSizeListBuilder::::new(a_builder, 2); + a_builder.values().append_slice(&[7, 8])?; + a_builder.append(true)?; + a_builder.values().append_slice(&[9, 10])?; + a_builder.append(true)?; + a_builder.values().append_slice(&[11, 12])?; + a_builder.append(false)?; + a_builder.values().append_slice(&[13, 14])?; + a_builder.append(true)?; + a_builder.values().append_null()?; + a_builder.values().append_null()?; + a_builder.append(true)?; + let a = a_builder.finish(); + + // append array + builder.append_data(&[ + a.data(), + a.slice(1, 3).data(), + a.slice(2, 1).data(), + a.slice(5, 0).data(), + ])?; + let finished = builder.finish(); + + let expected_int_array = UInt16Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + // append first array + Some(7), + Some(8), + Some(9), + Some(10), + Some(11), + Some(12), + Some(13), + Some(14), + None, + None, + // append slice(1, 3) + Some(9), + Some(10), + Some(11), + Some(12), + Some(13), + Some(14), + // append slice(2, 1) + Some(11), + Some(12), + ]); + let expected_list_data = ArrayData::new( + DataType::FixedSizeList( + Box::new(Field::new("item", DataType::UInt16, true)), + 2, + ), + 12, + None, + None, + 0, + vec![], + vec![expected_int_array.data()], + ); + let expected_list = + FixedSizeListArray::from(Arc::new(expected_list_data) as ArrayDataRef); + assert_eq!(&expected_list.values(), &finished.values()); + assert_eq!(expected_list.len(), finished.len()); + + Ok(()) + } + */ } diff --git a/rust/arrow/src/compute/kernels/concat.rs 
b/rust/arrow/src/compute/kernels/concat.rs index e36ccfbc168..08240424e82 100644 --- a/rust/arrow/src/compute/kernels/concat.rs +++ b/rust/arrow/src/compute/kernels/concat.rs @@ -20,154 +20,60 @@ //! Example: //! //! ``` -//! use std::sync::Arc; //! use arrow::array::{ArrayRef, StringArray}; //! use arrow::compute::concat; //! -//! let arr = concat(&vec![ -//! Arc::new(StringArray::from(vec!["hello", "world"])) as ArrayRef, -//! Arc::new(StringArray::from(vec!["!"])) as ArrayRef, +//! let arr = concat(&[ +//! &StringArray::from(vec!["hello", "world"]), +//! &StringArray::from(vec!["!"]), //! ]).unwrap(); //! assert_eq!(arr.len(), 3); //! ``` +use std::sync::Arc; + use crate::array::*; -use crate::datatypes::*; use crate::error::{ArrowError, Result}; -use TimeUnit::*; - -/// Concatenate multiple `ArrayRef` with the same type. -/// -/// Returns a new ArrayRef. -pub fn concat(array_list: &[ArrayRef]) -> Result { - if array_list.is_empty() { +/// Concatenate multiple [Array] of the same type into a single [ArrayRef]. 
+pub fn concat(arrays: &[&Array]) -> Result { + if arrays.is_empty() { return Err(ArrowError::ComputeError( "concat requires input of at least one array".to_string(), )); } - let array_data_list = &array_list + + if arrays .iter() - .map(|a| a.data_ref().clone()) - .collect::>(); - - match array_data_list[0].data_type() { - DataType::Utf8 => { - let mut builder = StringBuilder::new(0); - builder.append_data(array_data_list)?; - Ok(ArrayBuilder::finish(&mut builder)) - } - DataType::Boolean => { - let mut builder = BooleanArray::builder(0); - builder.append_data(array_data_list)?; - Ok(ArrayBuilder::finish(&mut builder)) - } - DataType::Int8 => concat_primitive::(array_data_list), - DataType::Int16 => concat_primitive::(array_data_list), - DataType::Int32 => concat_primitive::(array_data_list), - DataType::Int64 => concat_primitive::(array_data_list), - DataType::UInt8 => concat_primitive::(array_data_list), - DataType::UInt16 => concat_primitive::(array_data_list), - DataType::UInt32 => concat_primitive::(array_data_list), - DataType::UInt64 => concat_primitive::(array_data_list), - DataType::Float32 => concat_primitive::(array_data_list), - DataType::Float64 => concat_primitive::(array_data_list), - DataType::Date32(_) => concat_primitive::(array_data_list), - DataType::Date64(_) => concat_primitive::(array_data_list), - DataType::Time32(Second) => concat_primitive::(array_data_list), - DataType::Time32(Millisecond) => { - concat_primitive::(array_data_list) - } - DataType::Time64(Microsecond) => { - concat_primitive::(array_data_list) - } - DataType::Time64(Nanosecond) => { - concat_primitive::(array_data_list) - } - DataType::Timestamp(Second, _) => { - concat_primitive::(array_data_list) - } - DataType::Timestamp(Millisecond, _) => { - concat_primitive::(array_data_list) - } - DataType::Timestamp(Microsecond, _) => { - concat_primitive::(array_data_list) - } - DataType::Timestamp(Nanosecond, _) => { - concat_primitive::(array_data_list) - } - 
DataType::Interval(IntervalUnit::YearMonth) => { - concat_primitive::(array_data_list) - } - DataType::Interval(IntervalUnit::DayTime) => { - concat_primitive::(array_data_list) - } - DataType::Duration(TimeUnit::Second) => { - concat_primitive::(array_data_list) - } - DataType::Duration(TimeUnit::Millisecond) => { - concat_primitive::(array_data_list) - } - DataType::Duration(TimeUnit::Microsecond) => { - concat_primitive::(array_data_list) - } - DataType::Duration(TimeUnit::Nanosecond) => { - concat_primitive::(array_data_list) - } - DataType::List(nested_field) => { - concat_list(array_data_list, nested_field.data_type()) - } - t => Err(ArrowError::ComputeError(format!( - "Concat not supported for data type {:?}", - t - ))), + .any(|array| array.data_type() != arrays[0].data_type()) + { + return Err(ArrowError::InvalidArgumentError( + "It is not possible to concatenate arrays of different data types." + .to_string(), + )); } -} -#[inline] -fn concat_primitive(array_data_list: &[ArrayDataRef]) -> Result -where - T: ArrowNumericType, -{ - let mut builder = PrimitiveArray::::builder(0); - builder.append_data(array_data_list)?; - Ok(ArrayBuilder::finish(&mut builder)) -} + let lengths = arrays.iter().map(|array| array.len()).collect::>(); + let capacity = lengths.iter().sum(); -#[inline] -fn concat_primitive_list(array_data_list: &[ArrayDataRef]) -> Result -where - T: ArrowNumericType, -{ - let mut builder = ListBuilder::new(PrimitiveArray::::builder(0)); - builder.append_data(array_data_list)?; - Ok(ArrayBuilder::finish(&mut builder)) -} + let arrays = arrays + .iter() + .map(|a| a.data_ref().as_ref()) + .collect::>(); -#[inline] -fn concat_list( - array_data_list: &[ArrayDataRef], - data_type: &DataType, -) -> Result { - match data_type { - DataType::Int8 => concat_primitive_list::(array_data_list), - DataType::Int16 => concat_primitive_list::(array_data_list), - DataType::Int32 => concat_primitive_list::(array_data_list), - DataType::Int64 => 
concat_primitive_list::(array_data_list), - DataType::UInt8 => concat_primitive_list::(array_data_list), - DataType::UInt16 => concat_primitive_list::(array_data_list), - DataType::UInt32 => concat_primitive_list::(array_data_list), - DataType::UInt64 => concat_primitive_list::(array_data_list), - t => Err(ArrowError::ComputeError(format!( - "Concat not supported for list with data type {:?}", - t - ))), + let mut mutable = MutableArrayData::new(arrays, false, capacity); + + for (i, len) in lengths.iter().enumerate() { + mutable.extend(i, 0, *len) } + + Ok(make_array(Arc::new(mutable.freeze()))) } #[cfg(test)] mod tests { use super::*; + use crate::datatypes::*; use std::sync::Arc; #[test] @@ -180,16 +86,8 @@ mod tests { #[test] fn test_concat_incompatible_datatypes() -> Result<()> { let re = concat(&[ - Arc::new(PrimitiveArray::::from(vec![ - Some(-1), - Some(2), - None, - ])) as ArrayRef, - Arc::new(StringArray::from(vec![ - Some("hello"), - Some("bar"), - Some("world"), - ])) as ArrayRef, + &PrimitiveArray::::from(vec![Some(-1), Some(2), None]), + &StringArray::from(vec![Some("hello"), Some("bar"), Some("world")]), ]); assert!(re.is_err()); Ok(()) @@ -198,14 +96,9 @@ mod tests { #[test] fn test_concat_string_arrays() -> Result<()> { let arr = concat(&[ - Arc::new(StringArray::from(vec![Some("hello"), Some("world")])) as ArrayRef, - Arc::new(StringArray::from(vec!["1", "2", "3", "4", "6"])).slice(1, 3), - Arc::new(StringArray::from(vec![ - Some("foo"), - Some("bar"), - None, - Some("baz"), - ])) as ArrayRef, + &StringArray::from(vec!["hello", "world"]), + &StringArray::from(vec!["2", "3", "4"]), + &StringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]), ])?; let expected_output = Arc::new(StringArray::from(vec![ @@ -228,24 +121,20 @@ mod tests { #[test] fn test_concat_primitive_arrays() -> Result<()> { let arr = concat(&[ - Arc::new(PrimitiveArray::::from(vec![ + &PrimitiveArray::::from(vec![ Some(-1), Some(-1), Some(2), None, None, - ])) as 
ArrayRef, - Arc::new(PrimitiveArray::::from(vec![ + ]), + &PrimitiveArray::::from(vec![ Some(101), Some(102), Some(103), None, - ])) as ArrayRef, - Arc::new(PrimitiveArray::::from(vec![ - Some(256), - Some(512), - Some(1024), - ])) as ArrayRef, + ]), + &PrimitiveArray::::from(vec![Some(256), Some(512), Some(1024)]), ])?; let expected_output = Arc::new(PrimitiveArray::::from(vec![ @@ -271,20 +160,15 @@ mod tests { #[test] fn test_concat_boolean_primitive_arrays() -> Result<()> { let arr = concat(&[ - Arc::new(BooleanArray::from(vec![ + &BooleanArray::from(vec![ Some(true), Some(true), Some(false), None, None, Some(false), - ])) as ArrayRef, - Arc::new(BooleanArray::from(vec![ - None, - Some(false), - Some(true), - Some(false), - ])) as ArrayRef, + ]), + &BooleanArray::from(vec![None, Some(false), Some(true), Some(false)]), ])?; let expected_output = Arc::new(BooleanArray::from(vec![ @@ -359,9 +243,9 @@ mod tests { populate_list3(&mut builder_expected)?; let array_result = concat(&[ - Arc::new(builder_in1.finish()), - Arc::new(builder_in2.finish()), - Arc::new(builder_in3.finish()), + &builder_in1.finish(), + &builder_in2.finish(), + &builder_in3.finish(), ])?; let array_expected = Arc::new(builder_expected.finish()) as ArrayRef; diff --git a/rust/arrow/src/ffi.rs b/rust/arrow/src/ffi.rs index 22bf2a385b5..9464edeb9a7 100644 --- a/rust/arrow/src/ffi.rs +++ b/rust/arrow/src/ffi.rs @@ -637,7 +637,7 @@ mod tests { let array = make_array(data); // perform some operation - let array = kernels::concat::concat(&[array.clone(), array]).unwrap(); + let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).unwrap(); let array = array.as_any().downcast_ref::().unwrap(); // verify diff --git a/rust/datafusion/src/physical_plan/hash_aggregate.rs b/rust/datafusion/src/physical_plan/hash_aggregate.rs index 3916e614b93..4d16d98c847 100644 --- a/rust/datafusion/src/physical_plan/hash_aggregate.rs +++ b/rust/datafusion/src/physical_plan/hash_aggregate.rs @@ -633,7 +633,10 
@@ impl RecordBatchStream for HashAggregateStream { fn concatenate(arrays: Vec>) -> ArrowResult> { (0..arrays[0].len()) .map(|column| { - let array_list = arrays.iter().map(|a| a[column].clone()).collect::>(); + let array_list = arrays + .iter() + .map(|a| a[column].as_ref()) + .collect::>(); compute::concat(&array_list) }) .collect::>>() diff --git a/rust/datafusion/src/physical_plan/sort.rs b/rust/datafusion/src/physical_plan/sort.rs index 267fb7d6d6a..92de48eea74 100644 --- a/rust/datafusion/src/physical_plan/sort.rs +++ b/rust/datafusion/src/physical_plan/sort.rs @@ -143,8 +143,8 @@ fn sort_batches( concat( &batches .iter() - .map(|batch| batch.columns()[i].clone()) - .collect::>(), + .map(|batch| batch.column(i).as_ref()) + .collect::>(), ) }) .collect::>>()?,