diff --git a/rust/arrow/src/array/array_list.rs b/rust/arrow/src/array/array_list.rs index 7a80794f150..84e65e946d0 100644 --- a/rust/arrow/src/array/array_list.rs +++ b/rust/arrow/src/array/array_list.rs @@ -19,15 +19,19 @@ use std::any::Any; use std::convert::From; use std::fmt; use std::mem; +use std::sync::Arc; use num::Num; use super::{ array::print_long_array, make_array, raw_pointer::RawPtrBox, Array, ArrayDataRef, - ArrayRef, + ArrayRef, BinaryBuilder, BooleanBuilder, FixedSizeListBuilder, PrimitiveBuilder, + StringBuilder, }; +use crate::array::builder::GenericListBuilder; use crate::datatypes::ArrowNativeType; -use crate::datatypes::DataType; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; /// trait declaring an offset size, relevant for i32 vs i64 array types. pub trait OffsetSizeTrait: ArrowNativeType + Num + Ord + std::ops::AddAssign { @@ -238,13 +242,16 @@ impl From for FixedSizeListArray { let values = make_array(data.child_data()[0].clone()); let length = match data.data_type() { DataType::FixedSizeList(_, len) => { - // check that child data is multiple of length - assert_eq!( - values.len() % *len as usize, - 0, - "FixedSizeListArray child array length should be a multiple of {}", - len - ); + if *len > 0 { + // check that child data is multiple of length + assert_eq!( + values.len() % *len as usize, + 0, + "FixedSizeListArray child array length should be a multiple of {}", + len + ); + } + *len } _ => { @@ -295,10 +302,268 @@ impl fmt::Debug for FixedSizeListArray { } } +macro_rules! build_empty_list_array_with_primitive_items { + ($item_type:ident, $offset_type:ident) => {{ + let values_builder = PrimitiveBuilder::<$item_type>::new(0); + let mut builder = + GenericListBuilder::<$offset_type, PrimitiveBuilder<$item_type>>::new( + values_builder, + ); + let empty_list_array = builder.finish(); + Ok(Arc::new(empty_list_array)) + }}; +} + +macro_rules! build_empty_list_array_with_non_primitive_items { + ($type_builder:ident, $offset_type:ident) => {{ + let values_builder = $type_builder::new(0); + let mut builder = + GenericListBuilder::<$offset_type, $type_builder>::new(values_builder); + let empty_list_array = builder.finish(); + Ok(Arc::new(empty_list_array)) + }}; +} + +pub fn build_empty_list_array( + item_type: DataType, +) -> Result { + match item_type { + DataType::UInt8 => { + build_empty_list_array_with_primitive_items!(UInt8Type, OffsetSize) + } + DataType::UInt16 => { + build_empty_list_array_with_primitive_items!(UInt16Type, OffsetSize) + } + DataType::UInt32 => { + build_empty_list_array_with_primitive_items!(UInt32Type, OffsetSize) + } + DataType::UInt64 => { + build_empty_list_array_with_primitive_items!(UInt64Type, OffsetSize) + } + DataType::Int8 => { + build_empty_list_array_with_primitive_items!(Int8Type, OffsetSize) + } + DataType::Int16 => { + build_empty_list_array_with_primitive_items!(Int16Type, OffsetSize) + } + DataType::Int32 => { + build_empty_list_array_with_primitive_items!(Int32Type, OffsetSize) + } + DataType::Int64 => { + build_empty_list_array_with_primitive_items!(Int64Type, OffsetSize) + } + DataType::Float32 => { + build_empty_list_array_with_primitive_items!(Float32Type, OffsetSize) + } + DataType::Float64 => { + build_empty_list_array_with_primitive_items!(Float64Type, OffsetSize) + } + DataType::Boolean => { + build_empty_list_array_with_non_primitive_items!(BooleanBuilder, OffsetSize) + } + DataType::Date32(_) => { + build_empty_list_array_with_primitive_items!(Date32Type, OffsetSize) + } + DataType::Date64(_) => { + build_empty_list_array_with_primitive_items!(Date64Type, OffsetSize) + } + DataType::Time32(TimeUnit::Second) => { + build_empty_list_array_with_primitive_items!(Time32SecondType, OffsetSize) + } + DataType::Time32(TimeUnit::Millisecond) => { + build_empty_list_array_with_primitive_items!( + Time32MillisecondType, + OffsetSize + ) + } + DataType::Time64(TimeUnit::Microsecond) => { + build_empty_list_array_with_primitive_items!( + Time64MicrosecondType, + OffsetSize + ) + } + DataType::Time64(TimeUnit::Nanosecond) => { + build_empty_list_array_with_primitive_items!(Time64NanosecondType, OffsetSize) + } + DataType::Duration(TimeUnit::Second) => { + build_empty_list_array_with_primitive_items!(DurationSecondType, OffsetSize) + } + DataType::Duration(TimeUnit::Millisecond) => { + build_empty_list_array_with_primitive_items!( + DurationMillisecondType, + OffsetSize + ) + } + DataType::Duration(TimeUnit::Microsecond) => { + build_empty_list_array_with_primitive_items!( + DurationMicrosecondType, + OffsetSize + ) + } + DataType::Duration(TimeUnit::Nanosecond) => { + build_empty_list_array_with_primitive_items!( + DurationNanosecondType, + OffsetSize + ) + } + DataType::Timestamp(TimeUnit::Second, _) => { + build_empty_list_array_with_primitive_items!(TimestampSecondType, OffsetSize) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + build_empty_list_array_with_primitive_items!( + TimestampMillisecondType, + OffsetSize + ) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + build_empty_list_array_with_primitive_items!( + TimestampMicrosecondType, + OffsetSize + ) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + build_empty_list_array_with_primitive_items!( + TimestampNanosecondType, + OffsetSize + ) + } + DataType::Utf8 => { + build_empty_list_array_with_non_primitive_items!(StringBuilder, OffsetSize) + } + DataType::Binary => { + build_empty_list_array_with_non_primitive_items!(BinaryBuilder, OffsetSize) + } + _ => Err(ArrowError::NotYetImplemented(format!( + "GenericListBuilder of type List({:?}) is not supported", + item_type + ))), + } +} + +macro_rules! build_empty_fixed_size_list_array_with_primitive_items { + ($item_type:ident) => {{ + let values_builder = PrimitiveBuilder::<$item_type>::new(0); + let mut builder = FixedSizeListBuilder::new(values_builder, 0); + let empty_list_array = builder.finish(); + Ok(Arc::new(empty_list_array)) + }}; +} + +macro_rules! build_empty_fixed_size_list_array_with_non_primitive_items { + ($type_builder:ident) => {{ + let values_builder = $type_builder::new(0); + let mut builder = FixedSizeListBuilder::new(values_builder, 0); + let empty_list_array = builder.finish(); + Ok(Arc::new(empty_list_array)) + }}; +} + +pub fn build_empty_fixed_size_list_array(item_type: DataType) -> Result { + match item_type { + DataType::UInt8 => { + build_empty_fixed_size_list_array_with_primitive_items!(UInt8Type) + } + DataType::UInt16 => { + build_empty_fixed_size_list_array_with_primitive_items!(UInt16Type) + } + DataType::UInt32 => { + build_empty_fixed_size_list_array_with_primitive_items!(UInt32Type) + } + DataType::UInt64 => { + build_empty_fixed_size_list_array_with_primitive_items!(UInt64Type) + } + DataType::Int8 => { + build_empty_fixed_size_list_array_with_primitive_items!(Int8Type) + } + DataType::Int16 => { + build_empty_fixed_size_list_array_with_primitive_items!(Int16Type) + } + DataType::Int32 => { + build_empty_fixed_size_list_array_with_primitive_items!(Int32Type) + } + DataType::Int64 => { + build_empty_fixed_size_list_array_with_primitive_items!(Int64Type) + } + DataType::Float32 => { + build_empty_fixed_size_list_array_with_primitive_items!(Float32Type) + } + DataType::Float64 => { + build_empty_fixed_size_list_array_with_primitive_items!(Float64Type) + } + DataType::Boolean => { + build_empty_fixed_size_list_array_with_non_primitive_items!(BooleanBuilder) + } + DataType::Date32(_) => { + build_empty_fixed_size_list_array_with_primitive_items!(Date32Type) + } + DataType::Date64(_) => { + build_empty_fixed_size_list_array_with_primitive_items!(Date64Type) + } + DataType::Time32(TimeUnit::Second) => { + build_empty_fixed_size_list_array_with_primitive_items!(Time32SecondType) + } + DataType::Time32(TimeUnit::Millisecond) => { + build_empty_fixed_size_list_array_with_primitive_items!(Time32MillisecondType) + } + DataType::Time64(TimeUnit::Microsecond) => { + build_empty_fixed_size_list_array_with_primitive_items!(Time64MicrosecondType) + } + DataType::Time64(TimeUnit::Nanosecond) => { + build_empty_fixed_size_list_array_with_primitive_items!(Time64NanosecondType) + } + DataType::Duration(TimeUnit::Second) => { + build_empty_fixed_size_list_array_with_primitive_items!(DurationSecondType) + } + DataType::Duration(TimeUnit::Millisecond) => { + build_empty_fixed_size_list_array_with_primitive_items!( + DurationMillisecondType + ) + } + DataType::Duration(TimeUnit::Microsecond) => { + build_empty_fixed_size_list_array_with_primitive_items!( + DurationMicrosecondType + ) + } + DataType::Duration(TimeUnit::Nanosecond) => { + build_empty_fixed_size_list_array_with_primitive_items!( + DurationNanosecondType + ) + } + DataType::Timestamp(TimeUnit::Second, _) => { + build_empty_fixed_size_list_array_with_primitive_items!(TimestampSecondType) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + build_empty_fixed_size_list_array_with_primitive_items!( + TimestampMillisecondType + ) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + build_empty_fixed_size_list_array_with_primitive_items!( + TimestampMicrosecondType + ) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + build_empty_fixed_size_list_array_with_primitive_items!( + TimestampNanosecondType + ) + } + DataType::Utf8 => { + build_empty_fixed_size_list_array_with_non_primitive_items!(StringBuilder) + } + DataType::Binary => { + build_empty_fixed_size_list_array_with_non_primitive_items!(BinaryBuilder) + } + _ => Err(ArrowError::NotYetImplemented(format!( + "FixedSizeListBuilder of type FixedSizeList({:?}) is not supported", + item_type + ))), + } +} + #[cfg(test)] mod tests { use crate::{ - array::ArrayData, array::Int32Array, buffer::Buffer, datatypes::Field, + array::ArrayData, array::Int32Array, buffer::Buffer, datatypes::Field, memory, util::bit_util, }; @@ -770,4 +1035,68 @@ mod tests { .build(); ListArray::from(list_data); } + + #[test] + #[should_panic(expected = "memory is not aligned")] + fn test_primitive_array_alignment() { + let ptr = memory::allocate_aligned(8); + let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; + let buf2 = buf.slice(1); + let array_data = ArrayData::builder(DataType::Int32).add_buffer(buf2).build(); + Int32Array::from(array_data); + } + + #[test] + #[should_panic(expected = "memory is not aligned")] + fn test_list_array_alignment() { + let ptr = memory::allocate_aligned(8); + let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; + let buf2 = buf.slice(1); + + let values: [i32; 8] = [0; 8]; + let value_data = ArrayData::builder(DataType::Int32) + .add_buffer(Buffer::from(values.to_byte_slice())) + .build(); + + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .add_buffer(buf2) + .add_child_data(value_data) + .build(); + ListArray::from(list_data); + } + + macro_rules! make_test_build_empty_list_array { + ($OFFSET:ident) => { + build_empty_list_array::<$OFFSET>(DataType::Boolean).unwrap(); + build_empty_list_array::<$OFFSET>(DataType::Int16).unwrap(); + build_empty_list_array::<$OFFSET>(DataType::Int32).unwrap(); + build_empty_list_array::<$OFFSET>(DataType::Int64).unwrap(); + build_empty_list_array::<$OFFSET>(DataType::Float32).unwrap(); + build_empty_list_array::<$OFFSET>(DataType::Float64).unwrap(); + build_empty_list_array::<$OFFSET>(DataType::Boolean).unwrap(); + build_empty_list_array::<$OFFSET>(DataType::Utf8).unwrap(); + build_empty_list_array::<$OFFSET>(DataType::Binary).unwrap(); + }; + } + + #[test] + fn test_build_empty_list_array() { + make_test_build_empty_list_array!(i32); + make_test_build_empty_list_array!(i64); + } + + #[test] + fn test_build_empty_fixed_size_list_array() { + build_empty_fixed_size_list_array(DataType::Boolean).unwrap(); + build_empty_fixed_size_list_array(DataType::Int16).unwrap(); + build_empty_fixed_size_list_array(DataType::Int32).unwrap(); + build_empty_fixed_size_list_array(DataType::Int64).unwrap(); + build_empty_fixed_size_list_array(DataType::Float32).unwrap(); + build_empty_fixed_size_list_array(DataType::Float64).unwrap(); + build_empty_fixed_size_list_array(DataType::Boolean).unwrap(); + build_empty_fixed_size_list_array(DataType::Utf8).unwrap(); + build_empty_fixed_size_list_array(DataType::Binary).unwrap(); + } } diff --git a/rust/arrow/src/array/mod.rs b/rust/arrow/src/array/mod.rs index ebdc5c211bc..9caf7f8e257 100644 --- a/rust/arrow/src/array/mod.rs +++ b/rust/arrow/src/array/mod.rs @@ -119,6 +119,8 @@ pub use self::array_binary::FixedSizeBinaryArray; pub use self::array_binary::LargeBinaryArray; pub use self::array_boolean::BooleanArray; pub use self::array_dictionary::DictionaryArray; +pub use self::array_list::build_empty_fixed_size_list_array; +pub use self::array_list::build_empty_list_array; pub use self::array_list::FixedSizeListArray; pub use self::array_list::LargeListArray; pub use self::array_list::ListArray; diff --git a/rust/arrow/src/error.rs b/rust/arrow/src/error.rs index 6570ed0c511..2b9b2b577fe 100644 --- a/rust/arrow/src/error.rs +++ b/rust/arrow/src/error.rs @@ -24,6 +24,8 @@ use std::error::Error; /// Many different operations in the `arrow` crate return this error type. #[derive(Debug)] pub enum ArrowError { + /// Returned when functionality is not yet available. + NotYetImplemented(String), ExternalError(Box), MemoryError(String), ParseError(String), @@ -90,6 +92,9 @@ impl From for ArrowError { impl Display for ArrowError { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { + ArrowError::NotYetImplemented(source) => { + write!(f, "Not yet implemented: {}", &source) + } ArrowError::ExternalError(source) => write!(f, "External error: {}", &source), ArrowError::MemoryError(desc) => write!(f, "Memory error: {}", desc), ArrowError::ParseError(desc) => write!(f, "Parser error: {}", desc), diff --git a/rust/datafusion/src/physical_plan/common.rs b/rust/datafusion/src/physical_plan/common.rs index 40c9763c024..60ca857e99b 100644 --- a/rust/datafusion/src/physical_plan/common.rs +++ b/rust/datafusion/src/physical_plan/common.rs @@ -39,9 +39,9 @@ use arrow::{ }; use arrow::{ array::{ - Date64Array, Time64MicrosecondArray, Time64NanosecondArray, - TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, - TimestampSecondArray, + build_empty_fixed_size_list_array, build_empty_list_array, Date64Array, + Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, + TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, }, buffer::Buffer, datatypes::{DataType, SchemaRef, TimeUnit}, @@ -227,6 +227,15 @@ pub fn create_batch_empty(schema: &Schema) -> ArrowResult { as ArrayRef) } }, + DataType::List(nested_type) => Ok(build_empty_list_array::( + nested_type.data_type().clone(), + )?), + DataType::LargeList(nested_type) => Ok(build_empty_list_array::( + nested_type.data_type().clone(), + )?), + DataType::FixedSizeList(nested_type, _) => Ok( + build_empty_fixed_size_list_array(nested_type.data_type().clone())?, + ), _ => Err(DataFusionError::NotImplemented(format!( "Cannot convert datatype {:?} to array", f.data_type() diff --git a/rust/parquet/src/arrow/array_reader.rs b/rust/parquet/src/arrow/array_reader.rs index c6b5cdaa726..4bacb504424 100644 --- a/rust/parquet/src/arrow/array_reader.rs +++ b/rust/parquet/src/arrow/array_reader.rs @@ -24,10 +24,10 @@ use std::sync::Arc; use std::vec::Vec; use arrow::array::{ - Array, ArrayData, ArrayDataBuilder, ArrayDataRef, ArrayRef, BinaryArray, - BinaryBuilder, BooleanArray, BooleanBufferBuilder, DecimalBuilder, - FixedSizeBinaryArray, FixedSizeBinaryBuilder, GenericListArray, Int16BufferBuilder, - Int32Array, Int64Array, ListBuilder, OffsetSizeTrait, PrimitiveArray, + build_empty_list_array, Array, ArrayData, ArrayDataBuilder, ArrayDataRef, ArrayRef, + BinaryArray, BinaryBuilder, BooleanArray, BooleanBufferBuilder, BooleanBuilder, + DecimalBuilder, FixedSizeBinaryArray, FixedSizeBinaryBuilder, GenericListArray, + Int16BufferBuilder, Int32Array, Int64Array, OffsetSizeTrait, PrimitiveArray, PrimitiveBuilder, StringArray, StringBuilder, StructArray, }; use arrow::buffer::{Buffer, MutableBuffer}; @@ -624,105 +624,6 @@ impl ListArrayReader { } } -macro_rules! build_empty_list_array_with_primitive_items { - ($item_type:ident) => {{ - let values_builder = PrimitiveBuilder::<$item_type>::new(0); - let mut builder = ListBuilder::new(values_builder); - let empty_list_array = builder.finish(); - Ok(Arc::new(empty_list_array)) - }}; -} - -macro_rules! build_empty_list_array_with_non_primitive_items { - ($builder:ident) => {{ - let values_builder = $builder::new(0); - let mut builder = ListBuilder::new(values_builder); - let empty_list_array = builder.finish(); - Ok(Arc::new(empty_list_array)) - }}; -} - -fn build_empty_list_array(item_type: ArrowType) -> Result { - match item_type { - ArrowType::UInt8 => build_empty_list_array_with_primitive_items!(ArrowUInt8Type), - ArrowType::UInt16 => { - build_empty_list_array_with_primitive_items!(ArrowUInt16Type) - } - ArrowType::UInt32 => { - build_empty_list_array_with_primitive_items!(ArrowUInt32Type) - } - ArrowType::UInt64 => { - build_empty_list_array_with_primitive_items!(ArrowUInt64Type) - } - ArrowType::Int8 => build_empty_list_array_with_primitive_items!(ArrowInt8Type), - ArrowType::Int16 => build_empty_list_array_with_primitive_items!(ArrowInt16Type), - ArrowType::Int32 => build_empty_list_array_with_primitive_items!(ArrowInt32Type), - ArrowType::Int64 => build_empty_list_array_with_primitive_items!(ArrowInt64Type), - ArrowType::Float32 => { - build_empty_list_array_with_primitive_items!(ArrowFloat32Type) - } - ArrowType::Float64 => { - build_empty_list_array_with_primitive_items!(ArrowFloat64Type) - } - ArrowType::Boolean => { - //build_empty_list_array_with_primitive_items!(ArrowBooleanType) - todo!() - } - ArrowType::Date32(_) => { - build_empty_list_array_with_primitive_items!(ArrowDate32Type) - } - ArrowType::Date64(_) => { - build_empty_list_array_with_primitive_items!(ArrowDate64Type) - } - ArrowType::Time32(ArrowTimeUnit::Second) => { - build_empty_list_array_with_primitive_items!(ArrowTime32SecondType) - } - ArrowType::Time32(ArrowTimeUnit::Millisecond) => { - build_empty_list_array_with_primitive_items!(ArrowTime32MillisecondType) - } - ArrowType::Time64(ArrowTimeUnit::Microsecond) => { - build_empty_list_array_with_primitive_items!(ArrowTime64MicrosecondType) - } - ArrowType::Time64(ArrowTimeUnit::Nanosecond) => { - build_empty_list_array_with_primitive_items!(ArrowTime64NanosecondType) - } - ArrowType::Duration(ArrowTimeUnit::Second) => { - build_empty_list_array_with_primitive_items!(ArrowDurationSecondType) - } - ArrowType::Duration(ArrowTimeUnit::Millisecond) => { - build_empty_list_array_with_primitive_items!(ArrowDurationMillisecondType) - } - ArrowType::Duration(ArrowTimeUnit::Microsecond) => { - build_empty_list_array_with_primitive_items!(ArrowDurationMicrosecondType) - } - ArrowType::Duration(ArrowTimeUnit::Nanosecond) => { - build_empty_list_array_with_primitive_items!(ArrowDurationNanosecondType) - } - ArrowType::Timestamp(ArrowTimeUnit::Second, _) => { - build_empty_list_array_with_primitive_items!(ArrowTimestampSecondType) - } - ArrowType::Timestamp(ArrowTimeUnit::Millisecond, _) => { - build_empty_list_array_with_primitive_items!(ArrowTimestampMillisecondType) - } - ArrowType::Timestamp(ArrowTimeUnit::Microsecond, _) => { - build_empty_list_array_with_primitive_items!(ArrowTimestampMicrosecondType) - } - ArrowType::Timestamp(ArrowTimeUnit::Nanosecond, _) => { - build_empty_list_array_with_primitive_items!(ArrowTimestampNanosecondType) - } - ArrowType::Utf8 => { - build_empty_list_array_with_non_primitive_items!(StringBuilder) - } - ArrowType::Binary => { - build_empty_list_array_with_non_primitive_items!(BinaryBuilder) - } - _ => Err(ParquetError::General(format!( - "ListArray of type List({:?}) is not supported by array_reader", - item_type - ))), - } -} - macro_rules! remove_primitive_array_indices { ($arr: expr, $item_type:ty, $indices:expr) => {{ let array_data = match $arr.as_any().downcast_ref::>() { @@ -811,8 +712,12 @@ fn remove_indices( remove_primitive_array_indices!(arr, ArrowFloat64Type, indices) } ArrowType::Boolean => { - todo!() - //remove_primitive_array_indices!(arr, ArrowBooleanType, indices) + remove_array_indices_custom_builder!( + arr, + BooleanArray, + BooleanBuilder, + indices + ) } ArrowType::Date32(_) => { remove_primitive_array_indices!(arr, ArrowDate32Type, indices) @@ -893,7 +798,8 @@ impl ArrayReader for ListArrayReader { let item_type = self.item_reader.get_data_type().clone(); if next_batch_array.len() == 0 { - return build_empty_list_array(item_type); + return build_empty_list_array::(item_type) + .map_err(|err| ParquetError::General(err.to_string())); } let def_levels = self .item_reader