diff --git a/rust/arrow/examples/builders.rs b/rust/arrow/examples/builders.rs index e4f7eca1caa..61cce0ed97a 100644 --- a/rust/arrow/examples/builders.rs +++ b/rust/arrow/examples/builders.rs @@ -25,9 +25,7 @@ use arrow::array::{ StringArray, StructArray, }; use arrow::buffer::Buffer; -use arrow::datatypes::{ - DataType, Date64Type, Field, NullableDataType, Time64NanosecondType, ToByteSlice, -}; +use arrow::datatypes::{DataType, Date64Type, Field, Time64NanosecondType, ToByteSlice}; fn main() { // Primitive Arrays @@ -102,7 +100,7 @@ fn main() { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/rust/arrow/src/array/array_binary.rs b/rust/arrow/src/array/array_binary.rs index ef1cf8d8b2d..15d6ccd0045 100644 --- a/rust/arrow/src/array/array_binary.rs +++ b/rust/arrow/src/array/array_binary.rs @@ -596,7 +596,7 @@ impl Array for DecimalArray { #[cfg(test)] mod tests { - use crate::datatypes::NullableDataType; + use crate::datatypes::Field; use super::*; @@ -908,7 +908,7 @@ mod tests { .build(); let array_data = ArrayData::builder(DataType::FixedSizeList( - Box::new(NullableDataType::new(DataType::Binary, false)), + Box::new(Field::new("item", DataType::Binary, false)), 4, )) .len(3) diff --git a/rust/arrow/src/array/array_list.rs b/rust/arrow/src/array/array_list.rs index 00e7d29d0b8..4eb8dc56640 100644 --- a/rust/arrow/src/array/array_list.rs +++ b/rust/arrow/src/array/array_list.rs @@ -297,12 +297,15 @@ impl fmt::Debug for FixedSizeListArray { #[cfg(test)] mod tests { use crate::{ - array::ArrayData, array::Int32Array, buffer::Buffer, datatypes::ToByteSlice, - memory, util::bit_util, + array::ArrayData, + array::Int32Array, + buffer::Buffer, + datatypes::{Field, ToByteSlice}, + memory, + util::bit_util, }; use super::*; - use crate::datatypes::NullableDataType; #[test] fn test_list_array() { @@ -318,7 +321,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -388,7 +391,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::LargeList(Box::new(NullableDataType::new(DataType::Int32, false))); + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -454,7 +457,7 @@ mod tests { // Construct a list array from the above two let list_data_type = DataType::FixedSizeList( - Box::new(NullableDataType::new(DataType::Int32, false)), + Box::new(Field::new("item", DataType::Int32, false)), 3, ); let list_data = ArrayData::builder(list_data_type.clone()) @@ -523,7 +526,7 @@ mod tests { // Construct a list array from the above two let list_data_type = DataType::FixedSizeList( - Box::new(NullableDataType::new(DataType::Int32, false)), + Box::new(Field::new("item", DataType::Int32, false)), 3, ); let list_data = ArrayData::builder(list_data_type) @@ -557,7 +560,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) @@ -622,7 +625,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::LargeList(Box::new(NullableDataType::new(DataType::Int32, false))); + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) @@ -685,7 +688,7 @@ mod tests { // Construct a fixed size list array from the above two let list_data_type = DataType::FixedSizeList( - Box::new(NullableDataType::new(DataType::Int32, false)), + Box::new(Field::new("item", DataType::Int32, false)), 2, ); let list_data = ArrayData::builder(list_data_type) @@ -736,7 +739,7 @@ mod tests { .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice())) .build(); let list_data_type = - DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_child_data(value_data) @@ -751,7 +754,7 @@ mod tests { fn test_list_array_invalid_child_array_len() { let value_offsets = Buffer::from(&[0, 2, 5, 7].to_byte_slice()); let list_data_type = - DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -770,7 +773,7 @@ mod tests { let value_offsets = Buffer::from(&[2, 2, 5, 7].to_byte_slice()); let list_data_type = - DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -802,7 +805,7 @@ mod tests { .build(); let list_data_type = - DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .add_buffer(buf2) .add_child_data(value_data) diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index 11d375a932b..08fcd6468b6 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -764,7 +764,8 @@ where /// /// This is used for validating array data types in `append_data` fn data_type(&self) -> DataType { - DataType::List(Box::new(NullableDataType::new( + DataType::List(Box::new(Field::new( + "item", self.values_builder.data_type(), true, ))) @@ -833,7 +834,8 @@ where let null_bit_buffer = self.bitmap_builder.finish(); let nulls = null_bit_buffer.count_set_bits(); self.offsets_builder.append(0).unwrap(); - let data = ArrayData::builder(DataType::List(Box::new(NullableDataType::new( + let data = ArrayData::builder(DataType::List(Box::new(Field::new( + "item", values_data.data_type().clone(), true, // TODO: find a consistent way of getting this )))) @@ -974,7 +976,8 @@ where /// /// This is used for validating array data types in `append_data` fn data_type(&self) -> DataType { - DataType::LargeList(Box::new(NullableDataType::new( + DataType::LargeList(Box::new(Field::new( + "item", self.values_builder.data_type(), true, ))) @@ -1043,9 +1046,11 @@ where let null_bit_buffer = self.bitmap_builder.finish(); let nulls = null_bit_buffer.count_set_bits(); self.offsets_builder.append(0).unwrap(); - let data = ArrayData::builder(DataType::LargeList(Box::new( - NullableDataType::new(values_data.data_type().clone(), true), - ))) + let data = ArrayData::builder(DataType::LargeList(Box::new(Field::new( + "item", + values_data.data_type().clone(), + true, + )))) .len(len) .null_count(len - nulls) .add_buffer(offset_buffer) @@ -1153,7 +1158,7 @@ where /// This is used for validating array data types in `append_data` fn data_type(&self) -> DataType { DataType::FixedSizeList( - Box::new(NullableDataType::new(self.values_builder.data_type(), true)), + Box::new(Field::new("item", self.values_builder.data_type(), true)), self.list_len, ) } @@ -1232,7 +1237,7 @@ where let null_bit_buffer = self.bitmap_builder.finish(); let nulls = null_bit_buffer.count_set_bits(); let data = ArrayData::builder(DataType::FixedSizeList( - Box::new(NullableDataType::new(values_data.data_type().clone(), true)), + Box::new(Field::new("item", values_data.data_type().clone(), true)), self.list_len, )) .len(len) @@ -1453,10 +1458,7 @@ fn append_binary_data( )) as ArrayDataRef; Arc::new(ArrayData::new( - DataType::List(Box::new(NullableDataType::new( - DataType::UInt8, - true, - ))), + DataType::List(Box::new(Field::new("item", DataType::UInt8, true))), array.len(), None, array.null_buffer().cloned(), @@ -1508,7 +1510,8 @@ fn append_large_binary_data( )) as ArrayDataRef; Arc::new(ArrayData::new( - DataType::LargeList(Box::new(NullableDataType::new( + DataType::LargeList(Box::new(Field::new( + "item", DataType::UInt8, true, ))), @@ -1610,7 +1613,7 @@ impl ArrayBuilder for FixedSizeBinaryBuilder { )) as ArrayDataRef; let list_data = Arc::new(ArrayData::new( DataType::FixedSizeList( - Box::new(NullableDataType::new(DataType::UInt8, true)), + Box::new(Field::new("item", DataType::UInt8, true)), self.builder.list_len, ), array.len(), @@ -1696,7 +1699,7 @@ impl ArrayBuilder for DecimalBuilder { )) as ArrayDataRef; let list_data = Arc::new(ArrayData::new( DataType::FixedSizeList( - Box::new(NullableDataType::new(DataType::UInt8, true)), + Box::new(Field::new("item", DataType::UInt8, true)), self.builder.list_len, ), array.len(), @@ -3817,13 +3820,13 @@ mod tests { #[test] #[should_panic( - expected = "Data type List(NullableDataType { data_type: Int64, nullable: true }) is not currently supported" + expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false }) is not currently supported" )] fn test_struct_array_builder_from_schema_unsupported_type() { let mut fields = Vec::new(); fields.push(Field::new("f1", DataType::Int16, false)); let list_type = - DataType::List(Box::new(NullableDataType::new(DataType::Int64, true))); + DataType::List(Box::new(Field::new("item", DataType::Int64, true))); fields.push(Field::new("f2", list_type, false)); let _ = StructBuilder::from_fields(fields, 5); @@ -4122,7 +4125,7 @@ mod tests { let list_value_offsets = Buffer::from(&[0, 3, 5, 11, 13, 13, 15, 15, 17].to_byte_slice()); let expected_list_data = ArrayData::new( - DataType::List(Box::new(NullableDataType::new(DataType::Int64, true))), + DataType::List(Box::new(Field::new("item", DataType::Int64, true))), 8, None, None, @@ -4208,7 +4211,7 @@ mod tests { &[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23].to_byte_slice(), ); let expected_list_data = ArrayData::new( - DataType::List(Box::new(NullableDataType::new(DataType::Int64, true))), + DataType::List(Box::new(Field::new("item", DataType::Int64, true))), 12, None, None, @@ -4250,7 +4253,7 @@ mod tests { ]); let list_value_offsets = Buffer::from(&[0, 2, 3, 6].to_byte_slice()); let list_data = ArrayData::new( - DataType::List(Box::new(NullableDataType::new(DataType::Utf8, true))), + DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), 3, None, None, @@ -4285,7 +4288,7 @@ mod tests { ]); let list_value_offsets = Buffer::from(&[0, 2, 2, 4, 5, 8, 9, 12].to_byte_slice()); let expected_list_data = ArrayData::new( - DataType::List(Box::new(NullableDataType::new(DataType::Utf8, true))), + DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), 7, None, None, // is this correct? @@ -4374,7 +4377,7 @@ mod tests { ]); let expected_list_data = ArrayData::new( DataType::FixedSizeList( - Box::new(NullableDataType::new(DataType::UInt16, true)), + Box::new(Field::new("item", DataType::UInt16, true)), 2, ), 12, @@ -4447,7 +4450,7 @@ mod tests { ]); let expected_list_data = ArrayData::new( DataType::FixedSizeList( - Box::new(NullableDataType::new(DataType::UInt8, true)), + Box::new(Field::new("item", DataType::UInt8, true)), 2, ), 12, diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs index 96254c4e16d..ef79302927f 100644 --- a/rust/arrow/src/compute/kernels/cast.rs +++ b/rust/arrow/src/compute/kernels/cast.rs @@ -1237,7 +1237,7 @@ mod tests { let array = Arc::new(a) as ArrayRef; let b = cast( &array, - &DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), + &DataType::List(Box::new(Field::new("item", DataType::Int32, true))), ) .unwrap(); assert_eq!(5, b.len()); @@ -1267,7 +1267,7 @@ mod tests { let array = Arc::new(a) as ArrayRef; let b = cast( &array, - &DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), + &DataType::List(Box::new(Field::new("item", DataType::Int32, true))), ) .unwrap(); assert_eq!(5, b.len()); @@ -1300,7 +1300,7 @@ mod tests { let array = array.slice(2, 4); let b = cast( &array, - &DataType::List(Box::new(NullableDataType::new(DataType::Float64, true))), + &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), ) .unwrap(); assert_eq!(4, b.len()); @@ -1377,7 +1377,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))); + DataType::List(Box::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -1387,7 +1387,7 @@ mod tests { let cast_array = cast( &list_array, - &DataType::List(Box::new(NullableDataType::new(DataType::UInt16, true))), + &DataType::List(Box::new(Field::new("item", DataType::UInt16, true))), ) .unwrap(); // 3 negative values should get lost when casting to unsigned, @@ -1436,7 +1436,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))); + DataType::List(Box::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -1446,7 +1446,8 @@ mod tests { cast( &list_array, - &DataType::List(Box::new(NullableDataType::new( + &DataType::List(Box::new(Field::new( + "item", DataType::Timestamp(TimeUnit::Microsecond, None), true, ))), @@ -2853,7 +2854,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))); + DataType::List(Box::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -2875,7 +2876,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::LargeList(Box::new(NullableDataType::new(DataType::Int32, true))); + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -2895,7 +2896,7 @@ mod tests { // Construct a fixed size list array from the above two let list_data_type = DataType::FixedSizeList( - Box::new(NullableDataType::new(DataType::Int32, true)), + Box::new(Field::new("item", DataType::Int32, true)), 2, ); let list_data = ArrayData::builder(list_data_type) @@ -2988,12 +2989,12 @@ mod tests { LargeBinary, Utf8, LargeUtf8, - List(Box::new(NullableDataType::new(DataType::Int8, true))), - List(Box::new(NullableDataType::new(DataType::Utf8, true))), - FixedSizeList(Box::new(NullableDataType::new(DataType::Int8, true)), 10), - FixedSizeList(Box::new(NullableDataType::new(DataType::Utf8, false)), 10), - LargeList(Box::new(NullableDataType::new(DataType::Int8, true))), - LargeList(Box::new(NullableDataType::new(DataType::Utf8, false))), + List(Box::new(Field::new("item", DataType::Int8, true))), + List(Box::new(Field::new("item", DataType::Utf8, true))), + FixedSizeList(Box::new(Field::new("item", DataType::Int8, true)), 10), + FixedSizeList(Box::new(Field::new("item", DataType::Utf8, false)), 10), + LargeList(Box::new(Field::new("item", DataType::Int8, true))), + LargeList(Box::new(Field::new("item", DataType::Utf8, false))), Struct(vec![ Field::new("f1", DataType::Int32, false), Field::new("f2", DataType::Utf8, true), diff --git a/rust/arrow/src/compute/kernels/comparison.rs b/rust/arrow/src/compute/kernels/comparison.rs index 02318101bee..fd0bc7343f0 100644 --- a/rust/arrow/src/compute/kernels/comparison.rs +++ b/rust/arrow/src/compute/kernels/comparison.rs @@ -776,8 +776,8 @@ fn new_all_set_buffer(len: usize) -> Buffer { #[cfg(test)] mod tests { use super::*; - use crate::array::Int32Array; - use crate::datatypes::{Int8Type, NullableDataType, ToByteSlice}; + use crate::datatypes::{Int8Type, ToByteSlice}; + use crate::{array::Int32Array, datatypes::Field}; #[test] fn test_primitive_array_eq() { @@ -1046,7 +1046,7 @@ mod tests { .data(); let value_offsets = Buffer::from(&[0i64, 3, 6, 6, 9].to_byte_slice()); let list_data_type = - DataType::LargeList(Box::new(NullableDataType::new(DataType::Int32, true))); + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(4) .add_buffer(value_offsets) diff --git a/rust/arrow/src/compute/kernels/filter.rs b/rust/arrow/src/compute/kernels/filter.rs index ba8e1b758d8..e90e4939b4e 100644 --- a/rust/arrow/src/compute/kernels/filter.rs +++ b/rust/arrow/src/compute/kernels/filter.rs @@ -1080,7 +1080,7 @@ mod tests { let value_offsets = Buffer::from(&[0i64, 3, 6, 8, 8].to_byte_slice()); let list_data_type = - DataType::LargeList(Box::new(NullableDataType::new(DataType::Int32, false))); + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(4) .add_buffer(value_offsets) diff --git a/rust/arrow/src/compute/kernels/limit.rs b/rust/arrow/src/compute/kernels/limit.rs index 5e182e6bc49..911dbf2889d 100644 --- a/rust/arrow/src/compute/kernels/limit.rs +++ b/rust/arrow/src/compute/kernels/limit.rs @@ -35,7 +35,7 @@ mod tests { use super::*; use crate::array::*; use crate::buffer::Buffer; - use crate::datatypes::{DataType, Field, NullableDataType, ToByteSlice}; + use crate::datatypes::{DataType, Field, ToByteSlice}; use crate::util::bit_util; use std::sync::Arc; @@ -110,7 +110,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs index 9b48eb2cc9c..9b5963a4e91 100644 --- a/rust/arrow/src/compute/kernels/take.rs +++ b/rust/arrow/src/compute/kernels/take.rs @@ -810,9 +810,11 @@ mod tests { let value_offsets: [$offset_type; 4] = [0, 3, 6, 8]; let value_offsets = Buffer::from(&value_offsets.to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new( - NullableDataType::new(DataType::Int32, false), - )); + let list_data_type = DataType::$list_data_type(Box::new(Field::new( + "item", + DataType::Int32, + false, + ))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets) @@ -881,9 +883,11 @@ mod tests { let value_offsets: [$offset_type; 5] = [0, 3, 6, 7, 9]; let value_offsets = Buffer::from(&value_offsets.to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new( - NullableDataType::new(DataType::Int32, false), - )); + let list_data_type = DataType::$list_data_type(Box::new(Field::new( + "item", + DataType::Int32, + false, + ))); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets) @@ -952,9 +956,11 @@ mod tests { let value_offsets: [$offset_type; 5] = [0, 3, 6, 6, 8]; let value_offsets = Buffer::from(&value_offsets.to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new( - NullableDataType::new(DataType::Int32, false), - )); + let list_data_type = DataType::$list_data_type(Box::new(Field::new( + "item", + DataType::Int32, + false, + ))); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets) @@ -1045,7 +1051,7 @@ mod tests { let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice()); // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs index 0fd0e64355a..ba7de77f6b0 100644 --- a/rust/arrow/src/compute/util.rs +++ b/rust/arrow/src/compute/util.rs @@ -321,7 +321,7 @@ mod tests { #[test] fn test_take_value_index_from_list() { let list = build_list( - DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), + DataType::List(Box::new(Field::new("item", DataType::Int32, true))), Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), vec![0i32, 2i32, 5i32, 10i32], ); @@ -337,7 +337,7 @@ mod tests { #[test] fn test_take_value_index_from_large_list() { let list = build_list( - DataType::LargeList(Box::new(NullableDataType::new(DataType::Int32, false))), + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))), Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), vec![0i64, 2i64, 5i64, 10i64], ); diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index dc7ae77f344..0a26d2e5fd2 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -125,11 +125,11 @@ pub enum DataType { /// A variable-length string in Unicode with UFT-8 encoding and 64-bit offsets. LargeUtf8, /// A list of some logical data type with variable length. - List(Box), + List(Box), /// A list of some logical data type with fixed length. - FixedSizeList(Box, i32), + FixedSizeList(Box, i32), /// A list of some logical data type with variable length and 64-bit offsets. - LargeList(Box), + LargeList(Box), /// A nested datatype that contains a number of sub-fields. Struct(Vec), /// A nested datatype that can represent slots of differing types. @@ -149,13 +149,6 @@ pub enum DataType { Decimal(usize, usize), } -/// Extends data type with nullability -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub struct NullableDataType { - data_type: DataType, - nullable: bool, -} - /// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX /// epoch (1970-01-01) in days or milliseconds. #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] @@ -196,7 +189,8 @@ pub enum IntervalUnit { #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct Field { name: String, - data_type: NullableDataType, + data_type: DataType, + nullable: bool, dict_id: i64, dict_is_ordered: bool, } @@ -882,7 +876,7 @@ impl ToByteSlice for T { impl DataType { /// Parse a data type from a JSON representation pub(crate) fn from(json: &Value) -> Result { - let default_dt_ctx = NullableDataType::new(DataType::Boolean, true); + let default_field = Field::new("", DataType::Boolean, true); match *json { Value::Object(ref map) => match map.get("name") { Some(s) if s == "null" => Ok(DataType::Null), @@ -1016,17 +1010,17 @@ impl DataType { }, Some(s) if s == "list" => { // return a list with any type as its child isn't defined in the map - Ok(DataType::List(Box::new(default_dt_ctx))) + Ok(DataType::List(Box::new(default_field))) } Some(s) if s == "largelist" => { // return a largelist with any type as its child isn't defined in the map - Ok(DataType::LargeList(Box::new(default_dt_ctx))) + Ok(DataType::LargeList(Box::new(default_field))) } Some(s) if s == "fixedsizelist" => { // return a list with any type as its child isn't defined in the map if let Some(Value::Number(size)) = map.get("listSize") { Ok(DataType::FixedSizeList( - Box::new(default_dt_ctx), + Box::new(default_field), size.as_i64().unwrap() as i32, )) } else { @@ -1155,34 +1149,13 @@ impl DataType { } } -impl NullableDataType { - /// Creates a new data type context - pub fn new(data_type: DataType, nullable: bool) -> Self { - NullableDataType { - data_type, - nullable, - } - } - - /// Returns an immutable reference to the data type - #[inline] - pub const fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Indicates whether in this data type context null values are eligible - #[inline] - pub const fn is_nullable(&self) -> bool { - self.nullable - } -} - impl Field { /// Creates a new field pub fn new(name: &str, data_type: DataType, nullable: bool) -> Self { Field { name: name.to_string(), - data_type: NullableDataType::new(data_type, nullable), + data_type, + nullable, dict_id: 0, dict_is_ordered: false, } @@ -1198,7 +1171,8 @@ impl Field { ) -> Self { Field { name: name.to_string(), - data_type: NullableDataType::new(data_type, nullable), + data_type, + nullable, dict_id, dict_is_ordered, } @@ -1213,13 +1187,13 @@ impl Field { /// Returns an immutable reference to the `Field`'s data-type #[inline] pub const fn data_type(&self) -> &DataType { - self.data_type.data_type() + &self.data_type } /// Indicates whether this `Field` supports null values #[inline] pub const fn is_nullable(&self) -> bool { - self.data_type.nullable + self.nullable } /// Returns the dictionary ID @@ -1273,21 +1247,16 @@ impl Field { "Field 'children' must have one element for a list data type".to_string(), )); } - let nested_field = Self::from(&values[0])?; - let nexted_dt_ctx = NullableDataType::new( - nested_field.data_type.data_type, - nested_field.data_type.nullable, - ); match data_type { DataType::List(_) => DataType::List(Box::new( - nexted_dt_ctx, + Self::from(&values[0])?, )), DataType::LargeList(_) => DataType::LargeList(Box::new( - nexted_dt_ctx, + Self::from(&values[0])?, )), DataType::FixedSizeList(_, int) => { DataType::FixedSizeList( - Box::new(nexted_dt_ctx), + Box::new(Self::from(&values[0])?), int, ) } @@ -1363,7 +1332,8 @@ impl Field { }; Ok(Field { name, - data_type: NullableDataType::new(data_type, nullable), + nullable, + data_type, dict_id, dict_is_ordered, }) @@ -1378,36 +1348,15 @@ impl Field { pub fn to_json(&self) -> Value { let children: Vec = match self.data_type() { DataType::Struct(fields) => fields.iter().map(|f| f.to_json()).collect(), - DataType::List(type_ctx) => { - let item = Field::new( - "item", - type_ctx.data_type().clone(), - type_ctx.is_nullable(), - ); - vec![item.to_json()] - } - DataType::LargeList(type_ctx) => { - let item = Field::new( - "item", - type_ctx.data_type().clone(), - type_ctx.is_nullable(), - ); - vec![item.to_json()] - } - DataType::FixedSizeList(type_ctx, _) => { - let item = Field::new( - "item", - type_ctx.data_type().clone(), - type_ctx.is_nullable(), - ); - vec![item.to_json()] - } + DataType::List(field) => vec![field.to_json()], + DataType::LargeList(field) => vec![field.to_json()], + DataType::FixedSizeList(field, _) => vec![field.to_json()], _ => vec![], }; match self.data_type() { DataType::Dictionary(ref index_type, ref value_type) => json!({ "name": self.name, - "nullable": self.data_type.nullable, + "nullable": self.nullable, "type": value_type.to_json(), "children": children, "dictionary": { @@ -1418,8 +1367,8 @@ impl Field { }), _ => json!({ "name": self.name, - "nullable": self.data_type.is_nullable(), - "type": self.data_type.data_type().to_json(), + "nullable": self.nullable, + "type": self.data_type.to_json(), "children": children }), } @@ -1448,8 +1397,8 @@ impl Field { .to_string(), )); } - match &mut self.data_type.data_type { - DataType::Struct(nested_fields) => match &from.data_type.data_type { + match &mut self.data_type { + DataType::Struct(nested_fields) => match &from.data_type { DataType::Struct(from_nested_fields) => { for from_field in from_nested_fields { let mut is_new_field = true; @@ -1472,7 +1421,7 @@ impl Field { )); } }, - DataType::Union(nested_fields) => match &from.data_type.data_type { + DataType::Union(nested_fields) => match &from.data_type { DataType::Union(from_nested_fields) => { for from_field in from_nested_fields { let mut is_new_field = true; @@ -1524,7 +1473,7 @@ impl Field { | DataType::Utf8 | DataType::LargeUtf8 | DataType::Decimal(_, _) => { - if self.data_type.data_type != from.data_type.data_type { + if self.data_type != from.data_type { return Err(ArrowError::SchemaError( "Fail to merge schema Field due to conflicting datatype" .to_string(), @@ -1532,8 +1481,8 @@ impl Field { } } } - if from.data_type.nullable { - self.data_type.nullable = from.data_type.nullable; + if from.nullable { + self.nullable = from.nullable; } Ok(()) @@ -1542,7 +1491,7 @@ impl Field { impl fmt::Display for Field { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}: {:?}", self.name, self.data_type.data_type) + write!(f, "{}: {:?}", self.name, self.data_type) } } @@ -1862,12 +1811,12 @@ mod tests { assert_eq!( "{\"Struct\":[\ - {\"name\":\"first_name\",\"data_type\":{\"data_type\":\"Utf8\",\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false},\ - {\"name\":\"last_name\",\"data_type\":{\"data_type\":\"Utf8\",\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false},\ - {\"name\":\"address\",\"data_type\":{\"data_type\":{\"Struct\":\ - [{\"name\":\"street\",\"data_type\":{\"data_type\":\"Utf8\",\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false},\ - {\"name\":\"zip\",\"data_type\":{\"data_type\":\"UInt16\",\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false}\ - ]},\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false}]}", + {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ + {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ + {\"name\":\"address\",\"data_type\":{\"Struct\":\ + [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ + {\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false}\ + ]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false}]}", serialized ); @@ -2048,24 +1997,23 @@ mod tests { Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), Field::new( "c21", - DataType::List(Box::new(NullableDataType::new( - DataType::Boolean, - true, - ))), + DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), false, ), Field::new( "c22", DataType::FixedSizeList( - Box::new(NullableDataType::new(DataType::Boolean, false)), + Box::new(Field::new("bools", DataType::Boolean, false)), 5, ), false, ), Field::new( "c23", - DataType::List(Box::new(NullableDataType::new( - DataType::List(Box::new(NullableDataType::new( + DataType::List(Box::new(Field::new( + "inner_list", + DataType::List(Box::new(Field::new( + "struct", DataType::Struct(vec![]), true, ))), @@ -2101,8 +2049,10 @@ mod tests { Field::new("c33", DataType::LargeUtf8, true), Field::new( "c34", - DataType::LargeList(Box::new(NullableDataType::new( - DataType::LargeList(Box::new(NullableDataType::new( + DataType::LargeList(Box::new(Field::new( + "inner_large_list", + DataType::LargeList(Box::new(Field::new( + "struct", DataType::Struct(vec![]), false, ))), @@ -2330,7 +2280,7 @@ mod tests { }, "children": [ { - "name": "item", + "name": "bools", "nullable": false, "type": { "name": "bool" @@ -2347,14 +2297,14 @@ mod tests { }, "children": [ { - "name": "item", + "name": "inner_list", "nullable": false, "type": { "name": "list" }, "children": [ { - "name": "item", + "name": "struct", "nullable": true, "type": { "name": "struct" @@ -2487,14 +2437,14 @@ mod tests { }, "children": [ { - "name": "item", + "name": "inner_large_list", "nullable": true, "type": { "name": "largelist" }, "children": [ { - "name": "item", + "name": "struct", "nullable": false, "type": { "name": "struct" @@ -2561,8 +2511,8 @@ mod tests { assert_eq!(schema.to_string(), "first_name: Utf8, \ last_name: Utf8, \ address: Struct([\ - Field { name: \"street\", data_type: NullableDataType { data_type: Utf8, nullable: false }, dict_id: 0, dict_is_ordered: false }, \ - Field { name: \"zip\", data_type: NullableDataType { data_type: UInt16, nullable: false }, dict_id: 0, dict_is_ordered: false }])") + Field { name: \"street\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false }, \ + Field { name: \"zip\", data_type: UInt16, nullable: false, dict_id: 0, dict_is_ordered: false }])") } #[test] @@ -2796,34 +2746,6 @@ mod tests { Ok(()) } - - #[test] - fn test_compare_nested_types() { - let list_type_a = &DataType::List(Box::new(NullableDataType::new( - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - true, - ))); - let list_type_b = &DataType::List(Box::new(NullableDataType::new( - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - true, - ))); - - assert_eq!(list_type_a, list_type_b); - } - - #[test] - fn test_compare_mismatching_types() { - let list_type_a = &DataType::LargeList(Box::new(NullableDataType::new( - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - true, - ))); - let list_type_b = &DataType::LargeList(Box::new(NullableDataType::new( - DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), - false, - ))); - - assert_ne!(list_type_a, list_type_b); - } } #[cfg(all( diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index 8acfb0583ae..5c5544297a1 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -17,9 +17,7 @@ //! Utilities for converting between IPC types and native Arrow types -use crate::datatypes::{ - DataType, DateUnit, Field, IntervalUnit, NullableDataType, Schema, TimeUnit, -}; +use crate::datatypes::{DataType, DateUnit, Field, IntervalUnit, Schema, TimeUnit}; use crate::ipc; use flatbuffers::{ @@ -127,12 +125,6 @@ pub fn schema_from_bytes(bytes: &[u8]) -> Option { /// Get the Arrow data type from the flatbuffer Field table pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataType { - get_data_type_context(field, may_be_dictionary) - .data_type() - .clone() -} - -fn get_data_type_context(field: ipc::Field, may_be_dictionary: bool) -> NullableDataType { if let Some(dictionary) = field.dictionary() { if may_be_dictionary { let int = dictionary.indexType().unwrap(); @@ -147,16 +139,14 @@ fn get_data_type_context(field: ipc::Field, may_be_dictionary: bool) -> Nullable (64, false) => DataType::UInt64, _ => panic!("Unexpected bitwidth and signed"), }; - let value_type = get_data_type_context(field, false).data_type().clone(); - return NullableDataType::new( - DataType::Dictionary(Box::new(index_type), Box::new(value_type)), - // taking nullability from parent field - field.nullable(), + return DataType::Dictionary( + Box::new(index_type), + Box::new(get_data_type(field, false)), ); } } - let data_type = match field.type_type() { + match field.type_type() { ipc::Type::Null => DataType::Null, ipc::Type::Bool => DataType::Boolean, ipc::Type::Int => { @@ -253,16 +243,14 @@ fn get_data_type_context(field: ipc::Field, may_be_dictionary: bool) -> Nullable if children.len() != 1 { panic!("expect a list to have one child") } - let child_field = children.get(0); - DataType::List(Box::new(get_data_type_context(child_field, false))) + DataType::List(Box::new(children.get(0).into())) } ipc::Type::LargeList => { let children = field.children().unwrap(); if children.len() != 1 { panic!("expect a large list to have one child") } - let child_field = children.get(0); - DataType::LargeList(Box::new(get_data_type_context(child_field, false))) + DataType::LargeList(Box::new(children.get(0).into())) } ipc::Type::FixedSizeList => { let children = field.children().unwrap(); @@ -270,11 +258,7 @@ fn get_data_type_context(field: ipc::Field, may_be_dictionary: bool) -> Nullable panic!("expect a list to have one child") } let fsl = field.type_as_fixed_size_list().unwrap(); - let child_field = children.get(0); - DataType::FixedSizeList( - Box::new(get_data_type_context(child_field, false)), - fsl.listSize(), - ) + DataType::FixedSizeList(Box::new(children.get(0).into()), fsl.listSize()) } ipc::Type::Struct_ => { let mut fields = vec![]; @@ -287,9 +271,7 @@ fn get_data_type_context(field: ipc::Field, may_be_dictionary: bool) -> Nullable DataType::Struct(fields) } t => unimplemented!("Type {:?} not supported", t), - }; - - NullableDataType::new(data_type, field.nullable()) + } } pub(crate) struct FBFieldType<'b> { @@ -522,63 +504,24 @@ pub(crate) fn get_fb_field_type<'a>( children: Some(fbb.create_vector(&empty_fields[..])), } } - List(ref type_ctx) => { - let nested_type = - get_fb_field_type(type_ctx.data_type(), type_ctx.is_nullable(), fbb); - let child = ipc::Field::create( - fbb, - &ipc::FieldArgs { - name: None, - nullable: type_ctx.is_nullable(), - type_type: nested_type.type_type, - type_: Some(nested_type.type_), - children: nested_type.children, - dictionary: None, - custom_metadata: None, - }, - ); + List(ref list_type) => { + let child = build_field(fbb, list_type); FBFieldType { type_type: ipc::Type::List, type_: ipc::ListBuilder::new(fbb).finish().as_union_value(), children: Some(fbb.create_vector(&[child])), } } - LargeList(ref type_ctx) => { - let inner_types = - get_fb_field_type(type_ctx.data_type(), type_ctx.is_nullable(), fbb); - let child = ipc::Field::create( - fbb, - &ipc::FieldArgs { - name: None, - nullable: type_ctx.is_nullable(), - type_type: inner_types.type_type, - type_: Some(inner_types.type_), - dictionary: None, - children: inner_types.children, - custom_metadata: None, - }, - ); + LargeList(ref list_type) => { + let child = build_field(fbb, list_type); FBFieldType { type_type: ipc::Type::LargeList, type_: ipc::LargeListBuilder::new(fbb).finish().as_union_value(), children: Some(fbb.create_vector(&[child])), } } - FixedSizeList(ref type_ctx, len) => { - let inner_types = - get_fb_field_type(type_ctx.data_type(), type_ctx.is_nullable(), fbb); - let child = ipc::Field::create( - fbb, - &ipc::FieldArgs { - name: None, - nullable: type_ctx.is_nullable(), - type_type: inner_types.type_type, - type_: Some(inner_types.type_), - dictionary: None, - children: inner_types.children, - custom_metadata: None, - }, - ); + FixedSizeList(ref list_type, len) => { + let child = build_field(fbb, list_type); let mut builder = ipc::FixedSizeListBuilder::new(fbb); builder.add_listSize(*len as i32); FBFieldType { @@ -661,7 +604,7 @@ pub(crate) fn get_fb_dictionary<'a>( #[cfg(test)] mod tests { use super::*; - use crate::datatypes::{DataType, Field, NullableDataType, Schema}; + use crate::datatypes::{DataType, Field, Schema}; #[test] fn convert_schema_round_trip() { @@ -727,15 +670,13 @@ mod tests { Field::new("binary", DataType::Binary, false), Field::new( "list[u8]", - DataType::List(Box::new(NullableDataType::new( - DataType::UInt8, - false, - ))), + DataType::List(Box::new(Field::new("item", DataType::UInt8, false))), true, ), Field::new( "list[struct]", - DataType::List(Box::new(NullableDataType::new( + DataType::List(Box::new(Field::new( + "struct", DataType::Struct(vec![ Field::new("float32", DataType::UInt8, false), Field::new("int32", DataType::Int32, true), @@ -751,7 +692,8 @@ mod tests { Field::new("int64", DataType::Int64, true), Field::new( "list[struct]>]", - DataType::List(Box::new(NullableDataType::new( + DataType::List(Box::new(Field::new( + "struct", DataType::Struct(vec![ Field::new( "date32", @@ -760,7 +702,8 @@ mod tests { ), Field::new( "list[struct<>]", - DataType::List(Box::new(NullableDataType::new( + DataType::List(Box::new(Field::new( + "struct", DataType::Struct(vec![]), false, ))), diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs index d5a929f066a..76ad6b77cf3 100644 --- a/rust/arrow/src/ipc/reader.rs +++ b/rust/arrow/src/ipc/reader.rs @@ -89,7 +89,7 @@ fn create_array( buffer_index += 2; array } - List(ref type_ctx) | LargeList(ref type_ctx) => { + List(ref list_field) | LargeList(ref list_field) => { let list_node = &nodes[node_index]; let list_buffers: Vec = buffers[buffer_index..buffer_index + 2] .iter() @@ -99,7 +99,7 @@ fn create_array( buffer_index += 2; let triple = create_array( nodes, - type_ctx.data_type(), + list_field.data_type(), data, buffers, dictionaries, diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs index 08c400708b4..5543edaecf3 100644 --- a/rust/arrow/src/json/reader.rs +++ b/rust/arrow/src/json/reader.rs @@ -66,16 +66,20 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { 1 => Ok(dt[0].clone()), 2 => { // there can be a case where a list and scalar both exist - if dt.contains(&&DataType::List(Box::new(NullableDataType::new( + if dt.contains(&&DataType::List(Box::new(Field::new( + "item", DataType::Float64, true, - )))) || dt.contains(&&DataType::List(Box::new(NullableDataType::new( + )))) || dt.contains(&&DataType::List(Box::new(Field::new( + "item", DataType::Int64, true, - )))) || dt.contains(&&DataType::List(Box::new(NullableDataType::new( + )))) || dt.contains(&&DataType::List(Box::new(Field::new( + "item", DataType::Boolean, true, - )))) || dt.contains(&&DataType::List(Box::new(NullableDataType::new( + )))) || dt.contains(&&DataType::List(Box::new(Field::new( + "item", DataType::Utf8, true, )))) { @@ -86,12 +90,14 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { match (dt[0], dt[1]) { (t1, DataType::List(e)) if e.data_type() == &DataType::Float64 => { if t1 == &DataType::Float64 { - Ok(DataType::List(Box::new(NullableDataType::new( + Ok(DataType::List(Box::new(Field::new( + "item", DataType::Float64, true, )))) } else { - Ok(DataType::List(Box::new(NullableDataType::new( + Ok(DataType::List(Box::new(Field::new( + "item", coerce_data_type(vec![t1, &DataType::Float64])?, true, )))) @@ -99,12 +105,14 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { } (t1, DataType::List(e)) if e.data_type() == &DataType::Int64 => { if t1 == &DataType::Int64 { - Ok(DataType::List(Box::new(NullableDataType::new( + Ok(DataType::List(Box::new(Field::new( + "item", DataType::Int64, true, )))) } else { - Ok(DataType::List(Box::new(NullableDataType::new( + Ok(DataType::List(Box::new(Field::new( + "item", coerce_data_type(vec![t1, &DataType::Int64])?, true, )))) @@ -112,12 +120,14 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { } (t1, DataType::List(e)) if e.data_type() == &DataType::Boolean => { if t1 == &DataType::Boolean { - Ok(DataType::List(Box::new(NullableDataType::new( + Ok(DataType::List(Box::new(Field::new( + "item", DataType::Boolean, true, )))) } else { - Ok(DataType::List(Box::new(NullableDataType::new( + Ok(DataType::List(Box::new(Field::new( + "item", coerce_data_type(vec![t1, &DataType::Boolean])?, true, )))) @@ -125,12 +135,14 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { } (t1, DataType::List(e)) if e.data_type() == &DataType::Utf8 => { if t1 == &DataType::Utf8 { - Ok(DataType::List(Box::new(NullableDataType::new( + Ok(DataType::List(Box::new(Field::new( + "item", DataType::Utf8, true, )))) } else { - Ok(DataType::List(Box::new(NullableDataType::new( + Ok(DataType::List(Box::new(Field::new( + "item", coerce_data_type(vec![t1, &DataType::Utf8])?, true, )))) @@ -150,7 +162,8 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { _ => { // TODO(nevi_me) It's possible to have [float, int, list(float)], which should // return list(float). Will hash this out later - Ok(DataType::List(Box::new(NullableDataType::new( + Ok(DataType::List(Box::new(Field::new( + "item", DataType::Utf8, true, )))) @@ -291,13 +304,13 @@ pub fn infer_json_schema( if values.contains_key(k) { let x = values.get_mut(k).unwrap(); x.insert(DataType::List(Box::new( - NullableDataType::new(dt, true), + Field::new("item", dt, true), ))); } else { // create hashset and add value type let mut hs = HashSet::new(); hs.insert(DataType::List(Box::new( - NullableDataType::new(dt, true), + Field::new("item", dt, true), ))); values.insert(k.to_string(), hs); } @@ -1422,12 +1435,12 @@ mod tests { assert_eq!(&DataType::Int64, a.1.data_type()); let b = schema.column_with_name("b").unwrap(); assert_eq!( - &DataType::List(Box::new(NullableDataType::new(DataType::Float64, true))), + &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), b.1.data_type() ); let c = schema.column_with_name("c").unwrap(); assert_eq!( - &DataType::List(Box::new(NullableDataType::new(DataType::Boolean, true))), + &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), c.1.data_type() ); let d = schema.column_with_name("d").unwrap(); @@ -1480,35 +1493,35 @@ mod tests { use crate::datatypes::DataType::*; assert_eq!( - List(Box::new(NullableDataType::new(Float64, true))), + List(Box::new(Field::new("item", Float64, true))), coerce_data_type(vec![ &Float64, - &List(Box::new(NullableDataType::new(Float64, true))) + &List(Box::new(Field::new("item", Float64, true))) ]) .unwrap() ); assert_eq!( - List(Box::new(NullableDataType::new(Float64, true))), + List(Box::new(Field::new("item", Float64, true))), coerce_data_type(vec![ &Float64, - &List(Box::new(NullableDataType::new(Int64, true))) + &List(Box::new(Field::new("item", Int64, true))) ]) .unwrap() ); assert_eq!( - List(Box::new(NullableDataType::new(Int64, true))), + List(Box::new(Field::new("item", Int64, true))), coerce_data_type(vec![ &Int64, - &List(Box::new(NullableDataType::new(Int64, true))) + &List(Box::new(Field::new("item", Int64, true))) ]) .unwrap() ); // boolean and number are incompatible, return utf8 assert_eq!( - List(Box::new(NullableDataType::new(Utf8, true))), + List(Box::new(Field::new("item", Utf8, true))), coerce_data_type(vec![ &Boolean, - &List(Box::new(NullableDataType::new(Float64, true))) + &List(Box::new(Field::new("item", Float64, true))) ]) .unwrap() ); @@ -1541,17 +1554,17 @@ mod tests { assert_eq!(&DataType::Int64, a.1.data_type()); let b = schema.column_with_name("b").unwrap(); assert_eq!( - &DataType::List(Box::new(NullableDataType::new(DataType::Float64, true))), + &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), b.1.data_type() ); let c = schema.column_with_name("c").unwrap(); assert_eq!( - &DataType::List(Box::new(NullableDataType::new(DataType::Boolean, true))), + &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), c.1.data_type() ); let d = schema.column_with_name("d").unwrap(); assert_eq!( - &DataType::List(Box::new(NullableDataType::new(DataType::Utf8, true))), + &DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), d.1.data_type() ); @@ -1791,7 +1804,8 @@ mod tests { fn test_list_of_string_dictionary_from_json() { let schema = Schema::new(vec![Field::new( "events", - List(Box::new(NullableDataType::new( + List(Box::new(Field::new( + "item", Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), true, ))), @@ -1814,7 +1828,8 @@ mod tests { let events = schema.column_with_name("events").unwrap(); assert_eq!( - &List(Box::new(NullableDataType::new( + &List(Box::new(Field::new( + "item", Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), true ))), @@ -1848,7 +1863,8 @@ mod tests { fn test_list_of_string_dictionary_from_json_with_nulls() { let schema = Schema::new(vec![Field::new( "events", - List(Box::new(NullableDataType::new( + List(Box::new(Field::new( + "item", Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), true, ))), @@ -1873,7 +1889,8 @@ mod tests { let events = schema.column_with_name("events").unwrap(); assert_eq!( - &List(Box::new(NullableDataType::new( + &List(Box::new(Field::new( + "item", Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), true ))), @@ -2014,17 +2031,17 @@ mod tests { Field::new("a", DataType::Int64, true), Field::new( "b", - DataType::List(Box::new(NullableDataType::new(DataType::Float64, true))), + DataType::List(Box::new(Field::new("item", DataType::Float64, true))), true, ), Field::new( "c", - DataType::List(Box::new(NullableDataType::new(DataType::Boolean, true))), + DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), true, ), Field::new( "d", - DataType::List(Box::new(NullableDataType::new(DataType::Utf8, true))), + DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), true, ), ]); diff --git a/rust/arrow/src/record_batch.rs b/rust/arrow/src/record_batch.rs index 41cbd6dcfb0..b4aa97dd2a2 100644 --- a/rust/arrow/src/record_batch.rs +++ b/rust/arrow/src/record_batch.rs @@ -298,42 +298,6 @@ mod tests { assert!(!batch.is_ok()); } - #[test] - fn create_record_batch_with_matching_nested_type() { - let schema = Schema::new(vec![Field::new( - "list", - DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), - false, - )]); - - let child_data = Int32Array::from(vec![0, 1, 2, 3, 4, 5]); - let child_data_ref = Arc::new(ArrayData::new( - DataType::Int32, - 6, - None, - None, - 0, - vec![child_data.data_ref().buffers()[0].clone()], - vec![], - )); - - let offsets = UInt64Array::from(vec![0, 2, 4]); - let array_data = Arc::new(ArrayData::new( - DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), - 3, - None, - None, - 0, - vec![offsets.data_ref().buffers()[0].clone()], - vec![child_data_ref], - )); - - let list_array = Arc::new(ListArray::from(array_data)); - - let result = RecordBatch::try_new(Arc::new(schema), vec![list_array]); - assert!(result.is_ok()); - } - #[test] fn create_record_batch_from_struct_array() { let boolean_data = ArrayData::builder(DataType::Boolean) diff --git a/rust/arrow/src/util/integration_util.rs b/rust/arrow/src/util/integration_util.rs index 4e419968d7c..94d0a9b75a0 100644 --- a/rust/arrow/src/util/integration_util.rs +++ b/rust/arrow/src/util/integration_util.rs @@ -688,7 +688,11 @@ mod tests { Field::new("c3", DataType::Utf8, true), Field::new( "c4", - DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))), + DataType::List(Box::new(Field::new( + "custom_item", + DataType::Int32, + false, + ))), true, ), ]); @@ -758,7 +762,7 @@ mod tests { Field::new("utf8s", DataType::Utf8, true), Field::new( "lists", - DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), + DataType::List(Box::new(Field::new("item", DataType::Int32, true))), true, ), Field::new( @@ -835,7 +839,7 @@ mod tests { let value_data = Int32Array::from(vec![None, Some(2), None, None]); let value_offsets = Buffer::from(&[0, 3, 4, 4].to_byte_slice()); let list_data_type = - DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))); + DataType::List(Box::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/rust/datafusion/src/physical_plan/distinct_expressions.rs b/rust/datafusion/src/physical_plan/distinct_expressions.rs index a441c0b76c2..bbccc3be6eb 100644 --- a/rust/datafusion/src/physical_plan/distinct_expressions.rs +++ b/rust/datafusion/src/physical_plan/distinct_expressions.rs @@ -22,7 +22,7 @@ use std::fmt::Debug; use std::hash::Hash; use std::sync::Arc; -use arrow::datatypes::{DataType, Field, NullableDataType}; +use arrow::datatypes::{DataType, Field}; use ahash::RandomState; use std::collections::HashSet; @@ -81,10 +81,7 @@ impl AggregateExpr for DistinctCount { .map(|data_type| { Field::new( &format_state_name(&self.name, "count distinct"), - DataType::List(Box::new(NullableDataType::new( - data_type.clone(), - true, - ))), + DataType::List(Box::new(Field::new("item", data_type.clone(), true))), false, ) }) diff --git a/rust/datafusion/src/physical_plan/functions.rs b/rust/datafusion/src/physical_plan/functions.rs index 41fd2105191..b954f479d93 100644 --- a/rust/datafusion/src/physical_plan/functions.rs +++ b/rust/datafusion/src/physical_plan/functions.rs @@ -39,12 +39,11 @@ use crate::physical_plan::datetime_expressions; use crate::physical_plan::expressions::{nullif_func, SUPPORTED_NULLIF_TYPES}; use crate::physical_plan::math_expressions; use crate::physical_plan::string_expressions; -use arrow::datatypes::NullableDataType; use arrow::{ array::ArrayRef, compute::kernels::length::length, datatypes::TimeUnit, - datatypes::{DataType, Schema}, + datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, }; use fmt::{Debug, Formatter}; @@ -208,7 +207,7 @@ pub fn return_type( Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) } BuiltinScalarFunction::Array => Ok(DataType::FixedSizeList( - Box::new(NullableDataType::new(arg_types[0].clone(), true)), + Box::new(Field::new("item", arg_types[0].clone(), true)), arg_types.len() as i32, )), BuiltinScalarFunction::NullIf => { @@ -485,10 +484,7 @@ mod tests { assert_eq!( expr.data_type(&schema)?, // type equals to a common coercion - DataType::FixedSizeList( - Box::new(NullableDataType::new(expected_type, true)), - 2 - ) + DataType::FixedSizeList(Box::new(Field::new("item", expected_type, true)), 2) ); // evaluate works diff --git a/rust/datafusion/src/physical_plan/planner.rs b/rust/datafusion/src/physical_plan/planner.rs index d6082f3a720..b592111b592 100644 --- a/rust/datafusion/src/physical_plan/planner.rs +++ b/rust/datafusion/src/physical_plan/planner.rs @@ -786,7 +786,7 @@ mod tests { }; let plan = planner.create_physical_plan(&logical_plan, &ctx_state); - let expected_error = "Extension planner for NoOp created an ExecutionPlan with mismatched schema. LogicalPlan schema: Schema { fields: [Field { name: \"a\", data_type: NullableDataType { data_type: Int32, nullable: false }, dict_id: 0, dict_is_ordered: false }], metadata: {} }, ExecutionPlan schema: Schema { fields: [Field { name: \"b\", data_type: NullableDataType { data_type: Int32, nullable: false }, dict_id: 0, dict_is_ordered: false }], metadata: {} }"; + let expected_error = "Extension planner for NoOp created an ExecutionPlan with mismatched schema. LogicalPlan schema: Schema { fields: [Field { name: \"a\", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false }], metadata: {} }, ExecutionPlan schema: Schema { fields: [Field { name: \"b\", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false }], metadata: {} }"; match plan { Ok(_) => assert!(false, "Expected planning failure"), diff --git a/rust/datafusion/src/scalar.rs b/rust/datafusion/src/scalar.rs index c64f1a2d02d..06309ab84c0 100644 --- a/rust/datafusion/src/scalar.rs +++ b/rust/datafusion/src/scalar.rs @@ -23,7 +23,10 @@ use arrow::array::{ Int16Builder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, UInt16Builder, UInt32Builder, UInt64Builder, UInt8Builder, }; -use arrow::{array::ArrayRef, datatypes::DataType}; +use arrow::{ + array::ArrayRef, + datatypes::{DataType, Field}, +}; use arrow::{ array::{ Array, BooleanArray, Date32Array, Float32Array, Float64Array, Int16Array, @@ -34,7 +37,6 @@ use arrow::{ }; use crate::error::{DataFusionError, Result}; -use arrow::datatypes::NullableDataType; /// Represents a dynamically typed, nullable single value. /// This is the single-valued counter-part of arrow’s `Array`. @@ -134,7 +136,7 @@ impl ScalarValue { ScalarValue::Utf8(_) => DataType::Utf8, ScalarValue::LargeUtf8(_) => DataType::LargeUtf8, ScalarValue::List(_, data_type) => { - DataType::List(Box::new(NullableDataType::new(data_type.clone(), true))) + DataType::List(Box::new(Field::new("item", data_type.clone(), true))) } ScalarValue::Date32(_) => DataType::Date32(DateUnit::Day), } diff --git a/rust/datafusion/tests/sql.rs b/rust/datafusion/tests/sql.rs index bb503a102e4..7327487d3b0 100644 --- a/rust/datafusion/tests/sql.rs +++ b/rust/datafusion/tests/sql.rs @@ -25,7 +25,7 @@ extern crate datafusion; use arrow::{array::*, datatypes::TimeUnit}; use arrow::{datatypes::Int32Type, datatypes::Int64Type, record_batch::RecordBatch}; use arrow::{ - datatypes::{DataType, Field, NullableDataType, Schema, SchemaRef}, + datatypes::{DataType, Field, Schema, SchemaRef}, util::display::array_value_to_string, }; @@ -142,12 +142,12 @@ async fn parquet_list_columns() { let schema = Arc::new(Schema::new(vec![ Field::new( "int64_list", - DataType::List(Box::new(NullableDataType::new(DataType::Int64, true))), + DataType::List(Box::new(Field::new("item", DataType::Int64, true))), true, ), Field::new( "utf8_list", - DataType::List(Box::new(NullableDataType::new(DataType::Utf8, true))), + DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), true, ), ])); diff --git a/rust/integration-testing/src/bin/arrow-json-integration-test.rs b/rust/integration-testing/src/bin/arrow-json-integration-test.rs index 2ce36922b03..72a113fcdc2 100644 --- a/rust/integration-testing/src/bin/arrow-json-integration-test.rs +++ b/rust/integration-testing/src/bin/arrow-json-integration-test.rs @@ -408,14 +408,9 @@ fn array_from_json( } Ok(Arc::new(b.finish())) } - DataType::List(type_ctx) => { + DataType::List(child_field) => { let null_buf = create_null_buf(&json_col); let children = json_col.children.clone().unwrap(); - let child_field = Field::new( - "element", - type_ctx.data_type().clone(), - type_ctx.is_nullable(), - ); let child_array = array_from_json( &child_field, children.get(0).unwrap().clone(), @@ -436,14 +431,9 @@ fn array_from_json( .build(); Ok(Arc::new(ListArray::from(list_data))) } - DataType::LargeList(type_ctx) => { + DataType::LargeList(child_field) => { let null_buf = create_null_buf(&json_col); let children = json_col.children.clone().unwrap(); - let child_field = Field::new( - "element", - type_ctx.data_type().clone(), - type_ctx.is_nullable(), - ); let child_array = array_from_json( &child_field, children.get(0).unwrap().clone(), @@ -468,13 +458,8 @@ fn array_from_json( .build(); Ok(Arc::new(LargeListArray::from(list_data))) } - DataType::FixedSizeList(type_ctx, _) => { + DataType::FixedSizeList(child_field, _) => { let children = json_col.children.clone().unwrap(); - let child_field = Field::new( - "element", - type_ctx.data_type().clone(), - type_ctx.is_nullable(), - ); let child_array = array_from_json( &child_field, children.get(0).unwrap().clone(), @@ -495,8 +480,8 @@ fn array_from_json( .len(json_col.count) .null_bit_buffer(null_buf); - for (f, col) in fields.iter().zip(json_col.children.unwrap()) { - let array = array_from_json(f, col, dictionaries)?; + for (field, col) in fields.iter().zip(json_col.children.unwrap()) { + let array = array_from_json(field, col, dictionaries)?; array_data = array_data.add_child_data(array.data()); } diff --git a/rust/parquet/src/arrow/array_reader.rs b/rust/parquet/src/arrow/array_reader.rs index eeb71b0dc12..231b46dea55 100644 --- a/rust/parquet/src/arrow/array_reader.rs +++ b/rust/parquet/src/arrow/array_reader.rs @@ -40,7 +40,7 @@ use arrow::datatypes::{ DurationSecondType as ArrowDurationSecondType, Field, Float32Type as ArrowFloat32Type, Float64Type as ArrowFloat64Type, Int16Type as ArrowInt16Type, Int32Type as ArrowInt32Type, - Int64Type as ArrowInt64Type, Int8Type as ArrowInt8Type, NullableDataType, Schema, + Int64Type as ArrowInt64Type, Int8Type as ArrowInt8Type, Schema, Time32MillisecondType as ArrowTime32MillisecondType, Time32SecondType as ArrowTime32SecondType, Time64MicrosecondType as ArrowTime64MicrosecondType, @@ -1347,7 +1347,8 @@ impl<'a> TypeVisitor>, &'a ArrayReaderBuilderContext .ok() .map(|f| f.data_type().to_owned()) .unwrap_or_else(|| { - ArrowType::List(Box::new(NullableDataType::new( + ArrowType::List(Box::new(Field::new( + list_type.name(), item_reader_type.clone(), list_type.is_optional(), ))) @@ -1627,7 +1628,7 @@ mod tests { }; use arrow::datatypes::{ ArrowPrimitiveType, DataType as ArrowType, Date32Type as ArrowDate32, Field, - Int32Type as ArrowInt32, Int64Type as ArrowInt64, NullableDataType, + Int32Type as ArrowInt32, Int64Type as ArrowInt64, Time32MillisecondType as ArrowTime32MillisecondArray, Time64MicrosecondType as ArrowTime64MicrosecondArray, TimestampMicrosecondType as ArrowTimestampMicrosecondType, @@ -2310,7 +2311,7 @@ mod tests { let mut list_array_reader = ListArrayReader::::new( Box::new(item_array_reader), - ArrowType::List(Box::new(NullableDataType::new(ArrowType::Int32, false))), + ArrowType::List(Box::new(Field::new("item", ArrowType::Int32, true))), ArrowType::Int32, 1, 1, @@ -2364,7 +2365,7 @@ mod tests { let mut list_array_reader = ListArrayReader::::new( Box::new(item_array_reader), - ArrowType::LargeList(Box::new(NullableDataType::new(ArrowType::Int32, true))), + ArrowType::LargeList(Box::new(Field::new("item", ArrowType::Int32, true))), ArrowType::Int32, 1, 1, diff --git a/rust/parquet/src/arrow/arrow_writer.rs b/rust/parquet/src/arrow/arrow_writer.rs index 4dd35a70736..dc9cf70a374 100644 --- a/rust/parquet/src/arrow/arrow_writer.rs +++ b/rust/parquet/src/arrow/arrow_writer.rs @@ -688,8 +688,8 @@ mod tests { use std::sync::Arc; use arrow::array::*; + use arrow::datatypes::ToByteSlice; use arrow::datatypes::{DataType, Field, Schema, UInt32Type, UInt8Type}; - use arrow::datatypes::{NullableDataType, ToByteSlice}; use arrow::record_batch::RecordBatch; use crate::arrow::{ArrowReader, ParquetFileArrowReader}; @@ -776,7 +776,7 @@ mod tests { // define schema let schema = Schema::new(vec![Field::new( "a", - DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), + DataType::List(Box::new(Field::new("item", DataType::Int32, true))), false, )]); @@ -789,9 +789,11 @@ mod tests { arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); // Construct a list array from the above two - let a_list_data = ArrayData::builder(DataType::List(Box::new( - NullableDataType::new(DataType::Int32, true), - ))) + let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new( + "items", + DataType::Int32, + true, + )))) .len(5) .add_buffer(a_value_offsets) .add_child_data(a_values.data()) @@ -874,7 +876,7 @@ mod tests { let struct_field_f = Field::new("f", DataType::Float32, true); let struct_field_g = Field::new( "g", - DataType::List(Box::new(NullableDataType::new(DataType::Int16, false))), + DataType::List(Box::new(Field::new("items", DataType::Int16, false))), false, ); let struct_field_e = Field::new( @@ -1295,9 +1297,11 @@ mod tests { let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); let a_value_offsets = arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); - let a_list_data = ArrayData::builder(DataType::List(Box::new( - NullableDataType::new(DataType::Int32, true), - ))) + let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) .len(5) .add_buffer(a_value_offsets) .add_child_data(a_values.data()) @@ -1318,9 +1322,11 @@ mod tests { let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); let a_value_offsets = arrow::buffer::Buffer::from(&[0i64, 1, 3, 3, 6, 10].to_byte_slice()); - let a_list_data = ArrayData::builder(DataType::LargeList(Box::new( - NullableDataType::new(DataType::Int32, true), - ))) + let a_list_data = ArrayData::builder(DataType::LargeList(Box::new(Field::new( + "large_item", + DataType::Int32, + true, + )))) .len(5) .add_buffer(a_value_offsets) .add_child_data(a_values.data()) diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs index b0c3564ecb1..c93325b79b1 100644 --- a/rust/parquet/src/arrow/schema.rs +++ b/rust/parquet/src/arrow/schema.rs @@ -26,7 +26,7 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use arrow::datatypes::{DataType, DateUnit, Field, NullableDataType, Schema, TimeUnit}; +use arrow::datatypes::{DataType, DateUnit, Field, Schema, TimeUnit}; use arrow::ipc::writer; use crate::basic::{LogicalType, Repetition, Type as PhysicalType}; @@ -412,25 +412,18 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_repetition(repetition) .build() } - DataType::List(type_ctx) - | DataType::FixedSizeList(type_ctx, _) - | DataType::LargeList(type_ctx) => Type::group_type_builder(name) - .with_fields(&mut vec![Arc::new( - Type::group_type_builder("list") - .with_fields(&mut vec![Arc::new({ - let list_field = Field::new( - "element", - type_ctx.data_type().clone(), - type_ctx.is_nullable(), - ); - arrow_to_parquet_type(&list_field)? - })]) - .with_repetition(Repetition::REPEATED) - .build()?, - )]) - .with_logical_type(LogicalType::LIST) - .with_repetition(Repetition::REQUIRED) - .build(), + DataType::List(f) | DataType::FixedSizeList(f, _) | DataType::LargeList(f) => { + Type::group_type_builder(name) + .with_fields(&mut vec![Arc::new( + Type::group_type_builder("list") + .with_fields(&mut vec![Arc::new(arrow_to_parquet_type(f)?)]) + .with_repetition(Repetition::REPEATED) + .build()?, + )]) + .with_logical_type(LogicalType::LIST) + .with_repetition(Repetition::REQUIRED) + .build() + } DataType::Struct(fields) => { if fields.is_empty() { return Err(ArrowError( @@ -545,7 +538,8 @@ impl ParquetTypeConverter<'_> { if self.is_self_included() { self.to_primitive_type_inner().map(|dt| { if self.is_repeated() { - Some(DataType::List(Box::new(NullableDataType::new( + Some(DataType::List(Box::new(Field::new( + self.schema.name(), dt, self.is_nullable(), )))) @@ -644,7 +638,8 @@ impl ParquetTypeConverter<'_> { if self.is_repeated() { self.to_struct().map(|opt| { opt.map(|dt| { - DataType::List(Box::new(NullableDataType::new( + DataType::List(Box::new(Field::new( + self.schema.name(), dt, self.is_nullable(), ))) @@ -736,7 +731,8 @@ impl ParquetTypeConverter<'_> { item_type.map(|opt| { opt.map(|dt| { - DataType::List(Box::new(NullableDataType::new( + DataType::List(Box::new(Field::new( + list_item.name(), dt, list_item.is_optional(), ))) @@ -756,9 +752,7 @@ mod tests { use std::{collections::HashMap, convert::TryFrom, sync::Arc}; - use arrow::datatypes::{ - DataType, DateUnit, Field, IntervalUnit, NullableDataType, TimeUnit, - }; + use arrow::datatypes::{DataType, DateUnit, Field, IntervalUnit, TimeUnit}; use crate::file::{metadata::KeyValue, reader::SerializedFileReader}; use crate::{ @@ -917,7 +911,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(NullableDataType::new(DataType::Utf8, true))), + DataType::List(Box::new(Field::new("list", DataType::Utf8, true))), false, )); } @@ -931,7 +925,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(NullableDataType::new(DataType::Utf8, true))), + DataType::List(Box::new(Field::new("list", DataType::Utf8, true))), true, )); } @@ -950,10 +944,10 @@ mod tests { // } { let arrow_inner_list = - DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))); + DataType::List(Box::new(Field::new("list", DataType::Int32, true))); arrow_fields.push(Field::new( "array_of_arrays", - DataType::List(Box::new(NullableDataType::new(arrow_inner_list, true))), + DataType::List(Box::new(Field::new("list", arrow_inner_list, true))), true, )); } @@ -967,7 +961,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(NullableDataType::new(DataType::Utf8, true))), + DataType::List(Box::new(Field::new("element", DataType::Utf8, true))), true, )); } @@ -979,7 +973,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), + DataType::List(Box::new(Field::new("element", DataType::Int32, true))), true, )); } @@ -998,7 +992,7 @@ mod tests { ]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(NullableDataType::new(arrow_struct, true))), + DataType::List(Box::new(Field::new("element", arrow_struct, true))), true, )); } @@ -1015,7 +1009,7 @@ mod tests { DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(NullableDataType::new(arrow_struct, true))), + DataType::List(Box::new(Field::new("array", arrow_struct, true))), true, )); } @@ -1032,7 +1026,7 @@ mod tests { DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(NullableDataType::new(arrow_struct, true))), + DataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, true))), true, )); } @@ -1042,7 +1036,7 @@ mod tests { { arrow_fields.push(Field::new( "name", - DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), + DataType::List(Box::new(Field::new("name", DataType::Int32, true))), true, )); } @@ -1208,7 +1202,8 @@ mod tests { let inner_group_list = Field::new( "innerGroup", - DataType::List(Box::new(NullableDataType::new( + DataType::List(Box::new(Field::new( + "innerGroup", DataType::Struct(vec![Field::new("leaf3", DataType::Int32, true)]), true, ))), @@ -1217,7 +1212,8 @@ mod tests { let outer_group_list = Field::new( "outerGroup", - DataType::List(Box::new(NullableDataType::new( + DataType::List(Box::new(Field::new( + "outerGroup", DataType::Struct(vec![ Field::new("leaf2", DataType::Int32, true), inner_group_list, @@ -1293,7 +1289,7 @@ mod tests { Field::new("string", DataType::Utf8, true), Field::new( "bools", - DataType::List(Box::new(NullableDataType::new(DataType::Boolean, true))), + DataType::List(Box::new(Field::new("bools", DataType::Boolean, true))), true, ), Field::new("date", DataType::Date32(DateUnit::Day), true), @@ -1363,7 +1359,7 @@ mod tests { Field::new("string", DataType::Utf8, true), Field::new( "bools", - DataType::List(Box::new(NullableDataType::new(DataType::Boolean, true))), + DataType::List(Box::new(Field::new("element", DataType::Boolean, true))), true, ), Field::new("date", DataType::Date32(DateUnit::Day), true), @@ -1386,7 +1382,8 @@ mod tests { Field::new("uint32", DataType::UInt32, false), Field::new( "int32", - DataType::List(Box::new(NullableDataType::new( + DataType::List(Box::new(Field::new( + "element", DataType::Int32, true, ))), @@ -1499,10 +1496,7 @@ mod tests { Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), Field::new( "c21", - DataType::List(Box::new(NullableDataType::new( - DataType::Boolean, - true, - ))), + DataType::List(Box::new(Field::new("list", DataType::Boolean, true))), false, ), // Field::new( @@ -1597,7 +1591,8 @@ mod tests { vec![ Field::new( "c21", - DataType::List(Box::new(NullableDataType::new( + DataType::List(Box::new(Field::new( + "array", DataType::Boolean, true, ))), @@ -1606,15 +1601,17 @@ mod tests { Field::new( "c22", DataType::FixedSizeList( - Box::new(NullableDataType::new(DataType::Boolean, false)), + Box::new(Field::new("items", DataType::Boolean, false)), 5, ), false, ), Field::new( "c23", - DataType::List(Box::new(NullableDataType::new( - DataType::LargeList(Box::new(NullableDataType::new( + DataType::List(Box::new(Field::new( + "items", + DataType::LargeList(Box::new(Field::new( + "items", DataType::Struct(vec![ Field::new("a", DataType::Int16, true), Field::new("b", DataType::Float64, false),