From 9dc7994f35cabcaf4e504197fa662b8abb59cc18 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sun, 11 Oct 2020 10:06:54 +0200 Subject: [PATCH 1/6] ARROW-10261: [Rust] Change list types to use Field This is part 1 of potentially 3 parts, it only changes the `arrow` crate `parquet` depends on `arrow`, and `datafusion` depends on both. `datafusion` will thus only compile after `parquet` is fixed. --- rust/arrow/examples/builders.rs | 3 +- rust/arrow/src/array/array.rs | 51 +++-- rust/arrow/src/array/builder.rs | 93 ++++++--- rust/arrow/src/compute/kernels/cast.rs | 74 ++++--- rust/arrow/src/compute/kernels/comparison.rs | 5 +- rust/arrow/src/compute/kernels/concat.rs | 6 +- rust/arrow/src/compute/kernels/filter.rs | 7 +- rust/arrow/src/compute/kernels/limit.rs | 3 +- rust/arrow/src/compute/kernels/take.rs | 10 +- rust/arrow/src/compute/util.rs | 4 +- rust/arrow/src/datatypes.rs | 130 ++++++++---- rust/arrow/src/ipc/convert.rs | 151 +++++++------- rust/arrow/src/ipc/reader.rs | 8 +- rust/arrow/src/json/reader.rs | 197 +++++++++++++------ rust/arrow/src/record_batch.rs | 3 +- rust/arrow/src/util/integration_util.rs | 35 ++-- rust/arrow/test/data/integration.json | 4 +- 17 files changed, 514 insertions(+), 270 deletions(-) diff --git a/rust/arrow/examples/builders.rs b/rust/arrow/examples/builders.rs index 01edf2d99fa..61cce0ed97a 100644 --- a/rust/arrow/examples/builders.rs +++ b/rust/arrow/examples/builders.rs @@ -99,7 +99,8 @@ fn main() { let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs index f35c02aa3d0..8556c99e079 100644 --- a/rust/arrow/src/array/array.rs +++ b/rust/arrow/src/array/array.rs @@ -2927,7 +2927,8 @@ mod tests { let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -2996,7 +2997,8 @@ mod tests { let value_offsets = Buffer::from(&[0i64, 3, 6, 8].to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::LargeList(Box::new(DataType::Int32)); + let list_data_type = + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -3147,7 +3149,10 @@ mod tests { .build(); // Construct a list array from the above two - let list_data_type = DataType::FixedSizeList(Box::new(DataType::Int32), 3); + let list_data_type = DataType::FixedSizeList( + Box::new(Field::new("item", DataType::Int32, false)), + 3, + ); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_child_data(value_data.clone()) @@ -3213,7 +3218,10 @@ mod tests { .build(); // Construct a list array from the above two - let list_data_type = DataType::FixedSizeList(Box::new(DataType::Int32), 3); + let list_data_type = DataType::FixedSizeList( + Box::new(Field::new("item", DataType::Int32, false)), + 3, + ); let list_data = ArrayData::builder(list_data_type) .len(3) .add_child_data(value_data) @@ -3244,7 +3252,8 @@ mod tests { bit_util::set_bit(&mut null_bits, 8); // Construct a list array from the above two - let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) @@ -3308,7 +3317,8 @@ mod tests { bit_util::set_bit(&mut null_bits, 8); // Construct a list array from the above two - let list_data_type = DataType::LargeList(Box::new(DataType::Int32)); + let list_data_type = + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) @@ -3370,7 +3380,10 @@ mod tests { bit_util::set_bit(&mut null_bits, 4); // Construct a fixed size list array from the above two - let list_data_type = DataType::FixedSizeList(Box::new(DataType::Int32), 2); + let list_data_type = DataType::FixedSizeList( + Box::new(Field::new("item", DataType::Int32, false)), + 2, + ); let list_data = ArrayData::builder(list_data_type) .len(5) .add_child_data(value_data.clone()) @@ -3418,7 +3431,8 @@ mod tests { .len(8) .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice())) .build(); - let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_child_data(value_data) @@ -3432,7 +3446,8 @@ mod tests { )] fn test_list_array_invalid_child_array_len() { let value_offsets = Buffer::from(&[0, 2, 5, 7].to_byte_slice()); - let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -3450,7 +3465,8 @@ mod tests { let value_offsets = Buffer::from(&[2, 2, 5, 7].to_byte_slice()); - let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -3843,11 +3859,13 @@ mod tests { .add_child_data(ArrayData::builder(DataType::Boolean).build()) .build(); - let array_data = - ArrayData::builder(DataType::FixedSizeList(Box::new(DataType::Binary), 4)) - .len(3) - .add_child_data(values_data) - .build(); + let array_data = ArrayData::builder(DataType::FixedSizeList( + Box::new(Field::new("item", DataType::Binary, false)), + 4, + )) + .len(3) + .add_child_data(values_data) + .build(); let list_array = FixedSizeListArray::from(array_data); FixedSizeBinaryArray::from(list_array); } @@ -4197,7 +4215,8 @@ mod tests { .add_buffer(Buffer::from(values.to_byte_slice())) .build(); - let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .add_buffer(buf2) .add_child_data(value_data) diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index e93dfd10319..12a27d11436 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -775,7 +775,11 @@ where /// /// This is used for validating array data types in `append_data` fn data_type(&self) -> DataType { - DataType::List(Box::new(self.values_builder.data_type())) + DataType::List(Box::new(Field::new( + "item", + self.values_builder.data_type(), + true, + ))) } /// Returns the builder as a mutable `Any` reference. @@ -839,15 +843,19 @@ where let offset_buffer = self.offsets_builder.finish(); let null_bit_buffer = self.bitmap_builder.finish(); + let nulls = bit_util::count_set_bits(null_bit_buffer.data()); self.offsets_builder.append(0).unwrap(); - let data = - ArrayData::builder(DataType::List(Box::new(values_data.data_type().clone()))) - .len(len) - .null_count(len - bit_util::count_set_bits(null_bit_buffer.data())) - .add_buffer(offset_buffer) - .add_child_data(values_data) - .null_bit_buffer(null_bit_buffer) - .build(); + let data = ArrayData::builder(DataType::List(Box::new(Field::new( + "item", + values_data.data_type().clone(), + nulls > 0, + )))) + .len(len) + .null_count(len - nulls) + .add_buffer(offset_buffer) + .add_child_data(values_data) + .null_bit_buffer(null_bit_buffer) + .build(); ListArray::from(data) } @@ -980,7 +988,11 @@ where /// /// This is used for validating array data types in `append_data` fn data_type(&self) -> DataType { - DataType::LargeList(Box::new(self.values_builder.data_type())) + DataType::LargeList(Box::new(Field::new( + "item", + self.values_builder.data_type(), + true, + ))) } /// Returns the builder as a mutable `Any` reference. @@ -1044,12 +1056,15 @@ where let offset_buffer = self.offsets_builder.finish(); let null_bit_buffer = self.bitmap_builder.finish(); + let nulls = bit_util::count_set_bits(null_bit_buffer.data()); self.offsets_builder.append(0).unwrap(); - let data = ArrayData::builder(DataType::LargeList(Box::new( + let data = ArrayData::builder(DataType::LargeList(Box::new(Field::new( + "item", values_data.data_type().clone(), - ))) + nulls > 0, + )))) .len(len) - .null_count(len - bit_util::count_set_bits(null_bit_buffer.data())) + .null_count(len - nulls) .add_buffer(offset_buffer) .add_child_data(values_data) .null_bit_buffer(null_bit_buffer) @@ -1155,7 +1170,10 @@ where /// /// This is used for validating array data types in `append_data` fn data_type(&self) -> DataType { - DataType::FixedSizeList(Box::new(self.values_builder.data_type()), self.list_len) + DataType::FixedSizeList( + Box::new(Field::new("item", self.values_builder.data_type(), true)), + self.list_len, + ) } /// Returns the builder as a mutable `Any` reference. @@ -1230,12 +1248,17 @@ where } let null_bit_buffer = self.bitmap_builder.finish(); + let nulls = bit_util::count_set_bits(null_bit_buffer.data()); let data = ArrayData::builder(DataType::FixedSizeList( - Box::new(values_data.data_type().clone()), + Box::new(Field::new( + "item", + values_data.data_type().clone(), + nulls > 0, + )), self.list_len, )) .len(len) - .null_count(len - bit_util::count_set_bits(null_bit_buffer.data())) + .null_count(len - nulls) .add_child_data(values_data) .null_bit_buffer(null_bit_buffer) .build(); @@ -1445,7 +1468,7 @@ fn append_binary_data( )) as ArrayDataRef; Arc::new(ArrayData::new( - DataType::List(Box::new(DataType::UInt8)), + DataType::List(Box::new(Field::new("item", DataType::UInt8, true))), array.len(), None, array.null_buffer().cloned(), @@ -1497,7 +1520,11 @@ fn append_large_binary_data( )) as ArrayDataRef; Arc::new(ArrayData::new( - DataType::LargeList(Box::new(DataType::UInt8)), + DataType::LargeList(Box::new(Field::new( + "item", + DataType::UInt8, + true, + ))), array.len(), None, array.null_buffer().cloned(), @@ -1595,7 +1622,10 @@ impl ArrayBuilder for FixedSizeBinaryBuilder { vec![], )) as ArrayDataRef; let list_data = Arc::new(ArrayData::new( - DataType::FixedSizeList(Box::new(DataType::UInt8), self.builder.list_len), + DataType::FixedSizeList( + Box::new(Field::new("item", DataType::UInt8, true)), + self.builder.list_len, + ), array.len(), None, array.null_buffer().cloned(), @@ -3368,11 +3398,14 @@ mod tests { } #[test] - #[should_panic(expected = "Data type List(Int64) is not currently supported")] + #[should_panic( + expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false }) is not currently supported" + )] fn test_struct_array_builder_from_schema_unsupported_type() { let mut fields = Vec::new(); fields.push(Field::new("f1", DataType::Int16, false)); - let list_type = DataType::List(Box::new(DataType::Int64)); + let list_type = + DataType::List(Box::new(Field::new("item", DataType::Int64, true))); fields.push(Field::new("f2", list_type, false)); let _ = StructBuilder::from_fields(fields, 5); @@ -3667,7 +3700,7 @@ mod tests { let list_value_offsets = Buffer::from(&[0, 3, 5, 11, 13, 13, 15, 15, 17].to_byte_slice()); let expected_list_data = ArrayData::new( - DataType::List(Box::new(DataType::Int64)), + DataType::List(Box::new(Field::new("item", DataType::Int64, true))), 8, None, None, @@ -3753,7 +3786,7 @@ mod tests { &[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23].to_byte_slice(), ); let expected_list_data = ArrayData::new( - DataType::List(Box::new(DataType::Int64)), + DataType::List(Box::new(Field::new("item", DataType::Int64, true))), 12, None, None, @@ -3795,7 +3828,7 @@ mod tests { ]); let list_value_offsets = Buffer::from(&[0, 2, 3, 6].to_byte_slice()); let list_data = ArrayData::new( - DataType::List(Box::new(DataType::Utf8)), + DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), 3, None, None, @@ -3830,7 +3863,7 @@ mod tests { ]); let list_value_offsets = Buffer::from(&[0, 2, 2, 4, 5, 8, 9, 12].to_byte_slice()); let expected_list_data = ArrayData::new( - DataType::List(Box::new(DataType::Utf8)), + DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), 7, None, None, // is this correct? @@ -3918,7 +3951,10 @@ mod tests { Some(12), ]); let expected_list_data = ArrayData::new( - DataType::FixedSizeList(Box::new(DataType::UInt16), 2), + DataType::FixedSizeList( + Box::new(Field::new("item", DataType::UInt16, true)), + 2, + ), 12, None, None, @@ -3988,7 +4024,10 @@ mod tests { None, ]); let expected_list_data = ArrayData::new( - DataType::FixedSizeList(Box::new(DataType::UInt8), 2), + DataType::FixedSizeList( + Box::new(Field::new("item", DataType::UInt8, true)), + 2, + ), 12, None, None, diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs index ab34c6a0950..9d1b7fc6011 100644 --- a/rust/arrow/src/compute/kernels/cast.rs +++ b/rust/arrow/src/compute/kernels/cast.rs @@ -57,9 +57,11 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { match (from_type, to_type) { (Struct(_), _) => false, (_, Struct(_)) => false, - (List(list_from), List(list_to)) => can_cast_types(list_from, list_to), + (List(list_from), List(list_to)) => { + can_cast_types(list_from.data_type(), list_to.data_type()) + } (List(_), _) => false, - (_, List(list_to)) => can_cast_types(from_type, list_to), + (_, List(list_to)) => can_cast_types(from_type, list_to.data_type()), (Dictionary(_, from_value_type), Dictionary(_, to_value_type)) => { can_cast_types(from_value_type, to_value_type) } @@ -243,9 +245,9 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { (List(_), List(ref to)) => { let data = array.data_ref(); let underlying_array = make_array(data.child_data()[0].clone()); - let cast_array = cast(&underlying_array, &to)?; + let cast_array = cast(&underlying_array, to.data_type())?; let array_data = ArrayData::new( - *to.clone(), + to.data_type().clone(), array.len(), Some(cast_array.null_count()), cast_array @@ -266,12 +268,12 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { )), (_, List(ref to)) => { // cast primitive to list's primitive - let cast_array = cast(array, &to)?; + let cast_array = cast(array, to.data_type())?; // create offsets, where if array.len() = 2, we have [0,1,2] let offsets: Vec = (0..=array.len() as i32).collect(); let value_offsets = Buffer::from(offsets[..].to_byte_slice()); let list_data = ArrayData::new( - *to.clone(), + to.data_type().clone(), array.len(), Some(cast_array.null_count()), cast_array @@ -1219,7 +1221,11 @@ mod tests { fn test_cast_i32_to_list_i32() { let a = Int32Array::from(vec![5, 6, 7, 8, 9]); let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::List(Box::new(DataType::Int32))).unwrap(); + let b = cast( + &array, + &DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + ) + .unwrap(); assert_eq!(5, b.len()); let arr = b.as_any().downcast_ref::().unwrap(); assert_eq!(0, arr.value_offset(0)); @@ -1245,7 +1251,11 @@ mod tests { fn test_cast_i32_to_list_i32_nullable() { let a = Int32Array::from(vec![Some(5), None, Some(7), Some(8), Some(9)]); let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::List(Box::new(DataType::Int32))).unwrap(); + let b = cast( + &array, + &DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + ) + .unwrap(); assert_eq!(5, b.len()); assert_eq!(1, b.null_count()); let arr = b.as_any().downcast_ref::().unwrap(); @@ -1274,7 +1284,11 @@ mod tests { let a = Int32Array::from(vec![Some(5), None, Some(7), Some(8), None, Some(10)]); let array = Arc::new(a) as ArrayRef; let array = array.slice(2, 4); - let b = cast(&array, &DataType::List(Box::new(DataType::Float64))).unwrap(); + let b = cast( + &array, + &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + ) + .unwrap(); assert_eq!(4, b.len()); assert_eq!(1, b.null_count()); let arr = b.as_any().downcast_ref::().unwrap(); @@ -1348,7 +1362,8 @@ mod tests { let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -1356,8 +1371,11 @@ mod tests { .build(); let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; - let cast_array = - cast(&list_array, &DataType::List(Box::new(DataType::UInt16))).unwrap(); + let cast_array = cast( + &list_array, + &DataType::List(Box::new(Field::new("item", DataType::UInt16, true))), + ) + .unwrap(); // 3 negative values should get lost when casting to unsigned, // 1 value should overflow assert_eq!(4, cast_array.null_count()); @@ -1403,7 +1421,8 @@ mod tests { let value_offsets = Buffer::from(&[0, 3, 6, 9].to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -1413,7 +1432,11 @@ mod tests { cast( &list_array, - &DataType::List(Box::new(DataType::Timestamp(TimeUnit::Microsecond, None))), + &DataType::List(Box::new(Field::new( + "item", + DataType::Timestamp(TimeUnit::Microsecond, None), + true, + ))), ) .unwrap(); } @@ -2816,7 +2839,8 @@ mod tests { let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -2837,7 +2861,8 @@ mod tests { let value_offsets = Buffer::from(&[0i64, 3, 6, 8].to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::LargeList(Box::new(DataType::Int32)); + let list_data_type = + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -2856,7 +2881,10 @@ mod tests { .build(); // Construct a fixed size list array from the above two - let list_data_type = DataType::FixedSizeList(Box::new(DataType::Int32), 2); + let list_data_type = DataType::FixedSizeList( + Box::new(Field::new("item", DataType::Int32, true)), + 2, + ); let list_data = ArrayData::builder(list_data_type) .len(5) .add_child_data(value_data.clone()) @@ -2949,12 +2977,12 @@ mod tests { LargeBinary, Utf8, LargeUtf8, - List(Box::new(DataType::Int8)), - List(Box::new(DataType::Utf8)), - FixedSizeList(Box::new(DataType::Int8), 10), - FixedSizeList(Box::new(DataType::Utf8), 10), - LargeList(Box::new(DataType::Int8)), - LargeList(Box::new(DataType::Utf8)), + List(Box::new(Field::new("item", DataType::Int8, true))), + List(Box::new(Field::new("item", DataType::Utf8, true))), + FixedSizeList(Box::new(Field::new("item", DataType::Int8, true)), 10), + FixedSizeList(Box::new(Field::new("item", DataType::Utf8, false)), 10), + LargeList(Box::new(Field::new("item", DataType::Int8, true))), + LargeList(Box::new(Field::new("item", DataType::Utf8, false))), Struct(vec![ Field::new("f1", DataType::Int32, false), Field::new("f2", DataType::Utf8, true), diff --git a/rust/arrow/src/compute/kernels/comparison.rs b/rust/arrow/src/compute/kernels/comparison.rs index 72d0f6bff94..88bb49987af 100644 --- a/rust/arrow/src/compute/kernels/comparison.rs +++ b/rust/arrow/src/compute/kernels/comparison.rs @@ -681,8 +681,8 @@ fn new_all_set_buffer(len: usize) -> Buffer { #[cfg(test)] mod tests { use super::*; - use crate::array::Int32Array; use crate::datatypes::{Int8Type, ToByteSlice}; + use crate::{array::Int32Array, datatypes::Field}; #[test] fn test_primitive_array_eq() { @@ -950,7 +950,8 @@ mod tests { ]) .data(); let value_offsets = Buffer::from(&[0i64, 3, 6, 6, 9].to_byte_slice()); - let list_data_type = DataType::LargeList(Box::new(DataType::Int32)); + let list_data_type = + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(4) .add_buffer(value_offsets) diff --git a/rust/arrow/src/compute/kernels/concat.rs b/rust/arrow/src/compute/kernels/concat.rs index 4dc945611e8..d07d35e82b1 100644 --- a/rust/arrow/src/compute/kernels/concat.rs +++ b/rust/arrow/src/compute/kernels/concat.rs @@ -114,7 +114,9 @@ pub fn concat(array_list: &[ArrayRef]) -> Result { DataType::Duration(TimeUnit::Nanosecond) => { concat_primitive::(array_data_list) } - DataType::List(nested_type) => concat_list(array_data_list, *nested_type.clone()), + DataType::List(nested_field) => { + concat_list(array_data_list, nested_field.data_type()) + } t => Err(ArrowError::ComputeError(format!( "Concat not supported for data type {:?}", t @@ -145,7 +147,7 @@ where #[inline] fn concat_list( array_data_list: &[ArrayDataRef], - data_type: DataType, + data_type: &DataType, ) -> Result { match data_type { DataType::Int8 => concat_primitive_list::(array_data_list), diff --git a/rust/arrow/src/compute/kernels/filter.rs b/rust/arrow/src/compute/kernels/filter.rs index 2e6ed1264a9..05d534840f3 100644 --- a/rust/arrow/src/compute/kernels/filter.rs +++ b/rust/arrow/src/compute/kernels/filter.rs @@ -488,7 +488,7 @@ impl FilterContext { key_type, value_type ))) } - DataType::List(dt) => match &**dt { + DataType::List(dt) => match dt.data_type() { DataType::UInt8 => { filter_primitive_item_list_array!(self, array, UInt8Type, ListArray, ListBuilder) } @@ -601,7 +601,7 @@ impl FilterContext { ))) } } - DataType::LargeList(dt) => match &**dt { + DataType::LargeList(dt) => match dt.data_type() { DataType::UInt8 => { filter_primitive_item_list_array!(self, array, UInt8Type, LargeListArray, LargeListBuilder) } @@ -1085,7 +1085,8 @@ mod tests { let value_offsets = Buffer::from(&[0i64, 3, 6, 8, 8].to_byte_slice()); - let list_data_type = DataType::LargeList(Box::new(DataType::Int32)); + let list_data_type = + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(4) .add_buffer(value_offsets) diff --git a/rust/arrow/src/compute/kernels/limit.rs b/rust/arrow/src/compute/kernels/limit.rs index 44b6c1c5d75..65f66bce8e5 100644 --- a/rust/arrow/src/compute/kernels/limit.rs +++ b/rust/arrow/src/compute/kernels/limit.rs @@ -110,7 +110,8 @@ mod tests { bit_util::set_bit(&mut null_bits, 8); // Construct a list array from the above two - let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs index 9cb7f36cb85..35ca7b3c9d2 100644 --- a/rust/arrow/src/compute/kernels/take.rs +++ b/rust/arrow/src/compute/kernels/take.rs @@ -778,7 +778,7 @@ mod tests { let value_offsets: [$offset_type; 4] = [0, 3, 6, 8]; let value_offsets = Buffer::from(&value_offsets.to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new(DataType::Int32)); + let list_data_type = DataType::$list_data_type(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets) @@ -847,7 +847,7 @@ mod tests { let value_offsets: [$offset_type; 5] = [0, 3, 6, 7, 9]; let value_offsets = Buffer::from(&value_offsets.to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new(DataType::Int32)); + let list_data_type = DataType::$list_data_type(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets) @@ -916,7 +916,7 @@ mod tests { let value_offsets: [$offset_type; 5] = [0, 3, 6, 6, 8]; let value_offsets = Buffer::from(&value_offsets.to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new(DataType::Int32)); + let list_data_type = DataType::$list_data_type(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets) @@ -1006,7 +1006,7 @@ mod tests { // Construct offsets let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data_type = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets) @@ -1160,4 +1160,4 @@ mod tests { assert_eq!(expected_keys.data_type(), result_keys.data_type()); assert_eq!(expected_keys, result_keys); } -} +} \ No newline at end of file diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs index 686cc8b542f..b3836bf4dd4 100644 --- a/rust/arrow/src/compute/util.rs +++ b/rust/arrow/src/compute/util.rs @@ -321,7 +321,7 @@ mod tests { #[test] fn test_take_value_index_from_list() { let list = build_list( - DataType::List(Box::new(DataType::Int32)), + DataType::List(Box::new(Field::new("item", DataType::Int32, true))), Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), vec![0i32, 2i32, 5i32, 10i32], ); @@ -397,4 +397,4 @@ mod tests { } } } -} +} \ No newline at end of file diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 6f0dc240ac9..71902a6af55 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -125,11 +125,11 @@ pub enum DataType { /// A variable-length string in Unicode with UFT-8 encoding and 64-bit offsets. LargeUtf8, /// A list of some logical data type with variable length. - List(Box), + List(Box), /// A list of some logical data type with fixed length. - FixedSizeList(Box, i32), + FixedSizeList(Box, i32), /// A list of some logical data type with variable length and 64-bit offsets. - LargeList(Box), + LargeList(Box), /// A nested datatype that contains a number of sub-fields. Struct(Vec), /// A nested datatype that can represent slots of differing types. @@ -189,8 +189,8 @@ pub struct Field { name: String, data_type: DataType, nullable: bool, - pub(crate) dict_id: i64, - pub(crate) dict_is_ordered: bool, + dict_id: i64, + dict_is_ordered: bool, } pub trait ArrowNativeType: @@ -872,8 +872,38 @@ impl ToByteSlice for T { } impl DataType { + /// Compare data types considering that list fields only match on type + // TODO: remove this custom comparator when we have clarity on how list field + // names should be handled between Arrow and Parquet + pub fn compare(&self, other: &Self) -> bool { + match (self, other) { + (DataType::List(f1), DataType::List(f2)) + | (DataType::LargeList(f1), DataType::LargeList(f2)) => { + f1.data_type().compare(f2.data_type()) + } + (DataType::FixedSizeList(f1, l1), DataType::FixedSizeList(f2, l2)) => { + f1.data_type().compare(f2.data_type()) && l1 == l2 + } + (DataType::Struct(fields1), DataType::Struct(fields2)) => { + if fields1.len() != fields2.len() { + return false; + } + fields1.iter().zip(fields2).all(|(a, b)| { + // fields must be the same, but if any field is a list child, + // we should only compare datatypes + a.data_type().compare(b.data_type()) + && a.name() == b.name() + && a.is_nullable() == b.is_nullable() + && a.dict_id == b.dict_id + && a.dict_is_ordered == b.dict_is_ordered + }) + } + _ => self == other, + } + } /// Parse a data type from a JSON representation fn from(json: &Value) -> Result { + let default_field = Field::new("", DataType::Boolean, true); match *json { Value::Object(ref map) => match map.get("name") { Some(s) if s == "null" => Ok(DataType::Null), @@ -1007,17 +1037,17 @@ impl DataType { }, Some(s) if s == "list" => { // return a list with any type as its child isn't defined in the map - Ok(DataType::List(Box::new(DataType::Boolean))) + Ok(DataType::List(Box::new(default_field))) } Some(s) if s == "largelist" => { // return a largelist with any type as its child isn't defined in the map - Ok(DataType::LargeList(Box::new(DataType::Boolean))) + Ok(DataType::LargeList(Box::new(default_field))) } Some(s) if s == "fixedsizelist" => { // return a list with any type as its child isn't defined in the map if let Some(Value::Number(size)) = map.get("listSize") { Ok(DataType::FixedSizeList( - Box::new(DataType::Boolean), + Box::new(default_field), size.as_i64().unwrap() as i32, )) } else { @@ -1182,6 +1212,18 @@ impl Field { self.nullable } + /// IReturns the dictionary ID + #[inline] + pub const fn dict_id(&self) -> i64 { + self.dict_id + } + + /// Indicates whether this `Field`'s dictionary is ordered + #[inline] + pub const fn dict_is_ordered(&self) -> bool { + self.dict_is_ordered + } + /// Parse a `Field` definition from a JSON representation pub fn from(json: &Value) -> Result { match *json { @@ -1223,14 +1265,14 @@ impl Field { } match data_type { DataType::List(_) => DataType::List(Box::new( - Self::from(&values[0])?.data_type, + Self::from(&values[0])?, )), DataType::LargeList(_) => DataType::LargeList(Box::new( - Self::from(&values[0])?.data_type, + Self::from(&values[0])?, )), DataType::FixedSizeList(_, int) => { DataType::FixedSizeList( - Box::new(Self::from(&values[0])?.data_type), + Box::new(Self::from(&values[0])?), int, ) } @@ -1322,18 +1364,9 @@ impl Field { pub fn to_json(&self) -> Value { let children: Vec = match self.data_type() { DataType::Struct(fields) => fields.iter().map(|f| f.to_json()).collect(), - DataType::List(dtype) => { - let item = Field::new("item", *dtype.clone(), self.nullable); - vec![item.to_json()] - } - DataType::LargeList(dtype) => { - let item = Field::new("item", *dtype.clone(), self.nullable); - vec![item.to_json()] - } - DataType::FixedSizeList(dtype, _) => { - let item = Field::new("item", *dtype.clone(), self.nullable); - vec![item.to_json()] - } + DataType::List(field) => vec![field.to_json()], + DataType::LargeList(field) => vec![field.to_json()], + DataType::FixedSizeList(field, _) => vec![field.to_json()], _ => vec![], }; match self.data_type() { @@ -1958,17 +1991,30 @@ mod tests { ), Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false), Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), - Field::new("c21", DataType::List(Box::new(DataType::Boolean)), false), + Field::new( + "c21", + DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + false, + ), Field::new( "c22", - DataType::FixedSizeList(Box::new(DataType::Boolean), 5), + DataType::FixedSizeList( + Box::new(Field::new("bools", DataType::Boolean, false)), + 5, + ), false, ), Field::new( "c23", - DataType::List(Box::new(DataType::List(Box::new(DataType::Struct( - vec![], - ))))), + DataType::List(Box::new(Field::new( + "inner_list", + DataType::List(Box::new(Field::new( + "struct", + DataType::Struct(vec![]), + true, + ))), + false, + ))), true, ), Field::new( @@ -1999,9 +2045,15 @@ mod tests { Field::new("c33", DataType::LargeUtf8, true), Field::new( "c34", - DataType::LargeList(Box::new(DataType::LargeList(Box::new( - DataType::Struct(vec![]), - )))), + DataType::LargeList(Box::new(Field::new( + "inner_large_list", + DataType::LargeList(Box::new(Field::new( + "struct", + DataType::Struct(vec![]), + false, + ))), + true, + ))), true, ), ], @@ -2207,7 +2259,7 @@ mod tests { "children": [ { "name": "item", - "nullable": false, + "nullable": true, "type": { "name": "bool" }, @@ -2224,7 +2276,7 @@ mod tests { }, "children": [ { - "name": "item", + "name": "bools", "nullable": false, "type": { "name": "bool" @@ -2241,14 +2293,14 @@ mod tests { }, "children": [ { - "name": "item", - "nullable": true, + "name": "inner_list", + "nullable": false, "type": { "name": "list" }, "children": [ { - "name": "item", + "name": "struct", "nullable": true, "type": { "name": "struct" @@ -2381,15 +2433,15 @@ mod tests { }, "children": [ { - "name": "item", + "name": "inner_large_list", "nullable": true, "type": { "name": "largelist" }, "children": [ { - "name": "item", - "nullable": true, + "name": "struct", + "nullable": false, "type": { "name": "struct" }, diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index 63d55f043c6..dffd9a711bd 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -268,30 +268,22 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT if children.len() != 1 { panic!("expect a list to have one child") } - let child_field = children.get(0); - // returning int16 for now, to test, not sure how to get data type - DataType::List(Box::new(get_data_type(child_field, false))) + DataType::List(Box::new(children.get(0).into())) } ipc::Type::LargeList => { let children = field.children().unwrap(); if children.len() != 1 { panic!("expect a large list to have one child") } - let child_field = children.get(0); - // returning int16 for now, to test, not sure how to get data type - DataType::LargeList(Box::new(get_data_type(child_field, false))) + DataType::LargeList(Box::new(children.get(0).into())) } ipc::Type::FixedSizeList => { let children = field.children().unwrap(); if children.len() != 1 { panic!("expect a list to have one child") } - let child_field = children.get(0); let fsl = field.type_as_fixed_size_list().unwrap(); - DataType::FixedSizeList( - Box::new(get_data_type(child_field, false)), - fsl.listSize(), - ) + DataType::FixedSizeList(Box::new(children.get(0).into()), fsl.listSize()) } ipc::Type::Struct_ => { let mut fields = vec![]; @@ -324,8 +316,8 @@ pub(crate) fn build_field<'a: 'b, 'b>( let fb_dictionary = if let Dictionary(index_type, _) = field.data_type() { Some(get_fb_dictionary( index_type, - field.dict_id, - field.dict_is_ordered, + field.dict_id(), + field.dict_is_ordered(), fbb, )) } else { @@ -537,19 +529,20 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>( } } List(ref list_type) => { - let inner_types = get_fb_field_type(list_type, fbb); - let child = ipc::Field::create( - fbb, - &ipc::FieldArgs { - name: None, - nullable: false, - type_type: inner_types.type_type, - type_: Some(inner_types.type_), - children: inner_types.children, - dictionary: None, - custom_metadata: None, - }, - ); + let child = build_field(fbb, list_type); + // let inner_types = get_fb_field_type(list_type, fbb); + // let child = ipc::Field::create( + // fbb, + // &ipc::FieldArgs { + // name: None, + // nullable: false, + // type_type: inner_types.type_type, + // type_: Some(inner_types.type_), + // children: inner_types.children, + // dictionary: None, + // custom_metadata: None, + // }, + // ); FBFieldType { type_type: ipc::Type::List, type_: ipc::ListBuilder::new(fbb).finish().as_union_value(), @@ -557,19 +550,20 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>( } } LargeList(ref list_type) => { - let inner_types = get_fb_field_type(list_type, fbb); - let child = ipc::Field::create( - fbb, - &ipc::FieldArgs { - name: None, - nullable: false, - type_type: inner_types.type_type, - type_: Some(inner_types.type_), - dictionary: None, - children: inner_types.children, - custom_metadata: None, - }, - ); + let child = build_field(fbb, list_type); + // let inner_types = get_fb_field_type(list_type, fbb); + // let child = ipc::Field::create( + // fbb, + // &ipc::FieldArgs { + // name: None, + // nullable: false, + // type_type: inner_types.type_type, + // type_: Some(inner_types.type_), + // dictionary: None, + // children: inner_types.children, + // custom_metadata: None, + // }, + // ); FBFieldType { type_type: ipc::Type::LargeList, type_: ipc::LargeListBuilder::new(fbb).finish().as_union_value(), @@ -577,19 +571,20 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>( } } FixedSizeList(ref list_type, len) => { - let inner_types = get_fb_field_type(list_type, fbb); - let child = ipc::Field::create( - fbb, - &ipc::FieldArgs { - name: None, - nullable: false, - type_type: inner_types.type_type, - type_: Some(inner_types.type_), - dictionary: None, - children: inner_types.children, - custom_metadata: None, - }, - ); + let child = build_field(fbb, list_type); + // let inner_types = get_fb_field_type(list_type, fbb); + // let child = ipc::Field::create( + // fbb, + // &ipc::FieldArgs { + // name: None, + // nullable: false, + // type_type: inner_types.type_type, + // type_: Some(inner_types.type_), + // dictionary: None, + // children: inner_types.children, + // custom_metadata: None, + // }, + // ); let mut builder = ipc::FixedSizeListBuilder::new(fbb); builder.add_listSize(*len as i32); FBFieldType { @@ -735,14 +730,22 @@ mod tests { ), Field::new("utf8", DataType::Utf8, false), Field::new("binary", DataType::Binary, false), - Field::new("list[u8]", DataType::List(Box::new(DataType::UInt8)), true), + Field::new( + "list[u8]", + DataType::List(Box::new(Field::new("item", DataType::UInt8, false))), + true, + ), Field::new( "list[struct]", - DataType::List(Box::new(DataType::Struct(vec![ - Field::new("float32", DataType::UInt8, false), - Field::new("int32", DataType::Int32, true), - Field::new("bool", DataType::Boolean, true), - ]))), + DataType::List(Box::new(Field::new( + "struct", + DataType::Struct(vec![ + Field::new("float32", DataType::UInt8, false), + Field::new("int32", DataType::Int32, true), + Field::new("bool", DataType::Boolean, true), + ]), + true, + ))), false, ), Field::new( @@ -751,18 +754,26 @@ mod tests { Field::new("int64", DataType::Int64, true), Field::new( "list[struct]>]", - DataType::List(Box::new(DataType::Struct(vec![ - Field::new( - "date32", - DataType::Date32(DateUnit::Day), - true, - ), - Field::new( - "list[struct<>]", - DataType::List(Box::new(DataType::Struct(vec![]))), - false, - ), - ]))), + DataType::List(Box::new(Field::new( + "struct", + DataType::Struct(vec![ + Field::new( + "date32", + DataType::Date32(DateUnit::Day), + true, + ), + Field::new( + "list[struct<>]", + DataType::List(Box::new(Field::new( + "struct", + DataType::Struct(vec![]), + false, + ))), + false, + ), + ]), + false, + ))), false, ), ]), diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs index e4bb003d0bc..05cf00b822b 100644 --- a/rust/arrow/src/ipc/reader.rs +++ b/rust/arrow/src/ipc/reader.rs @@ -89,7 +89,7 @@ fn create_array( buffer_index += 2; array } - List(ref list_data_type) | LargeList(ref list_data_type) => { + List(ref list_field) | LargeList(ref list_field) => { let list_node = &nodes[node_index]; let list_buffers: Vec = buffers[buffer_index..buffer_index + 2] .iter() @@ -99,7 +99,7 @@ fn create_array( buffer_index += 2; let triple = create_array( nodes, - list_data_type, + list_field.data_type(), data, buffers, dictionaries, @@ -111,7 +111,7 @@ fn create_array( create_list_array(list_node, data_type, &list_buffers[..], triple.0) } - FixedSizeList(ref list_data_type, _) => { + FixedSizeList(ref list_field, _) => { let list_node = &nodes[node_index]; let list_buffers: Vec = buffers[buffer_index..=buffer_index] .iter() @@ -121,7 +121,7 @@ fn create_array( buffer_index += 1; let triple = create_array( nodes, - list_data_type, + list_field.data_type(), data, buffers, dictionaries, diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs index 42ae3506469..895945825ae 100644 --- a/rust/arrow/src/json/reader.rs +++ b/rust/arrow/src/json/reader.rs @@ -65,54 +65,86 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { 1 => Ok(dt[0].clone()), 2 => { // there can be a case where a list and scalar both exist - if dt.contains(&&DataType::List(Box::new(DataType::Float64))) - || dt.contains(&&DataType::List(Box::new(DataType::Int64))) - || dt.contains(&&DataType::List(Box::new(DataType::Boolean))) - || dt.contains(&&DataType::List(Box::new(DataType::Utf8))) - { + if dt.contains(&&DataType::List(Box::new(Field::new( + "item", + DataType::Float64, + true, + )))) || dt.contains(&&DataType::List(Box::new(Field::new( + "item", + DataType::Int64, + true, + )))) || dt.contains(&&DataType::List(Box::new(Field::new( + "item", + DataType::Boolean, + true, + )))) || dt.contains(&&DataType::List(Box::new(Field::new( + "item", + DataType::Utf8, + true, + )))) { // we have a list and scalars, so we should get the values and coerce them let mut dt = dt; // sorting guarantees that the list will be the second value dt.sort(); match (dt[0], dt[1]) { - (t1, DataType::List(e)) if **e == DataType::Float64 => { + (t1, DataType::List(e)) if e.data_type() == &DataType::Float64 => { if t1 == &DataType::Float64 { - Ok(DataType::List(Box::new(DataType::Float64))) + Ok(DataType::List(Box::new(Field::new( + "item", + DataType::Float64, + true, + )))) } else { - Ok(DataType::List(Box::new(coerce_data_type(vec![ - t1, - &DataType::Float64, - ])?))) + Ok(DataType::List(Box::new(Field::new( + "item", + coerce_data_type(vec![t1, &DataType::Float64])?, + true, + )))) } } - (t1, DataType::List(e)) if **e == DataType::Int64 => { + (t1, DataType::List(e)) if e.data_type() == &DataType::Int64 => { if t1 == &DataType::Int64 { - Ok(DataType::List(Box::new(DataType::Int64))) + Ok(DataType::List(Box::new(Field::new( + "item", + DataType::Int64, + true, + )))) } else { - Ok(DataType::List(Box::new(coerce_data_type(vec![ - t1, - &DataType::Int64, - ])?))) + Ok(DataType::List(Box::new(Field::new( + "item", + coerce_data_type(vec![t1, &DataType::Int64])?, + true, + )))) } } - (t1, DataType::List(e)) if **e == DataType::Boolean => { + (t1, DataType::List(e)) if e.data_type() == &DataType::Boolean => { if t1 == &DataType::Boolean { - Ok(DataType::List(Box::new(DataType::Boolean))) + Ok(DataType::List(Box::new(Field::new( + "item", + DataType::Boolean, + true, + )))) } else { - Ok(DataType::List(Box::new(coerce_data_type(vec![ - t1, - &DataType::Boolean, - ])?))) + Ok(DataType::List(Box::new(Field::new( + "item", + coerce_data_type(vec![t1, &DataType::Boolean])?, + true, + )))) } } - (t1, DataType::List(e)) if **e == DataType::Utf8 => { + (t1, DataType::List(e)) if e.data_type() == &DataType::Utf8 => { if t1 == &DataType::Utf8 { - Ok(DataType::List(Box::new(DataType::Utf8))) + Ok(DataType::List(Box::new(Field::new( + "item", + DataType::Utf8, + true, + )))) } else { - Ok(DataType::List(Box::new(coerce_data_type(vec![ - t1, - &DataType::Utf8, - ])?))) + Ok(DataType::List(Box::new(Field::new( + "item", + coerce_data_type(vec![t1, &DataType::Utf8])?, + true, + )))) } } (t1, t2) => Err(ArrowError::JsonError(format!( @@ -129,7 +161,11 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { _ => { // TODO(nevi_me) It's possible to have [float, int, list(float)], which should // return list(float). Will hash this out later - Ok(DataType::List(Box::new(DataType::Utf8))) + Ok(DataType::List(Box::new(Field::new( + "item", + DataType::Utf8, + true, + )))) } } } @@ -268,11 +304,15 @@ pub fn infer_json_schema( if values.contains_key(k) { let x = values.get_mut(k).unwrap(); - x.insert(DataType::List(Box::new(dt))); + x.insert(DataType::List(Box::new( + Field::new("item", dt, true), + ))); } else { // create hashset and add value type let mut hs = HashSet::new(); - hs.insert(DataType::List(Box::new(dt))); + hs.insert(DataType::List(Box::new( + Field::new("item", dt, true), + ))); values.insert(k.to_string(), hs); } } @@ -563,7 +603,7 @@ impl Reader { Ok(Arc::new(builder.finish()) as ArrayRef) } DataType::List(ref t) => { - match **t { + match t.data_type() { DataType::Int8 => { self.build_list_array::(rows, field.name()) } @@ -1352,12 +1392,12 @@ mod tests { assert_eq!(&DataType::Int64, a.1.data_type()); let b = schema.column_with_name("b").unwrap(); assert_eq!( - &DataType::List(Box::new(DataType::Float64)), + &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), b.1.data_type() ); let c = schema.column_with_name("c").unwrap(); assert_eq!( - &DataType::List(Box::new(DataType::Boolean)), + &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), c.1.data_type() ); let d = schema.column_with_name("d").unwrap(); @@ -1410,21 +1450,37 @@ mod tests { use crate::datatypes::DataType::*; assert_eq!( - List(Box::new(Float64)), - coerce_data_type(vec![&Float64, &List(Box::new(Float64))]).unwrap() + List(Box::new(Field::new("item", Float64, true))), + coerce_data_type(vec![ + &Float64, + &List(Box::new(Field::new("item", Float64, true))) + ]) + .unwrap() ); assert_eq!( - List(Box::new(Float64)), - coerce_data_type(vec![&Float64, &List(Box::new(Int64))]).unwrap() + List(Box::new(Field::new("item", Float64, true))), + coerce_data_type(vec![ + &Float64, + &List(Box::new(Field::new("item", Int64, true))) + ]) + .unwrap() ); assert_eq!( - List(Box::new(Int64)), - coerce_data_type(vec![&Int64, &List(Box::new(Int64))]).unwrap() + List(Box::new(Field::new("item", Int64, true))), + coerce_data_type(vec![ + &Int64, + &List(Box::new(Field::new("item", Int64, true))) + ]) + .unwrap() ); // boolean and number are incompatible, return utf8 assert_eq!( - List(Box::new(Utf8)), - coerce_data_type(vec![&Boolean, &List(Box::new(Float64))]).unwrap() + List(Box::new(Field::new("item", Utf8, true))), + coerce_data_type(vec![ + &Boolean, + &List(Box::new(Field::new("item", Float64, true))) + ]) + .unwrap() ); } @@ -1455,16 +1511,19 @@ mod tests { assert_eq!(&DataType::Int64, a.1.data_type()); let b = schema.column_with_name("b").unwrap(); assert_eq!( - &DataType::List(Box::new(DataType::Float64)), + &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), b.1.data_type() ); let c = schema.column_with_name("c").unwrap(); assert_eq!( - &DataType::List(Box::new(DataType::Boolean)), + &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), c.1.data_type() ); let d = schema.column_with_name("d").unwrap(); - assert_eq!(&DataType::List(Box::new(DataType::Utf8)), d.1.data_type()); + assert_eq!( + &DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + d.1.data_type() + ); let bb = batch .column(b.0) @@ -1655,9 +1714,10 @@ mod tests { fn test_list_of_string_dictionary_from_json() { let schema = Schema::new(vec![Field::new( "events", - List(Box::new(Dictionary( - Box::new(DataType::UInt64), - Box::new(DataType::Utf8), + List(Box::new(Field::new( + "item", + Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), + true, ))), true, )]); @@ -1678,9 +1738,10 @@ mod tests { let events = schema.column_with_name("events").unwrap(); assert_eq!( - &List(Box::new(Dictionary( - Box::new(DataType::UInt64), - Box::new(DataType::Utf8) + &List(Box::new(Field::new( + "item", + Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), + true ))), events.1.data_type() ); @@ -1712,9 +1773,10 @@ mod tests { fn test_list_of_string_dictionary_from_json_with_nulls() { let schema = Schema::new(vec![Field::new( "events", - List(Box::new(Dictionary( - Box::new(DataType::UInt64), - Box::new(DataType::Utf8), + List(Box::new(Field::new( + "item", + Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), + true, ))), true, )]); @@ -1737,9 +1799,10 @@ mod tests { let events = schema.column_with_name("events").unwrap(); assert_eq!( - &List(Box::new(Dictionary( - Box::new(DataType::UInt64), - Box::new(DataType::Utf8) + &List(Box::new(Field::new( + "item", + Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), + true ))), events.1.data_type() ); @@ -1876,9 +1939,21 @@ mod tests { fn test_json_infer_schema() { let schema = Schema::new(vec![ Field::new("a", DataType::Int64, true), - Field::new("b", DataType::List(Box::new(DataType::Float64)), true), - Field::new("c", DataType::List(Box::new(DataType::Boolean)), true), - Field::new("d", DataType::List(Box::new(DataType::Utf8)), true), + Field::new( + "b", + DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + true, + ), + Field::new( + "c", + DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + true, + ), + Field::new( + "d", + DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + true, + ), ]); let mut reader = diff --git a/rust/arrow/src/record_batch.rs b/rust/arrow/src/record_batch.rs index 484c9ff8972..86885b8f8ae 100644 --- a/rust/arrow/src/record_batch.rs +++ b/rust/arrow/src/record_batch.rs @@ -99,7 +99,8 @@ impl RecordBatch { "all columns in a record batch must have the same length".to_string(), )); } - if column.data_type() != schema.field(i).data_type() { + // list types can have different names, but we only need the data types to be the same + if !column.data_type().compare(schema.field(i).data_type()) { return Err(ArrowError::InvalidArgumentError(format!( "column types must match schema types, expected {:?} but found {:?} at column index {}", schema.field(i).data_type(), diff --git a/rust/arrow/src/util/integration_util.rs b/rust/arrow/src/util/integration_util.rs index c1bba13aee6..5b9e1bac3fa 100644 --- a/rust/arrow/src/util/integration_util.rs +++ b/rust/arrow/src/util/integration_util.rs @@ -375,9 +375,9 @@ impl ArrowJsonBatch { /// Convert an Arrow JSON column/array into a vector of `Value` fn json_from_col(col: &ArrowJsonColumn, data_type: &DataType) -> Vec { match data_type { - DataType::List(dt) => json_from_list_col(col, &**dt), - DataType::FixedSizeList(dt, list_size) => { - json_from_fixed_size_list_col(col, &**dt, *list_size as usize) + DataType::List(field) => json_from_list_col(col, field.data_type()), + DataType::FixedSizeList(field, list_size) => { + json_from_fixed_size_list_col(col, field.data_type(), *list_size as usize) } DataType::Struct(fields) => json_from_struct_col(col, fields), DataType::Int64 @@ -474,7 +474,7 @@ fn json_from_list_col(col: &ArrowJsonColumn, data_type: &DataType) -> Vec }) .collect(); let inner = match data_type { - DataType::List(ref dt) => json_from_col(child, &**dt), + DataType::List(ref field) => json_from_col(child, field.data_type()), DataType::Struct(fields) => json_from_struct_col(col, fields), _ => merge_json_array( child.validity.as_ref().unwrap().as_slice(), @@ -511,8 +511,8 @@ fn json_from_fixed_size_list_col( // get the inner array let child = &col.children.clone().expect("list type must have children")[0]; let inner = match data_type { - DataType::List(ref dt) => json_from_col(child, &**dt), - DataType::FixedSizeList(ref dt, _) => json_from_col(child, &**dt), + DataType::List(ref field) => json_from_col(child, field.data_type()), + DataType::FixedSizeList(ref field, _) => json_from_col(child, field.data_type()), DataType::Struct(fields) => json_from_struct_col(col, fields), _ => merge_json_array( child.validity.as_ref().unwrap().as_slice(), @@ -577,13 +577,13 @@ mod tests { "nullable": true, "children": [ { - "name": "item", + "name": "custom_item", "type": { "name": "int", "isSigned": true, "bitWidth": 32 }, - "nullable": true, + "nullable": false, "children": [] } ] @@ -595,7 +595,15 @@ mod tests { Field::new("c1", DataType::Int32, true), Field::new("c2", DataType::Float64, true), Field::new("c3", DataType::Utf8, true), - Field::new("c4", DataType::List(Box::new(DataType::Int32)), true), + Field::new( + "c4", + DataType::List(Box::new(Field::new( + "custom_item", + DataType::Int32, + false, + ))), + true, + ), ]); assert!(json_schema.equals_schema(&schema)); } @@ -661,7 +669,11 @@ mod tests { true, ), Field::new("utf8s", DataType::Utf8, true), - Field::new("lists", DataType::List(Box::new(DataType::Int32)), true), + Field::new( + "lists", + DataType::List(Box::new(Field::new("itemx", DataType::Int32, false))), + true, + ), Field::new( "structs", DataType::Struct(vec![ @@ -735,7 +747,8 @@ mod tests { let value_data = Int32Array::from(vec![None, Some(2), None, None]); let value_offsets = Buffer::from(&[0, 3, 4, 4].to_byte_slice()); - let list_data_type = DataType::List(Box::new(DataType::Int32)); + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/rust/arrow/test/data/integration.json b/rust/arrow/test/data/integration.json index d54277981d5..df9dbb7e77c 100644 --- a/rust/arrow/test/data/integration.json +++ b/rust/arrow/test/data/integration.json @@ -257,13 +257,13 @@ "nullable": true, "children": [ { - "name": "item", + "name": "itemx", "type": { "name": "int", "isSigned": true, "bitWidth": 32 }, - "nullable": true, + "nullable": false, "children": [] } ] From d53b8393a44c68a63bfceb9e0e08666c2a042b79 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sat, 7 Nov 2020 09:55:39 +0200 Subject: [PATCH 2/6] Change Arrow lists to using Field type --- rust/parquet/src/arrow/array_reader.rs | 10 +- rust/parquet/src/arrow/arrow_writer.rs | 52 +++++--- rust/parquet/src/arrow/schema.rs | 157 +++++++++++++++++-------- 3 files changed, 147 insertions(+), 72 deletions(-) diff --git a/rust/parquet/src/arrow/array_reader.rs b/rust/parquet/src/arrow/array_reader.rs index a0026f3e289..a455e48abb6 100644 --- a/rust/parquet/src/arrow/array_reader.rs +++ b/rust/parquet/src/arrow/array_reader.rs @@ -1348,7 +1348,11 @@ impl<'a> TypeVisitor>, &'a ArrayReaderBuilderContext .ok() .map(|f| f.data_type().to_owned()) .unwrap_or_else(|| { - ArrowType::List(Box::new(item_reader_type.clone())) + ArrowType::List(Box::new(Field::new( + list_type.name(), + item_reader_type.clone(), + list_type.is_optional(), + ))) }); let list_array_reader: Box = match arrow_type { @@ -2309,7 +2313,7 @@ mod tests { let mut list_array_reader = ListArrayReader::::new( Box::new(item_array_reader), - ArrowType::List(Box::new(ArrowType::Int32)), + ArrowType::List(Box::new(Field::new("item", ArrowType::Int32, true))), ArrowType::Int32, 1, 1, @@ -2363,7 +2367,7 @@ mod tests { let mut list_array_reader = ListArrayReader::::new( Box::new(item_array_reader), - ArrowType::LargeList(Box::new(ArrowType::Int32)), + ArrowType::LargeList(Box::new(Field::new("item", ArrowType::Int32, true))), ArrowType::Int32, 1, 1, diff --git a/rust/parquet/src/arrow/arrow_writer.rs b/rust/parquet/src/arrow/arrow_writer.rs index de8bc7c1d46..f2279fbcd92 100644 --- a/rust/parquet/src/arrow/arrow_writer.rs +++ b/rust/parquet/src/arrow/arrow_writer.rs @@ -722,7 +722,7 @@ mod tests { // define schema let schema = Schema::new(vec![Field::new( "a", - DataType::List(Box::new(DataType::Int32)), + DataType::List(Box::new(Field::new("item", DataType::Int32, true))), false, )]); @@ -735,11 +735,15 @@ mod tests { arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); // Construct a list array from the above two - let a_list_data = ArrayData::builder(DataType::List(Box::new(DataType::Int32))) - .len(5) - .add_buffer(a_value_offsets) - .add_child_data(a_values.data()) - .build(); + let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new( + "items", + DataType::Int32, + true, + )))) + .len(5) + .add_buffer(a_value_offsets) + .add_child_data(a_values.data()) + .build(); let a = ListArray::from(a_list_data); // build a record batch @@ -816,8 +820,11 @@ mod tests { // define schema let struct_field_d = Field::new("d", DataType::Float64, true); let struct_field_f = Field::new("f", DataType::Float32, true); - let struct_field_g = - Field::new("g", DataType::List(Box::new(DataType::Int16)), false); + let struct_field_g = Field::new( + "g", + DataType::List(Box::new(Field::new("items", DataType::Int16, false))), + false, + ); let struct_field_e = Field::new( "e", DataType::Struct(vec![struct_field_f.clone(), struct_field_g.clone()]), @@ -1251,11 +1258,15 @@ mod tests { let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); let a_value_offsets = arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); - let a_list_data = ArrayData::builder(DataType::List(Box::new(DataType::Int32))) - .len(5) - .add_buffer(a_value_offsets) - .add_child_data(a_values.data()) - .build(); + let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) + .len(5) + .add_buffer(a_value_offsets) + .add_child_data(a_values.data()) + .build(); // I think this setup is incorrect because this should pass assert_eq!(a_list_data.null_count(), 1); @@ -1272,12 +1283,15 @@ mod tests { let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); let a_value_offsets = arrow::buffer::Buffer::from(&[0i64, 1, 3, 3, 6, 10].to_byte_slice()); - let a_list_data = - ArrayData::builder(DataType::LargeList(Box::new(DataType::Int32))) - .len(5) - .add_buffer(a_value_offsets) - .add_child_data(a_values.data()) - .build(); + let a_list_data = ArrayData::builder(DataType::LargeList(Box::new(Field::new( + "large_item", + DataType::Int32, + true, + )))) + .len(5) + .add_buffer(a_value_offsets) + .add_child_data(a_values.data()) + .build(); // I think this setup is incorrect because this should pass assert_eq!(a_list_data.null_count(), 1); diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs index 10270fff464..c84e8955501 100644 --- a/rust/parquet/src/arrow/schema.rs +++ b/rust/parquet/src/arrow/schema.rs @@ -406,22 +406,18 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_repetition(repetition) .build() } - DataType::List(dtype) - | DataType::FixedSizeList(dtype, _) - | DataType::LargeList(dtype) => Type::group_type_builder(name) - .with_fields(&mut vec![Rc::new( - Type::group_type_builder("list") - .with_fields(&mut vec![Rc::new({ - let list_field = - Field::new("element", *dtype.clone(), field.is_nullable()); - arrow_to_parquet_type(&list_field)? - })]) - .with_repetition(Repetition::REPEATED) - .build()?, - )]) - .with_logical_type(LogicalType::LIST) - .with_repetition(Repetition::REQUIRED) - .build(), + DataType::List(f) | DataType::FixedSizeList(f, _) | DataType::LargeList(f) => { + Type::group_type_builder(name) + .with_fields(&mut vec![Rc::new( + Type::group_type_builder("list") + .with_fields(&mut vec![Rc::new(arrow_to_parquet_type(f)?)]) + .with_repetition(Repetition::REPEATED) + .build()?, + )]) + .with_logical_type(LogicalType::LIST) + .with_repetition(Repetition::REQUIRED) + .build() + } DataType::Struct(fields) => { if fields.is_empty() { return Err(ArrowError( @@ -536,7 +532,11 @@ impl ParquetTypeConverter<'_> { if self.is_self_included() { self.to_primitive_type_inner().map(|dt| { if self.is_repeated() { - Some(DataType::List(Box::new(dt))) + Some(DataType::List(Box::new(Field::new( + self.schema.name(), + dt, + self.is_nullable(), + )))) } else { Some(dt) } @@ -630,8 +630,15 @@ impl ParquetTypeConverter<'_> { /// This function takes care of logical type and repetition. fn to_group_type(&self) -> Result> { if self.is_repeated() { - self.to_struct() - .map(|opt| opt.map(|dt| DataType::List(Box::new(dt)))) + self.to_struct().map(|opt| { + opt.map(|dt| { + DataType::List(Box::new(Field::new( + self.schema.name(), + dt, + self.is_nullable(), + ))) + }) + }) } else { match self.schema.get_basic_info().logical_type() { LogicalType::LIST => self.to_list(), @@ -716,7 +723,15 @@ impl ParquetTypeConverter<'_> { } }; - item_type.map(|opt| opt.map(|dt| DataType::List(Box::new(dt)))) + item_type.map(|opt| { + opt.map(|dt| { + DataType::List(Box::new(Field::new( + list_item.name(), + dt, + list_item.is_optional(), + ))) + }) + }) } _ => Err(ArrowError( "Group element type of list can only contain one field.".to_string(), @@ -890,7 +905,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(DataType::Utf8)), + DataType::List(Box::new(Field::new("list", DataType::Utf8, true))), false, )); } @@ -904,7 +919,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(DataType::Utf8)), + DataType::List(Box::new(Field::new("list", DataType::Utf8, true))), true, )); } @@ -922,10 +937,11 @@ mod tests { // } // } { - let arrow_inner_list = DataType::List(Box::new(DataType::Int32)); + let arrow_inner_list = + DataType::List(Box::new(Field::new("list", DataType::Int32, true))); arrow_fields.push(Field::new( "array_of_arrays", - DataType::List(Box::new(arrow_inner_list)), + DataType::List(Box::new(Field::new("list", arrow_inner_list, true))), true, )); } @@ -939,7 +955,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(DataType::Utf8)), + DataType::List(Box::new(Field::new("element", DataType::Utf8, true))), true, )); } @@ -951,7 +967,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(DataType::Int32)), + DataType::List(Box::new(Field::new("element", DataType::Int32, true))), true, )); } @@ -970,7 +986,7 @@ mod tests { ]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(arrow_struct)), + DataType::List(Box::new(Field::new("element", arrow_struct, true))), true, )); } @@ -987,7 +1003,7 @@ mod tests { DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(arrow_struct)), + DataType::List(Box::new(Field::new("array", arrow_struct, true))), true, )); } @@ -1004,7 +1020,7 @@ mod tests { DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(arrow_struct)), + DataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, true))), true, )); } @@ -1014,7 +1030,7 @@ mod tests { { arrow_fields.push(Field::new( "name", - DataType::List(Box::new(DataType::Int32)), + DataType::List(Box::new(Field::new("name", DataType::Int32, true))), true, )); } @@ -1180,20 +1196,24 @@ mod tests { let inner_group_list = Field::new( "innerGroup", - DataType::List(Box::new(DataType::Struct(vec![Field::new( - "leaf3", - DataType::Int32, + DataType::List(Box::new(Field::new( + "innerGroup", + DataType::Struct(vec![Field::new("leaf3", DataType::Int32, true)]), true, - )]))), + ))), true, ); let outer_group_list = Field::new( "outerGroup", - DataType::List(Box::new(DataType::Struct(vec![ - Field::new("leaf2", DataType::Int32, true), - inner_group_list, - ]))), + DataType::List(Box::new(Field::new( + "outerGroup", + DataType::Struct(vec![ + Field::new("leaf2", DataType::Int32, true), + inner_group_list, + ]), + true, + ))), true, ); arrow_fields.push(outer_group_list); @@ -1261,7 +1281,11 @@ mod tests { Field::new("double", DataType::Float64, true), Field::new("float", DataType::Float32, true), Field::new("string", DataType::Utf8, true), - Field::new("bools", DataType::List(Box::new(DataType::Boolean)), true), + Field::new( + "bools", + DataType::List(Box::new(Field::new("bools", DataType::Boolean, true))), + true, + ), Field::new("date", DataType::Date32(DateUnit::Day), true), Field::new("time_milli", DataType::Time32(TimeUnit::Millisecond), true), Field::new("time_micro", DataType::Time64(TimeUnit::Microsecond), true), @@ -1327,7 +1351,11 @@ mod tests { Field::new("double", DataType::Float64, true), Field::new("float", DataType::Float32, true), Field::new("string", DataType::Utf8, true), - Field::new("bools", DataType::List(Box::new(DataType::Boolean)), true), + Field::new( + "bools", + DataType::List(Box::new(Field::new("element", DataType::Boolean, true))), + true, + ), Field::new("date", DataType::Date32(DateUnit::Day), true), Field::new("time_milli", DataType::Time32(TimeUnit::Millisecond), true), Field::new("time_micro", DataType::Time64(TimeUnit::Microsecond), true), @@ -1346,7 +1374,15 @@ mod tests { DataType::Struct(vec![ Field::new("bools", DataType::Boolean, false), Field::new("uint32", DataType::UInt32, false), - Field::new("int32", DataType::List(Box::new(DataType::Int32)), true), + Field::new( + "int32", + DataType::List(Box::new(Field::new( + "element", + DataType::Int32, + true, + ))), + true, + ), ]), false, ), @@ -1452,7 +1488,11 @@ mod tests { ), Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false), Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), - Field::new("c21", DataType::List(Box::new(DataType::Boolean)), false), + Field::new( + "c21", + DataType::List(Box::new(Field::new("list", DataType::Boolean, true))), + false, + ), // Field::new( // "c22", // DataType::FixedSizeList(Box::new(DataType::Boolean), 5), @@ -1543,20 +1583,37 @@ mod tests { let schema = Schema::new_with_metadata( vec![ - Field::new("c21", DataType::List(Box::new(DataType::Boolean)), false), + Field::new( + "c21", + DataType::List(Box::new(Field::new( + "array", + DataType::Boolean, + true, + ))), + false, + ), Field::new( "c22", - DataType::FixedSizeList(Box::new(DataType::Boolean), 5), + DataType::FixedSizeList( + Box::new(Field::new("items", DataType::Boolean, false)), + 5, + ), false, ), Field::new( "c23", - DataType::List(Box::new(DataType::LargeList(Box::new( - DataType::Struct(vec![ - Field::new("a", DataType::Int16, true), - Field::new("b", DataType::Float64, false), - ]), - )))), + DataType::List(Box::new(Field::new( + "items", + DataType::LargeList(Box::new(Field::new( + "items", + DataType::Struct(vec![ + Field::new("a", DataType::Int16, true), + Field::new("b", DataType::Float64, false), + ]), + true, + ))), + true, + ))), true, ), ], From 8af39e46ef695ac16edeccf6a0a53c3d55d986b5 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sat, 7 Nov 2020 10:12:13 +0200 Subject: [PATCH 3/6] Add Datafusion changes to List> --- .../src/physical_plan/distinct_expressions.rs | 6 +++++- rust/datafusion/src/physical_plan/functions.rs | 6 +++--- rust/datafusion/src/scalar.rs | 11 +++++++---- rust/datafusion/tests/sql.rs | 8 ++++++-- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/rust/datafusion/src/physical_plan/distinct_expressions.rs b/rust/datafusion/src/physical_plan/distinct_expressions.rs index c2183ca3b66..5c9b371f6a0 100644 --- a/rust/datafusion/src/physical_plan/distinct_expressions.rs +++ b/rust/datafusion/src/physical_plan/distinct_expressions.rs @@ -80,7 +80,11 @@ impl AggregateExpr for DistinctCount { .map(|data_type| { Field::new( &format_state_name(&self.name, "count distinct"), - DataType::List(Box::new(data_type.clone())), + DataType::List(Box::new(Field::new( + "item", + data_type.clone(), + false, + ))), false, ) }) diff --git a/rust/datafusion/src/physical_plan/functions.rs b/rust/datafusion/src/physical_plan/functions.rs index afc4690ef34..d0a121139d8 100644 --- a/rust/datafusion/src/physical_plan/functions.rs +++ b/rust/datafusion/src/physical_plan/functions.rs @@ -42,7 +42,7 @@ use arrow::{ array::ArrayRef, compute::kernels::length::length, datatypes::TimeUnit, - datatypes::{DataType, Schema}, + datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, }; use fmt::{Debug, Formatter}; @@ -203,7 +203,7 @@ pub fn return_type( Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) } BuiltinScalarFunction::Array => Ok(DataType::FixedSizeList( - Box::new(arg_types[0].clone()), + Box::new(Field::new("item", arg_types[0].clone(), true)), arg_types.len() as i32, )), _ => Ok(DataType::Float64), @@ -471,7 +471,7 @@ mod tests { assert_eq!( expr.data_type(&schema)?, // type equals to a common coercion - DataType::FixedSizeList(Box::new(expected_type), 2) + DataType::FixedSizeList(Box::new(Field::new("item", expected_type, true)), 2) ); // evaluate works diff --git a/rust/datafusion/src/scalar.rs b/rust/datafusion/src/scalar.rs index 83701e51c8c..2eb1d69a617 100644 --- a/rust/datafusion/src/scalar.rs +++ b/rust/datafusion/src/scalar.rs @@ -28,7 +28,10 @@ use arrow::array::{ Int16Builder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, UInt16Builder, UInt32Builder, UInt64Builder, UInt8Builder, }; -use arrow::{array::ArrayRef, datatypes::DataType}; +use arrow::{ + array::ArrayRef, + datatypes::{DataType, Field}, +}; use crate::error::{DataFusionError, Result}; @@ -124,7 +127,7 @@ impl ScalarValue { ScalarValue::Utf8(_) => DataType::Utf8, ScalarValue::LargeUtf8(_) => DataType::LargeUtf8, ScalarValue::List(_, data_type) => { - DataType::List(Box::new(data_type.clone())) + DataType::List(Box::new(Field::new("item", data_type.clone(), true))) } } } @@ -212,7 +215,7 @@ impl ScalarValue { Some(scalar_vec) } }; - ScalarValue::List(value, *nested_type.clone()) + ScalarValue::List(value, nested_type.data_type().clone()) } other => { return Err(DataFusionError::NotImplemented(format!( @@ -309,7 +312,7 @@ impl TryFrom<&DataType> for ScalarValue { &DataType::Utf8 => ScalarValue::Utf8(None), &DataType::LargeUtf8 => ScalarValue::LargeUtf8(None), &DataType::List(ref nested_type) => { - ScalarValue::List(None, *nested_type.clone()) + ScalarValue::List(None, nested_type.data_type().clone()) } _ => { return Err(DataFusionError::NotImplemented(format!( diff --git a/rust/datafusion/tests/sql.rs b/rust/datafusion/tests/sql.rs index a35476d7f25..8dd7afa1d7e 100644 --- a/rust/datafusion/tests/sql.rs +++ b/rust/datafusion/tests/sql.rs @@ -142,10 +142,14 @@ async fn parquet_list_columns() { let schema = Arc::new(Schema::new(vec![ Field::new( "int64_list", - DataType::List(Box::new(DataType::Int64)), + DataType::List(Box::new(Field::new("item", DataType::Int64, true))), + true, + ), + Field::new( + "utf8_list", + DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), true, ), - Field::new("utf8_list", DataType::List(Box::new(DataType::Utf8)), true), ])); let sql = "SELECT int64_list, utf8_list FROM list_columns"; From 8ae32fd1b7d327c625d601ca1b3e877eb459aa21 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sat, 7 Nov 2020 12:39:52 +0200 Subject: [PATCH 4/6] fix failing tests --- rust/arrow/src/array/builder.rs | 10 ++----- rust/arrow/src/datatypes.rs | 29 ------------------- rust/arrow/src/record_batch.rs | 2 +- rust/arrow/src/util/integration_util.rs | 2 +- rust/arrow/test/data/integration.json | 10 +++---- .../src/physical_plan/distinct_expressions.rs | 6 +--- 6 files changed, 11 insertions(+), 48 deletions(-) diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index 12a27d11436..09fb9e73744 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -848,7 +848,7 @@ where let data = ArrayData::builder(DataType::List(Box::new(Field::new( "item", values_data.data_type().clone(), - nulls > 0, + true, // TODO: find a consistent way of getting this )))) .len(len) .null_count(len - nulls) @@ -1061,7 +1061,7 @@ where let data = ArrayData::builder(DataType::LargeList(Box::new(Field::new( "item", values_data.data_type().clone(), - nulls > 0, + true, )))) .len(len) .null_count(len - nulls) @@ -1250,11 +1250,7 @@ where let null_bit_buffer = self.bitmap_builder.finish(); let nulls = bit_util::count_set_bits(null_bit_buffer.data()); let data = ArrayData::builder(DataType::FixedSizeList( - Box::new(Field::new( - "item", - values_data.data_type().clone(), - nulls > 0, - )), + Box::new(Field::new("item", values_data.data_type().clone(), true)), self.list_len, )) .len(len) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 71902a6af55..b0ff2cbcd42 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -872,35 +872,6 @@ impl ToByteSlice for T { } impl DataType { - /// Compare data types considering that list fields only match on type - // TODO: remove this custom comparator when we have clarity on how list field - // names should be handled between Arrow and Parquet - pub fn compare(&self, other: &Self) -> bool { - match (self, other) { - (DataType::List(f1), DataType::List(f2)) - | (DataType::LargeList(f1), DataType::LargeList(f2)) => { - f1.data_type().compare(f2.data_type()) - } - (DataType::FixedSizeList(f1, l1), DataType::FixedSizeList(f2, l2)) => { - f1.data_type().compare(f2.data_type()) && l1 == l2 - } - (DataType::Struct(fields1), DataType::Struct(fields2)) => { - if fields1.len() != fields2.len() { - return false; - } - fields1.iter().zip(fields2).all(|(a, b)| { - // fields must be the same, but if any field is a list child, - // we should only compare datatypes - a.data_type().compare(b.data_type()) - && a.name() == b.name() - && a.is_nullable() == b.is_nullable() - && a.dict_id == b.dict_id - && a.dict_is_ordered == b.dict_is_ordered - }) - } - _ => self == other, - } - } /// Parse a data type from a JSON representation fn from(json: &Value) -> Result { let default_field = Field::new("", DataType::Boolean, true); diff --git a/rust/arrow/src/record_batch.rs b/rust/arrow/src/record_batch.rs index 86885b8f8ae..b4aa97dd2a2 100644 --- a/rust/arrow/src/record_batch.rs +++ b/rust/arrow/src/record_batch.rs @@ -100,7 +100,7 @@ impl RecordBatch { )); } // list types can have different names, but we only need the data types to be the same - if !column.data_type().compare(schema.field(i).data_type()) { + if column.data_type() != schema.field(i).data_type() { return Err(ArrowError::InvalidArgumentError(format!( "column types must match schema types, expected {:?} but found {:?} at column index {}", schema.field(i).data_type(), diff --git a/rust/arrow/src/util/integration_util.rs b/rust/arrow/src/util/integration_util.rs index 5b9e1bac3fa..22d271618b6 100644 --- a/rust/arrow/src/util/integration_util.rs +++ b/rust/arrow/src/util/integration_util.rs @@ -671,7 +671,7 @@ mod tests { Field::new("utf8s", DataType::Utf8, true), Field::new( "lists", - DataType::List(Box::new(Field::new("itemx", DataType::Int32, false))), + DataType::List(Box::new(Field::new("item", DataType::Int32, true))), true, ), Field::new( diff --git a/rust/arrow/test/data/integration.json b/rust/arrow/test/data/integration.json index df9dbb7e77c..193636ff136 100644 --- a/rust/arrow/test/data/integration.json +++ b/rust/arrow/test/data/integration.json @@ -251,19 +251,19 @@ }, { "name": "lists", + "nullable": true, "type": { "name": "list" }, - "nullable": true, "children": [ { - "name": "itemx", + "name": "item", + "nullable": true, "type": { "name": "int", - "isSigned": true, - "bitWidth": 32 + "bitWidth": 32, + "isSigned": true }, - "nullable": false, "children": [] } ] diff --git a/rust/datafusion/src/physical_plan/distinct_expressions.rs b/rust/datafusion/src/physical_plan/distinct_expressions.rs index 5c9b371f6a0..ed90cb36bc4 100644 --- a/rust/datafusion/src/physical_plan/distinct_expressions.rs +++ b/rust/datafusion/src/physical_plan/distinct_expressions.rs @@ -80,11 +80,7 @@ impl AggregateExpr for DistinctCount { .map(|data_type| { Field::new( &format_state_name(&self.name, "count distinct"), - DataType::List(Box::new(Field::new( - "item", - data_type.clone(), - false, - ))), + DataType::List(Box::new(Field::new("item", data_type.clone(), true))), false, ) }) From 790da1a87a8681799b714aff38b4dd704fc8eecd Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sat, 7 Nov 2020 13:34:19 +0200 Subject: [PATCH 5/6] address queries, remove commented out code --- rust/arrow/src/datatypes.rs | 2 +- rust/arrow/src/ipc/convert.rs | 39 ----------------------------------- 2 files changed, 1 insertion(+), 40 deletions(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index b0ff2cbcd42..099355e7596 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1183,7 +1183,7 @@ impl Field { self.nullable } - /// IReturns the dictionary ID + /// Returns the dictionary ID #[inline] pub const fn dict_id(&self) -> i64 { self.dict_id diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index dffd9a711bd..feb4133d383 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -530,19 +530,6 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>( } List(ref list_type) => { let child = build_field(fbb, list_type); - // let inner_types = get_fb_field_type(list_type, fbb); - // let child = ipc::Field::create( - // fbb, - // &ipc::FieldArgs { - // name: None, - // nullable: false, - // type_type: inner_types.type_type, - // type_: Some(inner_types.type_), - // children: inner_types.children, - // dictionary: None, - // custom_metadata: None, - // }, - // ); FBFieldType { type_type: ipc::Type::List, type_: ipc::ListBuilder::new(fbb).finish().as_union_value(), @@ -551,19 +538,6 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>( } LargeList(ref list_type) => { let child = build_field(fbb, list_type); - // let inner_types = get_fb_field_type(list_type, fbb); - // let child = ipc::Field::create( - // fbb, - // &ipc::FieldArgs { - // name: None, - // nullable: false, - // type_type: inner_types.type_type, - // type_: Some(inner_types.type_), - // dictionary: None, - // children: inner_types.children, - // custom_metadata: None, - // }, - // ); FBFieldType { type_type: ipc::Type::LargeList, type_: ipc::LargeListBuilder::new(fbb).finish().as_union_value(), @@ -572,19 +546,6 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>( } FixedSizeList(ref list_type, len) => { let child = build_field(fbb, list_type); - // let inner_types = get_fb_field_type(list_type, fbb); - // let child = ipc::Field::create( - // fbb, - // &ipc::FieldArgs { - // name: None, - // nullable: false, - // type_type: inner_types.type_type, - // type_: Some(inner_types.type_), - // dictionary: None, - // children: inner_types.children, - // custom_metadata: None, - // }, - // ); let mut builder = ipc::FixedSizeListBuilder::new(fbb); builder.add_listSize(*len as i32); FBFieldType { From dd8178c5c41aa1cd5987d2a3c91c95d04cb1500d Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sat, 7 Nov 2020 13:45:26 +0200 Subject: [PATCH 6/6] rebase --- rust/arrow/src/compute/kernels/take.rs | 23 ++++++++++++++++++----- rust/arrow/src/compute/util.rs | 4 ++-- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs index 35ca7b3c9d2..83139c3f031 100644 --- a/rust/arrow/src/compute/kernels/take.rs +++ b/rust/arrow/src/compute/kernels/take.rs @@ -778,7 +778,11 @@ mod tests { let value_offsets: [$offset_type; 4] = [0, 3, 6, 8]; let value_offsets = Buffer::from(&value_offsets.to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::$list_data_type(Box::new(Field::new( + "item", + DataType::Int32, + false, + ))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets) @@ -847,7 +851,11 @@ mod tests { let value_offsets: [$offset_type; 5] = [0, 3, 6, 7, 9]; let value_offsets = Buffer::from(&value_offsets.to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::$list_data_type(Box::new(Field::new( + "item", + DataType::Int32, + false, + ))); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets) @@ -916,7 +924,11 @@ mod tests { let value_offsets: [$offset_type; 5] = [0, 3, 6, 6, 8]; let value_offsets = Buffer::from(&value_offsets.to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::$list_data_type(Box::new(Field::new( + "item", + DataType::Int32, + false, + ))); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets) @@ -1006,7 +1018,8 @@ mod tests { // Construct offsets let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets) @@ -1160,4 +1173,4 @@ mod tests { assert_eq!(expected_keys.data_type(), result_keys.data_type()); assert_eq!(expected_keys, result_keys); } -} \ No newline at end of file +} diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs index b3836bf4dd4..3b578011640 100644 --- a/rust/arrow/src/compute/util.rs +++ b/rust/arrow/src/compute/util.rs @@ -337,7 +337,7 @@ mod tests { #[test] fn test_take_value_index_from_large_list() { let list = build_list( - DataType::LargeList(Box::new(DataType::Int32)), + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))), Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), vec![0i64, 2i64, 5i64, 10i64], ); @@ -397,4 +397,4 @@ mod tests { } } } -} \ No newline at end of file +}