From 5d1d0184633ccc2484611b1d8fdc635099675c8d Mon Sep 17 00:00:00 2001 From: Christoph Schulze Date: Thu, 19 Nov 2020 17:48:11 +0100 Subject: [PATCH 01/15] ARROW-10656 Use DataType comparison without values --- rust/arrow/src/datatypes.rs | 70 ++++++++++++++++++++++++++++++++++ rust/arrow/src/record_batch.rs | 50 +++++++++++++++++++++++- 2 files changed, 119 insertions(+), 1 deletion(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 630b14dfc1e..7d7bdaa3256 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1142,6 +1142,44 @@ impl DataType { | Float64 ) } + + /// Compares this data type with another data type only based on the data type + /// including nested data types, but not based on other values. + pub fn cmp_type(&self, other: &Self) -> bool { + match (self, other) { + (DataType::List(f1), DataType::List(f2)) => { + f1.data_type().cmp_type(f2.data_type()) + } + (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _)) => { + f1.data_type().cmp_type(f2.data_type()) + } + (DataType::LargeList(f1), DataType::LargeList(f2)) => { + f1.data_type().cmp_type(f2.data_type()) + } + (DataType::Struct(f1), DataType::Struct(f2)) => { + if f1.len() == f2.len() { + f1.iter() + .enumerate() + .all(|(i, f)| f.data_type().cmp_type(f2[i].data_type())) + } else { + false + } + } + (DataType::Union(f1), DataType::Union(f2)) => { + if f1.len() == f2.len() { + f1.iter() + .enumerate() + .all(|(i, f)| f.data_type().cmp_type(f2[i].data_type())) + } else { + false + } + } + (DataType::Dictionary(k1, v1), DataType::Dictionary(k2, v2)) => { + k1.as_ref().cmp_type(k2) && v1.cmp_type(v2.as_ref()) + } + t @ (_, _) => std::mem::discriminant(t.0) == std::mem::discriminant(t.1), + } + } } impl Field { @@ -2740,6 +2778,38 @@ mod tests { Ok(()) } + + #[test] + fn test_compare_nested_types() { + let list_type_a = &DataType::List(Box::new(Field::new( + "a", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + true, + ))); + let list_type_b = &DataType::List(Box::new(Field::new( + "b", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + ))); + + assert!(list_type_a.cmp_type(list_type_b)); + } + + #[test] + fn test_compare_mismatching_types() { + let list_type_a = &DataType::LargeList(Box::new(Field::new( + "a", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + true, + ))); + let list_type_b = &DataType::LargeList(Box::new(Field::new( + "b", + DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), + false, + ))); + + assert!(!list_type_a.cmp_type(list_type_b)); + } } #[cfg(all( diff --git a/rust/arrow/src/record_batch.rs b/rust/arrow/src/record_batch.rs index b4aa97dd2a2..67588dc00d8 100644 --- a/rust/arrow/src/record_batch.rs +++ b/rust/arrow/src/record_batch.rs @@ -100,7 +100,7 @@ impl RecordBatch { )); } // list types can have different names, but we only need the data types to be the same - if column.data_type() != schema.field(i).data_type() { + if column.data_type().cmp_type(schema.field(i).data_type()) { return Err(ArrowError::InvalidArgumentError(format!( "column types must match schema types, expected {:?} but found {:?} at column index {}", schema.field(i).data_type(), @@ -298,6 +298,54 @@ mod tests { assert!(!batch.is_ok()); } + #[test] + fn create_record_batch_with_matching_nested_type() { + let schema = Schema::new(vec![Field::new( + "list", + DataType::List(Box::new(Field::new_dict( + "nested_dict_A", + DataType::Int32, + true, + 0, + false, + ))), + false, + )]); + + let child_data = Int32Array::from(vec![0, 1, 2, 3, 4, 5]); + let child_data_ref = Arc::new(ArrayData::new( + DataType::Int32, + 6, + None, + None, + 0, + vec![child_data.data_ref().buffers()[0].clone()], + vec![], + )); + + let offsets = UInt64Array::from(vec![0, 2, 4]); + let array_data = Arc::new(ArrayData::new( + DataType::List(Box::new(Field::new_dict( + "nested_dict_B", + DataType::Int32, + false, + 0, + false, + ))), + 3, + None, + None, + 0, + vec![offsets.data_ref().buffers()[0].clone()], + vec![child_data_ref.clone()], + )); + + let list_array = Arc::new(ListArray::from(array_data)); + + let result = RecordBatch::try_new(Arc::new(schema), vec![list_array]); + assert!(result.is_ok()); + } + #[test] fn create_record_batch_from_struct_array() { let boolean_data = ArrayData::builder(DataType::Boolean) From 6cffff2b4726a7f97fd8c65d706ed3ba1f136e0f Mon Sep 17 00:00:00 2001 From: Christoph Schulze Date: Thu, 19 Nov 2020 17:54:02 +0100 Subject: [PATCH 02/15] ARROW-10656 invert if condition --- rust/arrow/src/record_batch.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/arrow/src/record_batch.rs b/rust/arrow/src/record_batch.rs index 67588dc00d8..6c54e1d2a35 100644 --- a/rust/arrow/src/record_batch.rs +++ b/rust/arrow/src/record_batch.rs @@ -100,7 +100,7 @@ impl RecordBatch { )); } // list types can have different names, but we only need the data types to be the same - if column.data_type().cmp_type(schema.field(i).data_type()) { + if !column.data_type().cmp_type(schema.field(i).data_type()) { return Err(ArrowError::InvalidArgumentError(format!( "column types must match schema types, expected {:?} but found {:?} at column index {}", schema.field(i).data_type(), From 4324316c47241a5e555ee5dcc9d8849307ac3503 Mon Sep 17 00:00:00 2001 From: Christoph Schulze Date: Thu, 19 Nov 2020 18:14:24 +0100 Subject: [PATCH 03/15] ARROW-10656 Address clippy error --- rust/arrow/src/record_batch.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/arrow/src/record_batch.rs b/rust/arrow/src/record_batch.rs index 6c54e1d2a35..e35ab0596d9 100644 --- a/rust/arrow/src/record_batch.rs +++ b/rust/arrow/src/record_batch.rs @@ -337,7 +337,7 @@ mod tests { None, 0, vec![offsets.data_ref().buffers()[0].clone()], - vec![child_data_ref.clone()], + vec![child_data_ref], )); let list_array = Arc::new(ListArray::from(array_data)); From e680fc0466762232eebe0ce1f8f76a0973f5c252 Mon Sep 17 00:00:00 2001 From: Christoph Schulze Date: Fri, 20 Nov 2020 17:09:06 +0100 Subject: [PATCH 04/15] ARROW-10656 Remove if else --- rust/arrow/src/datatypes.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 7d7bdaa3256..c486d26cdad 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1157,22 +1157,18 @@ impl DataType { f1.data_type().cmp_type(f2.data_type()) } (DataType::Struct(f1), DataType::Struct(f2)) => { - if f1.len() == f2.len() { - f1.iter() + f1.len() == f2.len() + && f1 + .iter() .enumerate() .all(|(i, f)| f.data_type().cmp_type(f2[i].data_type())) - } else { - false - } } (DataType::Union(f1), DataType::Union(f2)) => { - if f1.len() == f2.len() { - f1.iter() + f1.len() == f2.len() + && f1 + .iter() .enumerate() .all(|(i, f)| f.data_type().cmp_type(f2[i].data_type())) - } else { - false - } } (DataType::Dictionary(k1, v1), DataType::Dictionary(k2, v2)) => { k1.as_ref().cmp_type(k2) && v1.cmp_type(v2.as_ref()) From 5febab80db02a1016a2361fa71941130e2da30d7 Mon Sep 17 00:00:00 2001 From: Christoph Schulze Date: Fri, 20 Nov 2020 17:33:23 +0100 Subject: [PATCH 05/15] ARROW-10656 Rename cmp_type to eq_type --- rust/arrow/src/datatypes.rs | 2 +- rust/arrow/src/record_batch.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index c486d26cdad..9983893a378 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1145,7 +1145,7 @@ impl DataType { /// Compares this data type with another data type only based on the data type /// including nested data types, but not based on other values. - pub fn cmp_type(&self, other: &Self) -> bool { + pub fn eq_type(&self, other: &Self) -> bool { match (self, other) { (DataType::List(f1), DataType::List(f2)) => { f1.data_type().cmp_type(f2.data_type()) diff --git a/rust/arrow/src/record_batch.rs b/rust/arrow/src/record_batch.rs index e35ab0596d9..40afd1652ed 100644 --- a/rust/arrow/src/record_batch.rs +++ b/rust/arrow/src/record_batch.rs @@ -100,7 +100,7 @@ impl RecordBatch { )); } // list types can have different names, but we only need the data types to be the same - if !column.data_type().cmp_type(schema.field(i).data_type()) { + if !column.data_type().eq_type(schema.field(i).data_type()) { return Err(ArrowError::InvalidArgumentError(format!( "column types must match schema types, expected {:?} but found {:?} at column index {}", schema.field(i).data_type(), From d527e162d9ced3ccc4751baca5315dfec4076fb3 Mon Sep 17 00:00:00 2001 From: Christoph Schulze Date: Sat, 21 Nov 2020 10:44:05 +0100 Subject: [PATCH 06/15] ARROW-10656 Rename cmp_type to eq_type --- rust/arrow/src/datatypes.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 9983893a378..67a58a2003d 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1148,30 +1148,30 @@ impl DataType { pub fn eq_type(&self, other: &Self) -> bool { match (self, other) { (DataType::List(f1), DataType::List(f2)) => { - f1.data_type().cmp_type(f2.data_type()) + f1.data_type().eq_type(f2.data_type()) } (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _)) => { - f1.data_type().cmp_type(f2.data_type()) + f1.data_type().eq_type(f2.data_type()) } (DataType::LargeList(f1), DataType::LargeList(f2)) => { - f1.data_type().cmp_type(f2.data_type()) + f1.data_type().eq_type(f2.data_type()) } (DataType::Struct(f1), DataType::Struct(f2)) => { f1.len() == f2.len() && f1 .iter() .enumerate() - .all(|(i, f)| f.data_type().cmp_type(f2[i].data_type())) + .all(|(i, f)| f.data_type().eq_type(f2[i].data_type())) } (DataType::Union(f1), DataType::Union(f2)) => { f1.len() == f2.len() && f1 .iter() .enumerate() - .all(|(i, f)| f.data_type().cmp_type(f2[i].data_type())) + .all(|(i, f)| f.data_type().eq_type(f2[i].data_type())) } (DataType::Dictionary(k1, v1), DataType::Dictionary(k2, v2)) => { - k1.as_ref().cmp_type(k2) && v1.cmp_type(v2.as_ref()) + k1.as_ref().eq_type(k2) && v1.eq_type(v2.as_ref()) } t @ (_, _) => std::mem::discriminant(t.0) == std::mem::discriminant(t.1), } @@ -2788,7 +2788,7 @@ mod tests { false, ))); - assert!(list_type_a.cmp_type(list_type_b)); + assert!(list_type_a.eq_type(list_type_b)); } #[test] @@ -2804,7 +2804,7 @@ mod tests { false, ))); - assert!(!list_type_a.cmp_type(list_type_b)); + assert!(!list_type_a.eq_type(list_type_b)); } } From ccec7df2ad47a3079fda93134c51f946d6be9803 Mon Sep 17 00:00:00 2001 From: Christoph Schulze Date: Mon, 23 Nov 2020 21:18:01 +0100 Subject: [PATCH 07/15] ARROW-10656 Introduce data type context --- rust/arrow/examples/builders.rs | 6 +- rust/arrow/src/array/array_binary.rs | 4 +- rust/arrow/src/array/array_list.rs | 31 ++-- rust/arrow/src/array/builder.rs | 44 +++-- rust/arrow/src/compute/kernels/cast.rs | 33 ++-- rust/arrow/src/compute/kernels/comparison.rs | 6 +- rust/arrow/src/compute/kernels/filter.rs | 2 +- rust/arrow/src/compute/kernels/limit.rs | 4 +- rust/arrow/src/compute/kernels/take.rs | 26 ++- rust/arrow/src/compute/util.rs | 4 +- rust/arrow/src/datatypes.rs | 157 ++++++++++-------- rust/arrow/src/ipc/convert.rs | 93 +++++++++-- rust/arrow/src/ipc/reader.rs | 4 +- rust/arrow/src/json/reader.rs | 87 ++++------ rust/arrow/src/record_batch.rs | 18 +- rust/arrow/src/util/integration_util.rs | 10 +- .../src/physical_plan/distinct_expressions.rs | 7 +- .../datafusion/src/physical_plan/functions.rs | 10 +- rust/datafusion/src/scalar.rs | 8 +- rust/datafusion/tests/sql.rs | 6 +- .../src/bin/arrow-json-integration-test.rs | 25 ++- rust/parquet/src/arrow/array_reader.rs | 15 +- rust/parquet/src/arrow/arrow_writer.rs | 30 ++-- rust/parquet/src/arrow/schema.rs | 98 +++++------ 24 files changed, 384 insertions(+), 344 deletions(-) diff --git a/rust/arrow/examples/builders.rs b/rust/arrow/examples/builders.rs index 61cce0ed97a..0ec3d316a67 100644 --- a/rust/arrow/examples/builders.rs +++ b/rust/arrow/examples/builders.rs @@ -25,7 +25,9 @@ use arrow::array::{ StringArray, StructArray, }; use arrow::buffer::Buffer; -use arrow::datatypes::{DataType, Date64Type, Field, Time64NanosecondType, ToByteSlice}; +use arrow::datatypes::{ + DataType, DataTypeContext, Date64Type, Field, Time64NanosecondType, ToByteSlice, +}; fn main() { // Primitive Arrays @@ -100,7 +102,7 @@ fn main() { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/rust/arrow/src/array/array_binary.rs b/rust/arrow/src/array/array_binary.rs index d7a3eb7217a..fd17a06a424 100644 --- a/rust/arrow/src/array/array_binary.rs +++ b/rust/arrow/src/array/array_binary.rs @@ -443,7 +443,7 @@ impl Array for FixedSizeBinaryArray { #[cfg(test)] mod tests { - use crate::datatypes::Field; + use crate::datatypes::DataTypeContext; use super::*; @@ -755,7 +755,7 @@ mod tests { .build(); let array_data = ArrayData::builder(DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Binary, false)), + Box::new(DataTypeContext::new(DataType::Binary, false)), 4, )) .len(3) diff --git a/rust/arrow/src/array/array_list.rs b/rust/arrow/src/array/array_list.rs index 4eb8dc56640..91684425165 100644 --- a/rust/arrow/src/array/array_list.rs +++ b/rust/arrow/src/array/array_list.rs @@ -297,15 +297,12 @@ impl fmt::Debug for FixedSizeListArray { #[cfg(test)] mod tests { use crate::{ - array::ArrayData, - array::Int32Array, - buffer::Buffer, - datatypes::{Field, ToByteSlice}, - memory, - util::bit_util, + array::ArrayData, array::Int32Array, buffer::Buffer, datatypes::ToByteSlice, + memory, util::bit_util, }; use super::*; + use crate::datatypes::DataTypeContext; #[test] fn test_list_array() { @@ -321,7 +318,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -391,7 +388,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); + DataType::LargeList(Box::new(DataTypeContext::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -457,7 +454,7 @@ mod tests { // Construct a list array from the above two let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, false)), + Box::new(DataTypeContext::new(DataType::Int32, false)), 3, ); let list_data = ArrayData::builder(list_data_type.clone()) @@ -526,7 +523,7 @@ mod tests { // Construct a list array from the above two let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, false)), + Box::new(DataTypeContext::new(DataType::Int32, false)), 3, ); let list_data = ArrayData::builder(list_data_type) @@ -560,7 +557,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) @@ -625,7 +622,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); + DataType::LargeList(Box::new(DataTypeContext::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) @@ -688,7 +685,7 @@ mod tests { // Construct a fixed size list array from the above two let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, false)), + Box::new(DataTypeContext::new(DataType::Int32, false)), 2, ); let list_data = ArrayData::builder(list_data_type) @@ -739,7 +736,7 @@ mod tests { .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice())) .build(); let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_child_data(value_data) @@ -754,7 +751,7 @@ mod tests { fn test_list_array_invalid_child_array_len() { let value_offsets = Buffer::from(&[0, 2, 5, 7].to_byte_slice()); let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -773,7 +770,7 @@ mod tests { let value_offsets = Buffer::from(&[2, 2, 5, 7].to_byte_slice()); let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -805,7 +802,7 @@ mod tests { .build(); let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .add_buffer(buf2) .add_child_data(value_data) diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index eb582c3e107..19ce0b23cfe 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -764,8 +764,7 @@ where /// /// This is used for validating array data types in `append_data` fn data_type(&self) -> DataType { - DataType::List(Box::new(Field::new( - "item", + DataType::List(Box::new(DataTypeContext::new( self.values_builder.data_type(), true, ))) @@ -834,8 +833,7 @@ where let null_bit_buffer = self.bitmap_builder.finish(); let nulls = null_bit_buffer.count_set_bits(); self.offsets_builder.append(0).unwrap(); - let data = ArrayData::builder(DataType::List(Box::new(Field::new( - "item", + let data = ArrayData::builder(DataType::List(Box::new(DataTypeContext::new( values_data.data_type().clone(), true, // TODO: find a consistent way of getting this )))) @@ -976,8 +974,7 @@ where /// /// This is used for validating array data types in `append_data` fn data_type(&self) -> DataType { - DataType::LargeList(Box::new(Field::new( - "item", + DataType::LargeList(Box::new(DataTypeContext::new( self.values_builder.data_type(), true, ))) @@ -1046,11 +1043,9 @@ where let null_bit_buffer = self.bitmap_builder.finish(); let nulls = null_bit_buffer.count_set_bits(); self.offsets_builder.append(0).unwrap(); - let data = ArrayData::builder(DataType::LargeList(Box::new(Field::new( - "item", - values_data.data_type().clone(), - true, - )))) + let data = ArrayData::builder(DataType::LargeList(Box::new( + DataTypeContext::new(values_data.data_type().clone(), true), + ))) .len(len) .null_count(len - nulls) .add_buffer(offset_buffer) @@ -1158,7 +1153,7 @@ where /// This is used for validating array data types in `append_data` fn data_type(&self) -> DataType { DataType::FixedSizeList( - Box::new(Field::new("item", self.values_builder.data_type(), true)), + Box::new(DataTypeContext::new(self.values_builder.data_type(), true)), self.list_len, ) } @@ -1237,7 +1232,7 @@ where let null_bit_buffer = self.bitmap_builder.finish(); let nulls = null_bit_buffer.count_set_bits(); let data = ArrayData::builder(DataType::FixedSizeList( - Box::new(Field::new("item", values_data.data_type().clone(), true)), + Box::new(DataTypeContext::new(values_data.data_type().clone(), true)), self.list_len, )) .len(len) @@ -1451,7 +1446,7 @@ fn append_binary_data( )) as ArrayDataRef; Arc::new(ArrayData::new( - DataType::List(Box::new(Field::new("item", DataType::UInt8, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::UInt8, true))), array.len(), None, array.null_buffer().cloned(), @@ -1503,8 +1498,7 @@ fn append_large_binary_data( )) as ArrayDataRef; Arc::new(ArrayData::new( - DataType::LargeList(Box::new(Field::new( - "item", + DataType::LargeList(Box::new(DataTypeContext::new( DataType::UInt8, true, ))), @@ -1606,7 +1600,7 @@ impl ArrayBuilder for FixedSizeBinaryBuilder { )) as ArrayDataRef; let list_data = Arc::new(ArrayData::new( DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt8, true)), + Box::new(DataTypeContext::new(DataType::UInt8, true)), self.builder.list_len, ), array.len(), @@ -3647,13 +3641,13 @@ mod tests { #[test] #[should_panic( - expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false }) is not currently supported" + expected = "Data type List(DataTypeContext { data_type: Int64, nullable: true }) is not currently supported" )] fn test_struct_array_builder_from_schema_unsupported_type() { let mut fields = Vec::new(); fields.push(Field::new("f1", DataType::Int16, false)); let list_type = - DataType::List(Box::new(Field::new("item", DataType::Int64, true))); + DataType::List(Box::new(DataTypeContext::new(DataType::Int64, true))); fields.push(Field::new("f2", list_type, false)); let _ = StructBuilder::from_fields(fields, 5); @@ -3952,7 +3946,7 @@ mod tests { let list_value_offsets = Buffer::from(&[0, 3, 5, 11, 13, 13, 15, 15, 17].to_byte_slice()); let expected_list_data = ArrayData::new( - DataType::List(Box::new(Field::new("item", DataType::Int64, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Int64, true))), 8, None, None, @@ -4038,7 +4032,7 @@ mod tests { &[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23].to_byte_slice(), ); let expected_list_data = ArrayData::new( - DataType::List(Box::new(Field::new("item", DataType::Int64, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Int64, true))), 12, None, None, @@ -4080,7 +4074,7 @@ mod tests { ]); let list_value_offsets = Buffer::from(&[0, 2, 3, 6].to_byte_slice()); let list_data = ArrayData::new( - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Utf8, true))), 3, None, None, @@ -4115,7 +4109,7 @@ mod tests { ]); let list_value_offsets = Buffer::from(&[0, 2, 2, 4, 5, 8, 9, 12].to_byte_slice()); let expected_list_data = ArrayData::new( - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Utf8, true))), 7, None, None, // is this correct? @@ -4204,7 +4198,7 @@ mod tests { ]); let expected_list_data = ArrayData::new( DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt16, true)), + Box::new(DataTypeContext::new(DataType::UInt16, true)), 2, ), 12, @@ -4277,7 +4271,7 @@ mod tests { ]); let expected_list_data = ArrayData::new( DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt8, true)), + Box::new(DataTypeContext::new(DataType::UInt8, true)), 2, ), 12, diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs index ef79302927f..b8fbff756c5 100644 --- a/rust/arrow/src/compute/kernels/cast.rs +++ b/rust/arrow/src/compute/kernels/cast.rs @@ -1237,7 +1237,7 @@ mod tests { let array = Arc::new(a) as ArrayRef; let b = cast( &array, - &DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + &DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), ) .unwrap(); assert_eq!(5, b.len()); @@ -1267,7 +1267,7 @@ mod tests { let array = Arc::new(a) as ArrayRef; let b = cast( &array, - &DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + &DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), ) .unwrap(); assert_eq!(5, b.len()); @@ -1300,7 +1300,7 @@ mod tests { let array = array.slice(2, 4); let b = cast( &array, - &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + &DataType::List(Box::new(DataTypeContext::new(DataType::Float64, true))), ) .unwrap(); assert_eq!(4, b.len()); @@ -1377,7 +1377,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -1387,7 +1387,7 @@ mod tests { let cast_array = cast( &list_array, - &DataType::List(Box::new(Field::new("item", DataType::UInt16, true))), + &DataType::List(Box::new(DataTypeContext::new(DataType::UInt16, true))), ) .unwrap(); // 3 negative values should get lost when casting to unsigned, @@ -1436,7 +1436,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -1446,8 +1446,7 @@ mod tests { cast( &list_array, - &DataType::List(Box::new(Field::new( - "item", + &DataType::List(Box::new(DataTypeContext::new( DataType::Timestamp(TimeUnit::Microsecond, None), true, ))), @@ -2854,7 +2853,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -2876,7 +2875,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); + DataType::LargeList(Box::new(DataTypeContext::new(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -2896,7 +2895,7 @@ mod tests { // Construct a fixed size list array from the above two let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, true)), + Box::new(DataTypeContext::new(DataType::Int32, true)), 2, ); let list_data = ArrayData::builder(list_data_type) @@ -2989,12 +2988,12 @@ mod tests { LargeBinary, Utf8, LargeUtf8, - List(Box::new(Field::new("item", DataType::Int8, true))), - List(Box::new(Field::new("item", DataType::Utf8, true))), - FixedSizeList(Box::new(Field::new("item", DataType::Int8, true)), 10), - FixedSizeList(Box::new(Field::new("item", DataType::Utf8, false)), 10), - LargeList(Box::new(Field::new("item", DataType::Int8, true))), - LargeList(Box::new(Field::new("item", DataType::Utf8, false))), + List(Box::new(DataTypeContext::new(DataType::Int8, true))), + List(Box::new(DataTypeContext::new(DataType::Utf8, true))), + FixedSizeList(Box::new(DataTypeContext::new(DataType::Int8, true)), 10), + FixedSizeList(Box::new(DataTypeContext::new(DataType::Utf8, false)), 10), + LargeList(Box::new(DataTypeContext::new(DataType::Int8, true))), + LargeList(Box::new(DataTypeContext::new(DataType::Utf8, false))), Struct(vec![ Field::new("f1", DataType::Int32, false), Field::new("f2", DataType::Utf8, true), diff --git a/rust/arrow/src/compute/kernels/comparison.rs b/rust/arrow/src/compute/kernels/comparison.rs index 4268eaf568f..b7ff52075e6 100644 --- a/rust/arrow/src/compute/kernels/comparison.rs +++ b/rust/arrow/src/compute/kernels/comparison.rs @@ -735,8 +735,8 @@ fn new_all_set_buffer(len: usize) -> Buffer { #[cfg(test)] mod tests { use super::*; - use crate::datatypes::{Int8Type, ToByteSlice}; - use crate::{array::Int32Array, datatypes::Field}; + use crate::array::Int32Array; + use crate::datatypes::{DataTypeContext, Int8Type, ToByteSlice}; #[test] fn test_primitive_array_eq() { @@ -1005,7 +1005,7 @@ mod tests { .data(); let value_offsets = Buffer::from(&[0i64, 3, 6, 6, 9].to_byte_slice()); let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); + DataType::LargeList(Box::new(DataTypeContext::new(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(4) .add_buffer(value_offsets) diff --git a/rust/arrow/src/compute/kernels/filter.rs b/rust/arrow/src/compute/kernels/filter.rs index eb8d3397cfc..31beb33ee2f 100644 --- a/rust/arrow/src/compute/kernels/filter.rs +++ b/rust/arrow/src/compute/kernels/filter.rs @@ -1085,7 +1085,7 @@ mod tests { let value_offsets = Buffer::from(&[0i64, 3, 6, 8, 8].to_byte_slice()); let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); + DataType::LargeList(Box::new(DataTypeContext::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(4) .add_buffer(value_offsets) diff --git a/rust/arrow/src/compute/kernels/limit.rs b/rust/arrow/src/compute/kernels/limit.rs index 65f66bce8e5..e1511017ee6 100644 --- a/rust/arrow/src/compute/kernels/limit.rs +++ b/rust/arrow/src/compute/kernels/limit.rs @@ -36,7 +36,7 @@ mod tests { use super::*; use crate::array::*; use crate::buffer::Buffer; - use crate::datatypes::{DataType, Field, ToByteSlice}; + use crate::datatypes::{DataType, DataTypeContext, Field, ToByteSlice}; use crate::util::bit_util; use std::sync::Arc; @@ -111,7 +111,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs index 0d999e34128..0c8af67f309 100644 --- a/rust/arrow/src/compute/kernels/take.rs +++ b/rust/arrow/src/compute/kernels/take.rs @@ -766,11 +766,9 @@ mod tests { let value_offsets: [$offset_type; 4] = [0, 3, 6, 8]; let value_offsets = Buffer::from(&value_offsets.to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new(Field::new( - "item", - DataType::Int32, - false, - ))); + let list_data_type = DataType::$list_data_type(Box::new( + DataTypeContext::new(DataType::Int32, false), + )); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets) @@ -839,11 +837,9 @@ mod tests { let value_offsets: [$offset_type; 5] = [0, 3, 6, 7, 9]; let value_offsets = Buffer::from(&value_offsets.to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new(Field::new( - "item", - DataType::Int32, - false, - ))); + let list_data_type = DataType::$list_data_type(Box::new( + DataTypeContext::new(DataType::Int32, false), + )); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets) @@ -912,11 +908,9 @@ mod tests { let value_offsets: [$offset_type; 5] = [0, 3, 6, 6, 8]; let value_offsets = Buffer::from(&value_offsets.to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new(Field::new( - "item", - DataType::Int32, - false, - ))); + let list_data_type = DataType::$list_data_type(Box::new( + DataTypeContext::new(DataType::Int32, false), + )); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets) @@ -1007,7 +1001,7 @@ mod tests { let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice()); // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs index ba7de77f6b0..2112bdeeaa6 100644 --- a/rust/arrow/src/compute/util.rs +++ b/rust/arrow/src/compute/util.rs @@ -321,7 +321,7 @@ mod tests { #[test] fn test_take_value_index_from_list() { let list = build_list( - DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), vec![0i32, 2i32, 5i32, 10i32], ); @@ -337,7 +337,7 @@ mod tests { #[test] fn test_take_value_index_from_large_list() { let list = build_list( - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))), + DataType::LargeList(Box::new(DataTypeContext::new(DataType::Int32, false))), Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), vec![0i64, 2i64, 5i64, 10i64], ); diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 67a58a2003d..405657c4486 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -125,11 +125,11 @@ pub enum DataType { /// A variable-length string in Unicode with UFT-8 encoding and 64-bit offsets. LargeUtf8, /// A list of some logical data type with variable length. - List(Box), + List(Box), /// A list of some logical data type with fixed length. - FixedSizeList(Box, i32), + FixedSizeList(Box, i32), /// A list of some logical data type with variable length and 64-bit offsets. - LargeList(Box), + LargeList(Box), /// A nested datatype that contains a number of sub-fields. Struct(Vec), /// A nested datatype that can represent slots of differing types. @@ -147,6 +147,13 @@ pub enum DataType { Dictionary(Box, Box), } +/// Data type context that holds additional metadata +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct DataTypeContext { + data_type: DataType, + nullable: bool, +} + /// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX /// epoch (1970-01-01) in days or milliseconds. #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] @@ -874,7 +881,7 @@ impl ToByteSlice for T { impl DataType { /// Parse a data type from a JSON representation pub(crate) fn from(json: &Value) -> Result { - let default_field = Field::new("", DataType::Boolean, true); + let default_dt_ctx = DataTypeContext::new(DataType::Boolean, true); match *json { Value::Object(ref map) => match map.get("name") { Some(s) if s == "null" => Ok(DataType::Null), @@ -1008,17 +1015,17 @@ impl DataType { }, Some(s) if s == "list" => { // return a list with any type as its child isn't defined in the map - Ok(DataType::List(Box::new(default_field))) + Ok(DataType::List(Box::new(default_dt_ctx))) } Some(s) if s == "largelist" => { // return a largelist with any type as its child isn't defined in the map - Ok(DataType::LargeList(Box::new(default_field))) + Ok(DataType::LargeList(Box::new(default_dt_ctx))) } Some(s) if s == "fixedsizelist" => { // return a list with any type as its child isn't defined in the map if let Some(Value::Number(size)) = map.get("listSize") { Ok(DataType::FixedSizeList( - Box::new(default_field), + Box::new(default_dt_ctx), size.as_i64().unwrap() as i32, )) } else { @@ -1142,40 +1149,28 @@ impl DataType { | Float64 ) } +} - /// Compares this data type with another data type only based on the data type - /// including nested data types, but not based on other values. - pub fn eq_type(&self, other: &Self) -> bool { - match (self, other) { - (DataType::List(f1), DataType::List(f2)) => { - f1.data_type().eq_type(f2.data_type()) - } - (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _)) => { - f1.data_type().eq_type(f2.data_type()) - } - (DataType::LargeList(f1), DataType::LargeList(f2)) => { - f1.data_type().eq_type(f2.data_type()) - } - (DataType::Struct(f1), DataType::Struct(f2)) => { - f1.len() == f2.len() - && f1 - .iter() - .enumerate() - .all(|(i, f)| f.data_type().eq_type(f2[i].data_type())) - } - (DataType::Union(f1), DataType::Union(f2)) => { - f1.len() == f2.len() - && f1 - .iter() - .enumerate() - .all(|(i, f)| f.data_type().eq_type(f2[i].data_type())) - } - (DataType::Dictionary(k1, v1), DataType::Dictionary(k2, v2)) => { - k1.as_ref().eq_type(k2) && v1.eq_type(v2.as_ref()) - } - t @ (_, _) => std::mem::discriminant(t.0) == std::mem::discriminant(t.1), +impl DataTypeContext { + /// Creates a new data type context + pub fn new(data_type: DataType, nullable: bool) -> Self { + DataTypeContext { + data_type, + nullable, } } + + /// Returns an immutable reference to the data type + #[inline] + pub const fn data_type(&self) -> &DataType { + &self.data_type + } + + /// Indicates whether in this data type context null values are eligible + #[inline] + pub const fn is_nullable(&self) -> bool { + self.nullable + } } impl Field { @@ -1183,6 +1178,7 @@ impl Field { pub fn new(name: &str, data_type: DataType, nullable: bool) -> Self { Field { name: name.to_string(), + //todo: combine data type and nullability in type context data_type, nullable, dict_id: 0, @@ -1276,16 +1272,21 @@ impl Field { "Field 'children' must have one element for a list data type".to_string(), )); } + let nested_field = Self::from(&values[0])?; + let nexted_dt_ctx = DataTypeContext::new( + nested_field.data_type, + nested_field.nullable, + ); match data_type { DataType::List(_) => DataType::List(Box::new( - Self::from(&values[0])?, + nexted_dt_ctx, )), DataType::LargeList(_) => DataType::LargeList(Box::new( - Self::from(&values[0])?, + nexted_dt_ctx, )), DataType::FixedSizeList(_, int) => { DataType::FixedSizeList( - Box::new(Self::from(&values[0])?), + Box::new(nexted_dt_ctx), int, ) } @@ -1377,9 +1378,30 @@ impl Field { pub fn to_json(&self) -> Value { let children: Vec = match self.data_type() { DataType::Struct(fields) => fields.iter().map(|f| f.to_json()).collect(), - DataType::List(field) => vec![field.to_json()], - DataType::LargeList(field) => vec![field.to_json()], - DataType::FixedSizeList(field, _) => vec![field.to_json()], + DataType::List(type_ctx) => { + let item = Field::new( + "item", + type_ctx.data_type().clone(), + type_ctx.is_nullable(), + ); + vec![item.to_json()] + } + DataType::LargeList(type_ctx) => { + let item = Field::new( + "item", + type_ctx.data_type().clone(), + type_ctx.is_nullable(), + ); + vec![item.to_json()] + } + DataType::FixedSizeList(type_ctx, _) => { + let item = Field::new( + "item", + type_ctx.data_type().clone(), + type_ctx.is_nullable(), + ); + vec![item.to_json()] + } _ => vec![], }; match self.data_type() { @@ -2025,23 +2047,24 @@ mod tests { Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), Field::new( "c21", - DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + DataType::List(Box::new(DataTypeContext::new( + DataType::Boolean, + true, + ))), false, ), Field::new( "c22", DataType::FixedSizeList( - Box::new(Field::new("bools", DataType::Boolean, false)), + Box::new(DataTypeContext::new(DataType::Boolean, false)), 5, ), false, ), Field::new( "c23", - DataType::List(Box::new(Field::new( - "inner_list", - DataType::List(Box::new(Field::new( - "struct", + DataType::List(Box::new(DataTypeContext::new( + DataType::List(Box::new(DataTypeContext::new( DataType::Struct(vec![]), true, ))), @@ -2077,10 +2100,8 @@ mod tests { Field::new("c33", DataType::LargeUtf8, true), Field::new( "c34", - DataType::LargeList(Box::new(Field::new( - "inner_large_list", - DataType::LargeList(Box::new(Field::new( - "struct", + DataType::LargeList(Box::new(DataTypeContext::new( + DataType::LargeList(Box::new(DataTypeContext::new( DataType::Struct(vec![]), false, ))), @@ -2308,7 +2329,7 @@ mod tests { }, "children": [ { - "name": "bools", + "name": "item", "nullable": false, "type": { "name": "bool" @@ -2325,14 +2346,14 @@ mod tests { }, "children": [ { - "name": "inner_list", + "name": "item", "nullable": false, "type": { "name": "list" }, "children": [ { - "name": "struct", + "name": "item", "nullable": true, "type": { "name": "struct" @@ -2465,14 +2486,14 @@ mod tests { }, "children": [ { - "name": "inner_large_list", + "name": "item", "nullable": true, "type": { "name": "largelist" }, "children": [ { - "name": "struct", + "name": "item", "nullable": false, "type": { "name": "struct" @@ -2777,34 +2798,30 @@ mod tests { #[test] fn test_compare_nested_types() { - let list_type_a = &DataType::List(Box::new(Field::new( - "a", + let list_type_a = &DataType::List(Box::new(DataTypeContext::new( DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), true, ))); - let list_type_b = &DataType::List(Box::new(Field::new( - "b", + let list_type_b = &DataType::List(Box::new(DataTypeContext::new( DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - false, + true, ))); - assert!(list_type_a.eq_type(list_type_b)); + assert_eq!(list_type_a, list_type_b); } #[test] fn test_compare_mismatching_types() { - let list_type_a = &DataType::LargeList(Box::new(Field::new( - "a", + let list_type_a = &DataType::LargeList(Box::new(DataTypeContext::new( DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), true, ))); - let list_type_b = &DataType::LargeList(Box::new(Field::new( - "b", + let list_type_b = &DataType::LargeList(Box::new(DataTypeContext::new( DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), false, ))); - assert!(!list_type_a.eq_type(list_type_b)); + assert_ne!(list_type_a, list_type_b); } } diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index 127a3631553..d105adee93c 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -17,7 +17,9 @@ //! Utilities for converting between IPC types and native Arrow types -use crate::datatypes::{DataType, DateUnit, Field, IntervalUnit, Schema, TimeUnit}; +use crate::datatypes::{ + DataType, DataTypeContext, DateUnit, Field, IntervalUnit, Schema, TimeUnit, +}; use crate::ipc; use flatbuffers::{ @@ -268,14 +270,23 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT if children.len() != 1 { panic!("expect a list to have one child") } - DataType::List(Box::new(children.get(0).into())) + let child_field = children.get(0); + // returning int16 for now, to test, not sure how to get data type + DataType::List(Box::new(DataTypeContext::new( + get_data_type(child_field, false), + child_field.nullable(), + ))) } ipc::Type::LargeList => { let children = field.children().unwrap(); if children.len() != 1 { panic!("expect a large list to have one child") } - DataType::LargeList(Box::new(children.get(0).into())) + let child_field = children.get(0); + DataType::LargeList(Box::new(DataTypeContext::new( + get_data_type(child_field, false), + child_field.nullable(), + ))) } ipc::Type::FixedSizeList => { let children = field.children().unwrap(); @@ -283,7 +294,14 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT panic!("expect a list to have one child") } let fsl = field.type_as_fixed_size_list().unwrap(); - DataType::FixedSizeList(Box::new(children.get(0).into()), fsl.listSize()) + let child_field = children.get(0); + DataType::FixedSizeList( + Box::new(DataTypeContext::new( + get_data_type(child_field, false), + child_field.nullable(), + )), + fsl.listSize(), + ) } ipc::Type::Struct_ => { let mut fields = vec![]; @@ -529,24 +547,63 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>( children: Some(fbb.create_vector(&empty_fields[..])), } } - List(ref list_type) => { - let child = build_field(fbb, list_type); + List(ref type_ctx) => { + let nested_type = + get_fb_field_type(type_ctx.data_type(), type_ctx.is_nullable(), fbb); + let child = ipc::Field::create( + fbb, + &ipc::FieldArgs { + name: None, + nullable: type_ctx.is_nullable(), + type_type: nested_type.type_type, + type_: Some(nested_type.type_), + children: nested_type.children, + dictionary: None, + custom_metadata: None, + }, + ); FBFieldType { type_type: ipc::Type::List, type_: ipc::ListBuilder::new(fbb).finish().as_union_value(), children: Some(fbb.create_vector(&[child])), } } - LargeList(ref list_type) => { - let child = build_field(fbb, list_type); + LargeList(ref type_ctx) => { + let inner_types = + get_fb_field_type(type_ctx.data_type(), type_ctx.is_nullable(), fbb); + let child = ipc::Field::create( + fbb, + &ipc::FieldArgs { + name: None, + nullable: type_ctx.is_nullable(), + type_type: inner_types.type_type, + type_: Some(inner_types.type_), + dictionary: None, + children: inner_types.children, + custom_metadata: None, + }, + ); FBFieldType { type_type: ipc::Type::LargeList, type_: ipc::LargeListBuilder::new(fbb).finish().as_union_value(), children: Some(fbb.create_vector(&[child])), } } - FixedSizeList(ref list_type, len) => { - let child = build_field(fbb, list_type); + FixedSizeList(ref type_ctx, len) => { + let inner_types = + get_fb_field_type(type_ctx.data_type(), type_ctx.is_nullable(), fbb); + let child = ipc::Field::create( + fbb, + &ipc::FieldArgs { + name: None, + nullable: type_ctx.is_nullable(), + type_type: inner_types.type_type, + type_: Some(inner_types.type_), + dictionary: None, + children: inner_types.children, + custom_metadata: None, + }, + ); let mut builder = ipc::FixedSizeListBuilder::new(fbb); builder.add_listSize(*len as i32); FBFieldType { @@ -629,7 +686,7 @@ pub(crate) fn get_fb_dictionary<'a: 'b, 'b>( #[cfg(test)] mod tests { use super::*; - use crate::datatypes::{DataType, Field, Schema}; + use crate::datatypes::{DataType, DataTypeContext, Field, Schema}; #[test] fn convert_schema_round_trip() { @@ -695,13 +752,15 @@ mod tests { Field::new("binary", DataType::Binary, false), Field::new( "list[u8]", - DataType::List(Box::new(Field::new("item", DataType::UInt8, false))), + DataType::List(Box::new(DataTypeContext::new( + DataType::UInt8, + false, + ))), true, ), Field::new( "list[struct]", - DataType::List(Box::new(Field::new( - "struct", + DataType::List(Box::new(DataTypeContext::new( DataType::Struct(vec![ Field::new("float32", DataType::UInt8, false), Field::new("int32", DataType::Int32, true), @@ -717,8 +776,7 @@ mod tests { Field::new("int64", DataType::Int64, true), Field::new( "list[struct]>]", - DataType::List(Box::new(Field::new( - "struct", + DataType::List(Box::new(DataTypeContext::new( DataType::Struct(vec![ Field::new( "date32", @@ -727,8 +785,7 @@ mod tests { ), Field::new( "list[struct<>]", - DataType::List(Box::new(Field::new( - "struct", + DataType::List(Box::new(DataTypeContext::new( DataType::Struct(vec![]), false, ))), diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs index 76ad6b77cf3..d5a929f066a 100644 --- a/rust/arrow/src/ipc/reader.rs +++ b/rust/arrow/src/ipc/reader.rs @@ -89,7 +89,7 @@ fn create_array( buffer_index += 2; array } - List(ref list_field) | LargeList(ref list_field) => { + List(ref type_ctx) | LargeList(ref type_ctx) => { let list_node = &nodes[node_index]; let list_buffers: Vec = buffers[buffer_index..buffer_index + 2] .iter() @@ -99,7 +99,7 @@ fn create_array( buffer_index += 2; let triple = create_array( nodes, - list_field.data_type(), + type_ctx.data_type(), data, buffers, dictionaries, diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs index a3368cdd65e..ea9429f7d0d 100644 --- a/rust/arrow/src/json/reader.rs +++ b/rust/arrow/src/json/reader.rs @@ -65,20 +65,16 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { 1 => Ok(dt[0].clone()), 2 => { // there can be a case where a list and scalar both exist - if dt.contains(&&DataType::List(Box::new(Field::new( - "item", + if dt.contains(&&DataType::List(Box::new(DataTypeContext::new( DataType::Float64, true, - )))) || dt.contains(&&DataType::List(Box::new(Field::new( - "item", + )))) || dt.contains(&&DataType::List(Box::new(DataTypeContext::new( DataType::Int64, true, - )))) || dt.contains(&&DataType::List(Box::new(Field::new( - "item", + )))) || dt.contains(&&DataType::List(Box::new(DataTypeContext::new( DataType::Boolean, true, - )))) || dt.contains(&&DataType::List(Box::new(Field::new( - "item", + )))) || dt.contains(&&DataType::List(Box::new(DataTypeContext::new( DataType::Utf8, true, )))) { @@ -89,14 +85,12 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { match (dt[0], dt[1]) { (t1, DataType::List(e)) if e.data_type() == &DataType::Float64 => { if t1 == &DataType::Float64 { - Ok(DataType::List(Box::new(Field::new( - "item", + Ok(DataType::List(Box::new(DataTypeContext::new( DataType::Float64, true, )))) } else { - Ok(DataType::List(Box::new(Field::new( - "item", + Ok(DataType::List(Box::new(DataTypeContext::new( coerce_data_type(vec![t1, &DataType::Float64])?, true, )))) @@ -104,14 +98,12 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { } (t1, DataType::List(e)) if e.data_type() == &DataType::Int64 => { if t1 == &DataType::Int64 { - Ok(DataType::List(Box::new(Field::new( - "item", + Ok(DataType::List(Box::new(DataTypeContext::new( DataType::Int64, true, )))) } else { - Ok(DataType::List(Box::new(Field::new( - "item", + Ok(DataType::List(Box::new(DataTypeContext::new( coerce_data_type(vec![t1, &DataType::Int64])?, true, )))) @@ -119,14 +111,12 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { } (t1, DataType::List(e)) if e.data_type() == &DataType::Boolean => { if t1 == &DataType::Boolean { - Ok(DataType::List(Box::new(Field::new( - "item", + Ok(DataType::List(Box::new(DataTypeContext::new( DataType::Boolean, true, )))) } else { - Ok(DataType::List(Box::new(Field::new( - "item", + Ok(DataType::List(Box::new(DataTypeContext::new( coerce_data_type(vec![t1, &DataType::Boolean])?, true, )))) @@ -134,14 +124,12 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { } (t1, DataType::List(e)) if e.data_type() == &DataType::Utf8 => { if t1 == &DataType::Utf8 { - Ok(DataType::List(Box::new(Field::new( - "item", + Ok(DataType::List(Box::new(DataTypeContext::new( DataType::Utf8, true, )))) } else { - Ok(DataType::List(Box::new(Field::new( - "item", + Ok(DataType::List(Box::new(DataTypeContext::new( coerce_data_type(vec![t1, &DataType::Utf8])?, true, )))) @@ -161,8 +149,7 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { _ => { // TODO(nevi_me) It's possible to have [float, int, list(float)], which should // return list(float). Will hash this out later - Ok(DataType::List(Box::new(Field::new( - "item", + Ok(DataType::List(Box::new(DataTypeContext::new( DataType::Utf8, true, )))) @@ -303,13 +290,13 @@ pub fn infer_json_schema( if values.contains_key(k) { let x = values.get_mut(k).unwrap(); x.insert(DataType::List(Box::new( - Field::new("item", dt, true), + DataTypeContext::new(dt, true), ))); } else { // create hashset and add value type let mut hs = HashSet::new(); hs.insert(DataType::List(Box::new( - Field::new("item", dt, true), + DataTypeContext::new(dt, true), ))); values.insert(k.to_string(), hs); } @@ -1389,12 +1376,12 @@ mod tests { assert_eq!(&DataType::Int64, a.1.data_type()); let b = schema.column_with_name("b").unwrap(); assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + &DataType::List(Box::new(DataTypeContext::new(DataType::Float64, true))), b.1.data_type() ); let c = schema.column_with_name("c").unwrap(); assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + &DataType::List(Box::new(DataTypeContext::new(DataType::Boolean, true))), c.1.data_type() ); let d = schema.column_with_name("d").unwrap(); @@ -1447,35 +1434,35 @@ mod tests { use crate::datatypes::DataType::*; assert_eq!( - List(Box::new(Field::new("item", Float64, true))), + List(Box::new(DataTypeContext::new(Float64, true))), coerce_data_type(vec![ &Float64, - &List(Box::new(Field::new("item", Float64, true))) + &List(Box::new(DataTypeContext::new(Float64, true))) ]) .unwrap() ); assert_eq!( - List(Box::new(Field::new("item", Float64, true))), + List(Box::new(DataTypeContext::new(Float64, true))), coerce_data_type(vec![ &Float64, - &List(Box::new(Field::new("item", Int64, true))) + &List(Box::new(DataTypeContext::new(Int64, true))) ]) .unwrap() ); assert_eq!( - List(Box::new(Field::new("item", Int64, true))), + List(Box::new(DataTypeContext::new(Int64, true))), coerce_data_type(vec![ &Int64, - &List(Box::new(Field::new("item", Int64, true))) + &List(Box::new(DataTypeContext::new(Int64, true))) ]) .unwrap() ); // boolean and number are incompatible, return utf8 assert_eq!( - List(Box::new(Field::new("item", Utf8, true))), + List(Box::new(DataTypeContext::new(Utf8, true))), coerce_data_type(vec![ &Boolean, - &List(Box::new(Field::new("item", Float64, true))) + &List(Box::new(DataTypeContext::new(Float64, true))) ]) .unwrap() ); @@ -1508,17 +1495,17 @@ mod tests { assert_eq!(&DataType::Int64, a.1.data_type()); let b = schema.column_with_name("b").unwrap(); assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + &DataType::List(Box::new(DataTypeContext::new(DataType::Float64, true))), b.1.data_type() ); let c = schema.column_with_name("c").unwrap(); assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + &DataType::List(Box::new(DataTypeContext::new(DataType::Boolean, true))), c.1.data_type() ); let d = schema.column_with_name("d").unwrap(); assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + &DataType::List(Box::new(DataTypeContext::new(DataType::Utf8, true))), d.1.data_type() ); @@ -1710,8 +1697,7 @@ mod tests { fn test_list_of_string_dictionary_from_json() { let schema = Schema::new(vec![Field::new( "events", - List(Box::new(Field::new( - "item", + List(Box::new(DataTypeContext::new( Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), true, ))), @@ -1734,8 +1720,7 @@ mod tests { let events = schema.column_with_name("events").unwrap(); assert_eq!( - &List(Box::new(Field::new( - "item", + &List(Box::new(DataTypeContext::new( Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), true ))), @@ -1769,8 +1754,7 @@ mod tests { fn test_list_of_string_dictionary_from_json_with_nulls() { let schema = Schema::new(vec![Field::new( "events", - List(Box::new(Field::new( - "item", + List(Box::new(DataTypeContext::new( Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), true, ))), @@ -1795,8 +1779,7 @@ mod tests { let events = schema.column_with_name("events").unwrap(); assert_eq!( - &List(Box::new(Field::new( - "item", + &List(Box::new(DataTypeContext::new( Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), true ))), @@ -1937,17 +1920,17 @@ mod tests { Field::new("a", DataType::Int64, true), Field::new( "b", - DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Float64, true))), true, ), Field::new( "c", - DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Boolean, true))), true, ), Field::new( "d", - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Utf8, true))), true, ), ]); diff --git a/rust/arrow/src/record_batch.rs b/rust/arrow/src/record_batch.rs index 40afd1652ed..92fe678b181 100644 --- a/rust/arrow/src/record_batch.rs +++ b/rust/arrow/src/record_batch.rs @@ -100,7 +100,7 @@ impl RecordBatch { )); } // list types can have different names, but we only need the data types to be the same - if !column.data_type().eq_type(schema.field(i).data_type()) { + if column.data_type() != schema.field(i).data_type() { return Err(ArrowError::InvalidArgumentError(format!( "column types must match schema types, expected {:?} but found {:?} at column index {}", schema.field(i).data_type(), @@ -302,13 +302,7 @@ mod tests { fn create_record_batch_with_matching_nested_type() { let schema = Schema::new(vec![Field::new( "list", - DataType::List(Box::new(Field::new_dict( - "nested_dict_A", - DataType::Int32, - true, - 0, - false, - ))), + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), false, )]); @@ -325,13 +319,7 @@ mod tests { let offsets = UInt64Array::from(vec![0, 2, 4]); let array_data = Arc::new(ArrayData::new( - DataType::List(Box::new(Field::new_dict( - "nested_dict_B", - DataType::Int32, - false, - 0, - false, - ))), + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), 3, None, None, diff --git a/rust/arrow/src/util/integration_util.rs b/rust/arrow/src/util/integration_util.rs index 94d0a9b75a0..9d64f5b775c 100644 --- a/rust/arrow/src/util/integration_util.rs +++ b/rust/arrow/src/util/integration_util.rs @@ -688,11 +688,7 @@ mod tests { Field::new("c3", DataType::Utf8, true), Field::new( "c4", - DataType::List(Box::new(Field::new( - "custom_item", - DataType::Int32, - false, - ))), + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))), true, ), ]); @@ -762,7 +758,7 @@ mod tests { Field::new("utf8s", DataType::Utf8, true), Field::new( "lists", - DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), true, ), Field::new( @@ -839,7 +835,7 @@ mod tests { let value_data = Int32Array::from(vec![None, Some(2), None, None]); let value_offsets = Buffer::from(&[0, 3, 4, 4].to_byte_slice()); let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/rust/datafusion/src/physical_plan/distinct_expressions.rs b/rust/datafusion/src/physical_plan/distinct_expressions.rs index 09194439777..752df5e199c 100644 --- a/rust/datafusion/src/physical_plan/distinct_expressions.rs +++ b/rust/datafusion/src/physical_plan/distinct_expressions.rs @@ -22,7 +22,7 @@ use std::fmt::Debug; use std::hash::Hash; use std::sync::Arc; -use arrow::datatypes::{DataType, Field}; +use arrow::datatypes::{DataType, DataTypeContext, Field}; use ahash::RandomState; use std::collections::HashSet; @@ -81,7 +81,10 @@ impl AggregateExpr for DistinctCount { .map(|data_type| { Field::new( &format_state_name(&self.name, "count distinct"), - DataType::List(Box::new(Field::new("item", data_type.clone(), true))), + DataType::List(Box::new(DataTypeContext::new( + data_type.clone(), + true, + ))), false, ) }) diff --git a/rust/datafusion/src/physical_plan/functions.rs b/rust/datafusion/src/physical_plan/functions.rs index 12402ec90b0..4cd9aee8356 100644 --- a/rust/datafusion/src/physical_plan/functions.rs +++ b/rust/datafusion/src/physical_plan/functions.rs @@ -38,11 +38,12 @@ use crate::physical_plan::array_expressions; use crate::physical_plan::datetime_expressions; use crate::physical_plan::math_expressions; use crate::physical_plan::string_expressions; +use arrow::datatypes::DataTypeContext; use arrow::{ array::ArrayRef, compute::kernels::length::length, datatypes::TimeUnit, - datatypes::{DataType, Field, Schema}, + datatypes::{DataType, Schema}, record_batch::RecordBatch, }; use fmt::{Debug, Formatter}; @@ -203,7 +204,7 @@ pub fn return_type( Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) } BuiltinScalarFunction::Array => Ok(DataType::FixedSizeList( - Box::new(Field::new("item", arg_types[0].clone(), true)), + Box::new(DataTypeContext::new(arg_types[0].clone(), true)), arg_types.len() as i32, )), _ => Ok(DataType::Float64), @@ -471,7 +472,10 @@ mod tests { assert_eq!( expr.data_type(&schema)?, // type equals to a common coercion - DataType::FixedSizeList(Box::new(Field::new("item", expected_type, true)), 2) + DataType::FixedSizeList( + Box::new(DataTypeContext::new(expected_type, true)), + 2 + ) ); // evaluate works diff --git a/rust/datafusion/src/scalar.rs b/rust/datafusion/src/scalar.rs index 06309ab84c0..943a0e893af 100644 --- a/rust/datafusion/src/scalar.rs +++ b/rust/datafusion/src/scalar.rs @@ -23,10 +23,7 @@ use arrow::array::{ Int16Builder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, UInt16Builder, UInt32Builder, UInt64Builder, UInt8Builder, }; -use arrow::{ - array::ArrayRef, - datatypes::{DataType, Field}, -}; +use arrow::{array::ArrayRef, datatypes::DataType}; use arrow::{ array::{ Array, BooleanArray, Date32Array, Float32Array, Float64Array, Int16Array, @@ -37,6 +34,7 @@ use arrow::{ }; use crate::error::{DataFusionError, Result}; +use arrow::datatypes::DataTypeContext; /// Represents a dynamically typed, nullable single value. /// This is the single-valued counter-part of arrow’s `Array`. @@ -136,7 +134,7 @@ impl ScalarValue { ScalarValue::Utf8(_) => DataType::Utf8, ScalarValue::LargeUtf8(_) => DataType::LargeUtf8, ScalarValue::List(_, data_type) => { - DataType::List(Box::new(Field::new("item", data_type.clone(), true))) + DataType::List(Box::new(DataTypeContext::new(data_type.clone(), true))) } ScalarValue::Date32(_) => DataType::Date32(DateUnit::Day), } diff --git a/rust/datafusion/tests/sql.rs b/rust/datafusion/tests/sql.rs index fc35f4fd975..480548b773e 100644 --- a/rust/datafusion/tests/sql.rs +++ b/rust/datafusion/tests/sql.rs @@ -25,7 +25,7 @@ extern crate datafusion; use arrow::{array::*, datatypes::TimeUnit}; use arrow::{datatypes::Int32Type, datatypes::Int64Type, record_batch::RecordBatch}; use arrow::{ - datatypes::{DataType, Field, Schema, SchemaRef}, + datatypes::{DataType, DataTypeContext, Field, Schema, SchemaRef}, util::display::array_value_to_string, }; @@ -142,12 +142,12 @@ async fn parquet_list_columns() { let schema = Arc::new(Schema::new(vec![ Field::new( "int64_list", - DataType::List(Box::new(Field::new("item", DataType::Int64, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Int64, true))), true, ), Field::new( "utf8_list", - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Utf8, true))), true, ), ])); diff --git a/rust/integration-testing/src/bin/arrow-json-integration-test.rs b/rust/integration-testing/src/bin/arrow-json-integration-test.rs index d4afd13528d..26f888b74a9 100644 --- a/rust/integration-testing/src/bin/arrow-json-integration-test.rs +++ b/rust/integration-testing/src/bin/arrow-json-integration-test.rs @@ -418,9 +418,14 @@ fn array_from_json( } Ok(Arc::new(b.finish())) } - DataType::List(child_field) => { + DataType::List(type_ctx) => { let null_buf = create_null_buf(&json_col); let children = json_col.children.clone().unwrap(); + let child_field = Field::new( + "element", + type_ctx.data_type().clone(), + type_ctx.is_nullable(), + ); let child_array = array_from_json( &child_field, children.get(0).unwrap().clone(), @@ -441,9 +446,14 @@ fn array_from_json( .build(); Ok(Arc::new(ListArray::from(list_data))) } - DataType::LargeList(child_field) => { + DataType::LargeList(type_ctx) => { let null_buf = create_null_buf(&json_col); let children = json_col.children.clone().unwrap(); + let child_field = Field::new( + "element", + type_ctx.data_type().clone(), + type_ctx.is_nullable(), + ); let child_array = array_from_json( &child_field, children.get(0).unwrap().clone(), @@ -468,8 +478,13 @@ fn array_from_json( .build(); Ok(Arc::new(LargeListArray::from(list_data))) } - DataType::FixedSizeList(child_field, _) => { + DataType::FixedSizeList(type_ctx, _) => { let children = json_col.children.clone().unwrap(); + let child_field = Field::new( + "element", + type_ctx.data_type().clone(), + type_ctx.is_nullable(), + ); let child_array = array_from_json( &child_field, children.get(0).unwrap().clone(), @@ -490,8 +505,8 @@ fn array_from_json( .len(json_col.count) .null_bit_buffer(null_buf); - for (field, col) in fields.iter().zip(json_col.children.unwrap()) { - let array = array_from_json(field, col, dictionaries)?; + for (f, col) in fields.iter().zip(json_col.children.unwrap()) { + let array = array_from_json(f, col, dictionaries)?; array_data = array_data.add_child_data(array.data()); } diff --git a/rust/parquet/src/arrow/array_reader.rs b/rust/parquet/src/arrow/array_reader.rs index 00e7c74147f..766af11a003 100644 --- a/rust/parquet/src/arrow/array_reader.rs +++ b/rust/parquet/src/arrow/array_reader.rs @@ -34,7 +34,7 @@ use arrow::array::{ use arrow::buffer::{Buffer, MutableBuffer}; use arrow::datatypes::{ ArrowPrimitiveType, BooleanType as ArrowBooleanType, DataType as ArrowType, - Date32Type as ArrowDate32Type, Date64Type as ArrowDate64Type, + DataTypeContext, Date32Type as ArrowDate32Type, Date64Type as ArrowDate64Type, DurationMicrosecondType as ArrowDurationMicrosecondType, DurationMillisecondType as ArrowDurationMillisecondType, DurationNanosecondType as ArrowDurationNanosecondType, @@ -1348,8 +1348,7 @@ impl<'a> TypeVisitor>, &'a ArrayReaderBuilderContext .ok() .map(|f| f.data_type().to_owned()) .unwrap_or_else(|| { - ArrowType::List(Box::new(Field::new( - list_type.name(), + ArrowType::List(Box::new(DataTypeContext::new( item_reader_type.clone(), list_type.is_optional(), ))) @@ -1628,9 +1627,9 @@ mod tests { StructArray, }; use arrow::datatypes::{ - ArrowPrimitiveType, DataType as ArrowType, Date32Type as ArrowDate32, Field, - Int32Type as ArrowInt32, Int64Type as ArrowInt64, - Time32MillisecondType as ArrowTime32MillisecondArray, + ArrowPrimitiveType, DataType as ArrowType, DataTypeContext, + Date32Type as ArrowDate32, Field, Int32Type as ArrowInt32, + Int64Type as ArrowInt64, Time32MillisecondType as ArrowTime32MillisecondArray, Time64MicrosecondType as ArrowTime64MicrosecondArray, TimestampMicrosecondType as ArrowTimestampMicrosecondType, TimestampMillisecondType as ArrowTimestampMillisecondType, @@ -2313,7 +2312,7 @@ mod tests { let mut list_array_reader = ListArrayReader::::new( Box::new(item_array_reader), - ArrowType::List(Box::new(Field::new("item", ArrowType::Int32, true))), + ArrowType::List(Box::new(DataTypeContext::new(ArrowType::Int32, false))), ArrowType::Int32, 1, 1, @@ -2367,7 +2366,7 @@ mod tests { let mut list_array_reader = ListArrayReader::::new( Box::new(item_array_reader), - ArrowType::LargeList(Box::new(Field::new("item", ArrowType::Int32, true))), + ArrowType::LargeList(Box::new(DataTypeContext::new(ArrowType::Int32, true))), ArrowType::Int32, 1, 1, diff --git a/rust/parquet/src/arrow/arrow_writer.rs b/rust/parquet/src/arrow/arrow_writer.rs index d103b4726d6..5dee4f4a9ca 100644 --- a/rust/parquet/src/arrow/arrow_writer.rs +++ b/rust/parquet/src/arrow/arrow_writer.rs @@ -670,8 +670,8 @@ mod tests { use std::sync::Arc; use arrow::array::*; - use arrow::datatypes::ToByteSlice; use arrow::datatypes::{DataType, Field, Schema, UInt32Type, UInt8Type}; + use arrow::datatypes::{DataTypeContext, ToByteSlice}; use arrow::record_batch::RecordBatch; use crate::arrow::{ArrowReader, ParquetFileArrowReader}; @@ -709,7 +709,7 @@ mod tests { // define schema let schema = Schema::new(vec![Field::new( "a", - DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), false, )]); @@ -722,11 +722,9 @@ mod tests { arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); // Construct a list array from the above two - let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new( - "items", - DataType::Int32, - true, - )))) + let a_list_data = ArrayData::builder(DataType::List(Box::new( + DataTypeContext::new(DataType::Int32, true), + ))) .len(5) .add_buffer(a_value_offsets) .add_child_data(a_values.data()) @@ -809,7 +807,7 @@ mod tests { let struct_field_f = Field::new("f", DataType::Float32, true); let struct_field_g = Field::new( "g", - DataType::List(Box::new(Field::new("items", DataType::Int16, false))), + DataType::List(Box::new(DataTypeContext::new(DataType::Int16, false))), false, ); let struct_field_e = Field::new( @@ -1233,11 +1231,9 @@ mod tests { let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); let a_value_offsets = arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); - let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - true, - )))) + let a_list_data = ArrayData::builder(DataType::List(Box::new( + DataTypeContext::new(DataType::Int32, true), + ))) .len(5) .add_buffer(a_value_offsets) .add_child_data(a_values.data()) @@ -1258,11 +1254,9 @@ mod tests { let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); let a_value_offsets = arrow::buffer::Buffer::from(&[0i64, 1, 3, 3, 6, 10].to_byte_slice()); - let a_list_data = ArrayData::builder(DataType::LargeList(Box::new(Field::new( - "large_item", - DataType::Int32, - true, - )))) + let a_list_data = ArrayData::builder(DataType::LargeList(Box::new( + DataTypeContext::new(DataType::Int32, true), + ))) .len(5) .add_buffer(a_value_offsets) .add_child_data(a_values.data()) diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs index 87a1004639e..1ce732e5604 100644 --- a/rust/parquet/src/arrow/schema.rs +++ b/rust/parquet/src/arrow/schema.rs @@ -26,7 +26,7 @@ use std::collections::{HashMap, HashSet}; use std::rc::Rc; -use arrow::datatypes::{DataType, DateUnit, Field, Schema, TimeUnit}; +use arrow::datatypes::{DataType, DataTypeContext, DateUnit, Field, Schema, TimeUnit}; use arrow::ipc::writer; use crate::basic::{LogicalType, Repetition, Type as PhysicalType}; @@ -406,18 +406,25 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_repetition(repetition) .build() } - DataType::List(f) | DataType::FixedSizeList(f, _) | DataType::LargeList(f) => { - Type::group_type_builder(name) - .with_fields(&mut vec![Rc::new( - Type::group_type_builder("list") - .with_fields(&mut vec![Rc::new(arrow_to_parquet_type(f)?)]) - .with_repetition(Repetition::REPEATED) - .build()?, - )]) - .with_logical_type(LogicalType::LIST) - .with_repetition(Repetition::REQUIRED) - .build() - } + DataType::List(type_ctx) + | DataType::FixedSizeList(type_ctx, _) + | DataType::LargeList(type_ctx) => Type::group_type_builder(name) + .with_fields(&mut vec![Rc::new( + Type::group_type_builder("list") + .with_fields(&mut vec![Rc::new({ + let list_field = Field::new( + "element", + type_ctx.data_type().clone(), + type_ctx.is_nullable(), + ); + arrow_to_parquet_type(&list_field)? + })]) + .with_repetition(Repetition::REPEATED) + .build()?, + )]) + .with_logical_type(LogicalType::LIST) + .with_repetition(Repetition::REQUIRED) + .build(), DataType::Struct(fields) => { if fields.is_empty() { return Err(ArrowError( @@ -532,8 +539,7 @@ impl ParquetTypeConverter<'_> { if self.is_self_included() { self.to_primitive_type_inner().map(|dt| { if self.is_repeated() { - Some(DataType::List(Box::new(Field::new( - self.schema.name(), + Some(DataType::List(Box::new(DataTypeContext::new( dt, self.is_nullable(), )))) @@ -632,11 +638,7 @@ impl ParquetTypeConverter<'_> { if self.is_repeated() { self.to_struct().map(|opt| { opt.map(|dt| { - DataType::List(Box::new(Field::new( - self.schema.name(), - dt, - self.is_nullable(), - ))) + DataType::List(Box::new(DataTypeContext::new(dt, self.is_nullable()))) }) }) } else { @@ -725,8 +727,7 @@ impl ParquetTypeConverter<'_> { item_type.map(|opt| { opt.map(|dt| { - DataType::List(Box::new(Field::new( - list_item.name(), + DataType::List(Box::new(DataTypeContext::new( dt, list_item.is_optional(), ))) @@ -746,7 +747,9 @@ mod tests { use std::{collections::HashMap, convert::TryFrom, sync::Arc}; - use arrow::datatypes::{DataType, DateUnit, Field, IntervalUnit, TimeUnit}; + use arrow::datatypes::{ + DataType, DataTypeContext, DateUnit, Field, IntervalUnit, TimeUnit, + }; use crate::file::{metadata::KeyValue, reader::SerializedFileReader}; use crate::{ @@ -905,7 +908,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("list", DataType::Utf8, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Utf8, true))), false, )); } @@ -919,7 +922,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("list", DataType::Utf8, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Utf8, true))), true, )); } @@ -938,10 +941,10 @@ mod tests { // } { let arrow_inner_list = - DataType::List(Box::new(Field::new("list", DataType::Int32, true))); + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))); arrow_fields.push(Field::new( "array_of_arrays", - DataType::List(Box::new(Field::new("list", arrow_inner_list, true))), + DataType::List(Box::new(DataTypeContext::new(arrow_inner_list, true))), true, )); } @@ -955,7 +958,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", DataType::Utf8, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Utf8, true))), true, )); } @@ -967,7 +970,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", DataType::Int32, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), true, )); } @@ -986,7 +989,7 @@ mod tests { ]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", arrow_struct, true))), + DataType::List(Box::new(DataTypeContext::new(arrow_struct, true))), true, )); } @@ -1003,7 +1006,7 @@ mod tests { DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("array", arrow_struct, true))), + DataType::List(Box::new(DataTypeContext::new(arrow_struct, true))), true, )); } @@ -1020,7 +1023,7 @@ mod tests { DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, true))), + DataType::List(Box::new(DataTypeContext::new(arrow_struct, true))), true, )); } @@ -1030,7 +1033,7 @@ mod tests { { arrow_fields.push(Field::new( "name", - DataType::List(Box::new(Field::new("name", DataType::Int32, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), true, )); } @@ -1196,8 +1199,7 @@ mod tests { let inner_group_list = Field::new( "innerGroup", - DataType::List(Box::new(Field::new( - "innerGroup", + DataType::List(Box::new(DataTypeContext::new( DataType::Struct(vec![Field::new("leaf3", DataType::Int32, true)]), true, ))), @@ -1206,8 +1208,7 @@ mod tests { let outer_group_list = Field::new( "outerGroup", - DataType::List(Box::new(Field::new( - "outerGroup", + DataType::List(Box::new(DataTypeContext::new( DataType::Struct(vec![ Field::new("leaf2", DataType::Int32, true), inner_group_list, @@ -1283,7 +1284,7 @@ mod tests { Field::new("string", DataType::Utf8, true), Field::new( "bools", - DataType::List(Box::new(Field::new("bools", DataType::Boolean, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Boolean, true))), true, ), Field::new("date", DataType::Date32(DateUnit::Day), true), @@ -1353,7 +1354,7 @@ mod tests { Field::new("string", DataType::Utf8, true), Field::new( "bools", - DataType::List(Box::new(Field::new("element", DataType::Boolean, true))), + DataType::List(Box::new(DataTypeContext::new(DataType::Boolean, true))), true, ), Field::new("date", DataType::Date32(DateUnit::Day), true), @@ -1376,8 +1377,7 @@ mod tests { Field::new("uint32", DataType::UInt32, false), Field::new( "int32", - DataType::List(Box::new(Field::new( - "element", + DataType::List(Box::new(DataTypeContext::new( DataType::Int32, true, ))), @@ -1490,7 +1490,10 @@ mod tests { Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), Field::new( "c21", - DataType::List(Box::new(Field::new("list", DataType::Boolean, true))), + DataType::List(Box::new(DataTypeContext::new( + DataType::Boolean, + true, + ))), false, ), // Field::new( @@ -1585,8 +1588,7 @@ mod tests { vec![ Field::new( "c21", - DataType::List(Box::new(Field::new( - "array", + DataType::List(Box::new(DataTypeContext::new( DataType::Boolean, true, ))), @@ -1595,17 +1597,15 @@ mod tests { Field::new( "c22", DataType::FixedSizeList( - Box::new(Field::new("items", DataType::Boolean, false)), + Box::new(DataTypeContext::new(DataType::Boolean, false)), 5, ), false, ), Field::new( "c23", - DataType::List(Box::new(Field::new( - "items", - DataType::LargeList(Box::new(Field::new( - "items", + DataType::List(Box::new(DataTypeContext::new( + DataType::LargeList(Box::new(DataTypeContext::new( DataType::Struct(vec![ Field::new("a", DataType::Int16, true), Field::new("b", DataType::Float64, false), From 3ab2126b39cbddc3e46f346473f8e39ea14b2814 Mon Sep 17 00:00:00 2001 From: Christoph Schulze Date: Mon, 23 Nov 2020 22:04:05 +0100 Subject: [PATCH 08/15] ARROW-10656 Fix test after merging --- rust/arrow/src/array/builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index f6f699354a6..c61ab71cc97 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -1693,7 +1693,7 @@ impl ArrayBuilder for DecimalBuilder { )) as ArrayDataRef; let list_data = Arc::new(ArrayData::new( DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt8, true)), + Box::new(DataTypeContext::new(DataType::UInt8, true)), self.builder.list_len, ), array.len(), From d62749f2ebe706dbb5ab0bfa83e89c0c826cabe5 Mon Sep 17 00:00:00 2001 From: Christoph Schulze Date: Tue, 24 Nov 2020 14:02:47 +0100 Subject: [PATCH 09/15] ARROW-10656 keep prior conversion interface --- rust/arrow/src/ipc/convert.rs | 36 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index d105adee93c..646e8eb6a0a 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -152,6 +152,12 @@ pub fn schema_from_bytes(bytes: &[u8]) -> Option { /// Get the Arrow data type from the flatbuffer Field table pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataType { + get_data_type_context(field, may_be_dictionary) + .data_type() + .clone() +} + +fn get_data_type_context(field: ipc::Field, may_be_dictionary: bool) -> DataTypeContext { if let Some(dictionary) = field.dictionary() { if may_be_dictionary { let int = dictionary.indexType().unwrap(); @@ -166,14 +172,16 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT (64, false) => DataType::UInt64, _ => panic!("Unexpected bitwidth and signed"), }; - return DataType::Dictionary( - Box::new(index_type), - Box::new(get_data_type(field, false)), + let value_type = get_data_type_context(field, false).data_type().clone(); + return DataTypeContext::new( + DataType::Dictionary(Box::new(index_type), Box::new(value_type)), + // taking nullability from parent field + field.nullable(), ); } } - match field.type_type() { + let data_type = match field.type_type() { ipc::Type::Null => DataType::Null, ipc::Type::Bool => DataType::Boolean, ipc::Type::Int => { @@ -271,11 +279,7 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT panic!("expect a list to have one child") } let child_field = children.get(0); - // returning int16 for now, to test, not sure how to get data type - DataType::List(Box::new(DataTypeContext::new( - get_data_type(child_field, false), - child_field.nullable(), - ))) + DataType::List(Box::new(get_data_type_context(child_field, false))) } ipc::Type::LargeList => { let children = field.children().unwrap(); @@ -283,10 +287,7 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT panic!("expect a large list to have one child") } let child_field = children.get(0); - DataType::LargeList(Box::new(DataTypeContext::new( - get_data_type(child_field, false), - child_field.nullable(), - ))) + DataType::LargeList(Box::new(get_data_type_context(child_field, false))) } ipc::Type::FixedSizeList => { let children = field.children().unwrap(); @@ -296,10 +297,7 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT let fsl = field.type_as_fixed_size_list().unwrap(); let child_field = children.get(0); DataType::FixedSizeList( - Box::new(DataTypeContext::new( - get_data_type(child_field, false), - child_field.nullable(), - )), + Box::new(get_data_type_context(child_field, false)), fsl.listSize(), ) } @@ -314,7 +312,9 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT DataType::Struct(fields) } t => unimplemented!("Type {:?} not supported", t), - } + }; + + DataTypeContext::new(data_type, field.nullable()) } pub(crate) struct FBFieldType<'b> { From 4cdd16958260ad75f2b8c551786819939df16fcc Mon Sep 17 00:00:00 2001 From: Christoph Schulze Date: Tue, 24 Nov 2020 15:52:51 +0100 Subject: [PATCH 10/15] Rename DataTypeContext to NullableDataType --- rust/arrow/examples/builders.rs | 4 +- rust/arrow/src/array/array_binary.rs | 4 +- rust/arrow/src/array/array_list.rs | 24 ++--- rust/arrow/src/array/builder.rs | 39 ++++---- rust/arrow/src/compute/kernels/cast.rs | 32 +++---- rust/arrow/src/compute/kernels/comparison.rs | 4 +- rust/arrow/src/compute/kernels/filter.rs | 2 +- rust/arrow/src/compute/kernels/limit.rs | 4 +- rust/arrow/src/compute/kernels/take.rs | 8 +- rust/arrow/src/compute/util.rs | 4 +- rust/arrow/src/datatypes.rs | 95 +++++++++---------- rust/arrow/src/ipc/convert.rs | 18 ++-- rust/arrow/src/json/reader.rs | 70 +++++++------- rust/arrow/src/record_batch.rs | 4 +- rust/arrow/src/util/integration_util.rs | 6 +- .../src/physical_plan/distinct_expressions.rs | 4 +- .../datafusion/src/physical_plan/functions.rs | 6 +- rust/datafusion/src/physical_plan/planner.rs | 2 +- rust/datafusion/src/scalar.rs | 4 +- rust/datafusion/tests/sql.rs | 6 +- rust/parquet/src/arrow/array_reader.rs | 16 ++-- rust/parquet/src/arrow/arrow_writer.rs | 12 +-- rust/parquet/src/arrow/schema.rs | 53 ++++++----- 23 files changed, 211 insertions(+), 210 deletions(-) diff --git a/rust/arrow/examples/builders.rs b/rust/arrow/examples/builders.rs index 0ec3d316a67..e4f7eca1caa 100644 --- a/rust/arrow/examples/builders.rs +++ b/rust/arrow/examples/builders.rs @@ -26,7 +26,7 @@ use arrow::array::{ }; use arrow::buffer::Buffer; use arrow::datatypes::{ - DataType, DataTypeContext, Date64Type, Field, Time64NanosecondType, ToByteSlice, + DataType, Date64Type, Field, NullableDataType, Time64NanosecondType, ToByteSlice, }; fn main() { @@ -102,7 +102,7 @@ fn main() { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); + DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/rust/arrow/src/array/array_binary.rs b/rust/arrow/src/array/array_binary.rs index 08ac7f4e783..ef1cf8d8b2d 100644 --- a/rust/arrow/src/array/array_binary.rs +++ b/rust/arrow/src/array/array_binary.rs @@ -596,7 +596,7 @@ impl Array for DecimalArray { #[cfg(test)] mod tests { - use crate::datatypes::DataTypeContext; + use crate::datatypes::NullableDataType; use super::*; @@ -908,7 +908,7 @@ mod tests { .build(); let array_data = ArrayData::builder(DataType::FixedSizeList( - Box::new(DataTypeContext::new(DataType::Binary, false)), + Box::new(NullableDataType::new(DataType::Binary, false)), 4, )) .len(3) diff --git a/rust/arrow/src/array/array_list.rs b/rust/arrow/src/array/array_list.rs index 91684425165..00e7d29d0b8 100644 --- a/rust/arrow/src/array/array_list.rs +++ b/rust/arrow/src/array/array_list.rs @@ -302,7 +302,7 @@ mod tests { }; use super::*; - use crate::datatypes::DataTypeContext; + use crate::datatypes::NullableDataType; #[test] fn test_list_array() { @@ -318,7 +318,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); + DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -388,7 +388,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::LargeList(Box::new(DataTypeContext::new(DataType::Int32, false))); + DataType::LargeList(Box::new(NullableDataType::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -454,7 +454,7 @@ mod tests { // Construct a list array from the above two let list_data_type = DataType::FixedSizeList( - Box::new(DataTypeContext::new(DataType::Int32, false)), + Box::new(NullableDataType::new(DataType::Int32, false)), 3, ); let list_data = ArrayData::builder(list_data_type.clone()) @@ -523,7 +523,7 @@ mod tests { // Construct a list array from the above two let list_data_type = DataType::FixedSizeList( - Box::new(DataTypeContext::new(DataType::Int32, false)), + Box::new(NullableDataType::new(DataType::Int32, false)), 3, ); let list_data = ArrayData::builder(list_data_type) @@ -557,7 +557,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); + DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) @@ -622,7 +622,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::LargeList(Box::new(DataTypeContext::new(DataType::Int32, false))); + DataType::LargeList(Box::new(NullableDataType::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) @@ -685,7 +685,7 @@ mod tests { // Construct a fixed size list array from the above two let list_data_type = DataType::FixedSizeList( - Box::new(DataTypeContext::new(DataType::Int32, false)), + Box::new(NullableDataType::new(DataType::Int32, false)), 2, ); let list_data = ArrayData::builder(list_data_type) @@ -736,7 +736,7 @@ mod tests { .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice())) .build(); let list_data_type = - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); + DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_child_data(value_data) @@ -751,7 +751,7 @@ mod tests { fn test_list_array_invalid_child_array_len() { let value_offsets = Buffer::from(&[0, 2, 5, 7].to_byte_slice()); let list_data_type = - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); + DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -770,7 +770,7 @@ mod tests { let value_offsets = Buffer::from(&[2, 2, 5, 7].to_byte_slice()); let list_data_type = - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); + DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -802,7 +802,7 @@ mod tests { .build(); let list_data_type = - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); + DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .add_buffer(buf2) .add_child_data(value_data) diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index c61ab71cc97..11d375a932b 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -764,7 +764,7 @@ where /// /// This is used for validating array data types in `append_data` fn data_type(&self) -> DataType { - DataType::List(Box::new(DataTypeContext::new( + DataType::List(Box::new(NullableDataType::new( self.values_builder.data_type(), true, ))) @@ -833,7 +833,7 @@ where let null_bit_buffer = self.bitmap_builder.finish(); let nulls = null_bit_buffer.count_set_bits(); self.offsets_builder.append(0).unwrap(); - let data = ArrayData::builder(DataType::List(Box::new(DataTypeContext::new( + let data = ArrayData::builder(DataType::List(Box::new(NullableDataType::new( values_data.data_type().clone(), true, // TODO: find a consistent way of getting this )))) @@ -974,7 +974,7 @@ where /// /// This is used for validating array data types in `append_data` fn data_type(&self) -> DataType { - DataType::LargeList(Box::new(DataTypeContext::new( + DataType::LargeList(Box::new(NullableDataType::new( self.values_builder.data_type(), true, ))) @@ -1044,7 +1044,7 @@ where let nulls = null_bit_buffer.count_set_bits(); self.offsets_builder.append(0).unwrap(); let data = ArrayData::builder(DataType::LargeList(Box::new( - DataTypeContext::new(values_data.data_type().clone(), true), + NullableDataType::new(values_data.data_type().clone(), true), ))) .len(len) .null_count(len - nulls) @@ -1153,7 +1153,7 @@ where /// This is used for validating array data types in `append_data` fn data_type(&self) -> DataType { DataType::FixedSizeList( - Box::new(DataTypeContext::new(self.values_builder.data_type(), true)), + Box::new(NullableDataType::new(self.values_builder.data_type(), true)), self.list_len, ) } @@ -1232,7 +1232,7 @@ where let null_bit_buffer = self.bitmap_builder.finish(); let nulls = null_bit_buffer.count_set_bits(); let data = ArrayData::builder(DataType::FixedSizeList( - Box::new(DataTypeContext::new(values_data.data_type().clone(), true)), + Box::new(NullableDataType::new(values_data.data_type().clone(), true)), self.list_len, )) .len(len) @@ -1453,7 +1453,10 @@ fn append_binary_data( )) as ArrayDataRef; Arc::new(ArrayData::new( - DataType::List(Box::new(DataTypeContext::new(DataType::UInt8, true))), + DataType::List(Box::new(NullableDataType::new( + DataType::UInt8, + true, + ))), array.len(), None, array.null_buffer().cloned(), @@ -1505,7 +1508,7 @@ fn append_large_binary_data( )) as ArrayDataRef; Arc::new(ArrayData::new( - DataType::LargeList(Box::new(DataTypeContext::new( + DataType::LargeList(Box::new(NullableDataType::new( DataType::UInt8, true, ))), @@ -1607,7 +1610,7 @@ impl ArrayBuilder for FixedSizeBinaryBuilder { )) as ArrayDataRef; let list_data = Arc::new(ArrayData::new( DataType::FixedSizeList( - Box::new(DataTypeContext::new(DataType::UInt8, true)), + Box::new(NullableDataType::new(DataType::UInt8, true)), self.builder.list_len, ), array.len(), @@ -1693,7 +1696,7 @@ impl ArrayBuilder for DecimalBuilder { )) as ArrayDataRef; let list_data = Arc::new(ArrayData::new( DataType::FixedSizeList( - Box::new(DataTypeContext::new(DataType::UInt8, true)), + Box::new(NullableDataType::new(DataType::UInt8, true)), self.builder.list_len, ), array.len(), @@ -3814,13 +3817,13 @@ mod tests { #[test] #[should_panic( - expected = "Data type List(DataTypeContext { data_type: Int64, nullable: true }) is not currently supported" + expected = "Data type List(NullableDataType { data_type: Int64, nullable: true }) is not currently supported" )] fn test_struct_array_builder_from_schema_unsupported_type() { let mut fields = Vec::new(); fields.push(Field::new("f1", DataType::Int16, false)); let list_type = - DataType::List(Box::new(DataTypeContext::new(DataType::Int64, true))); + DataType::List(Box::new(NullableDataType::new(DataType::Int64, true))); fields.push(Field::new("f2", list_type, false)); let _ = StructBuilder::from_fields(fields, 5); @@ -4119,7 +4122,7 @@ mod tests { let list_value_offsets = Buffer::from(&[0, 3, 5, 11, 13, 13, 15, 15, 17].to_byte_slice()); let expected_list_data = ArrayData::new( - DataType::List(Box::new(DataTypeContext::new(DataType::Int64, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Int64, true))), 8, None, None, @@ -4205,7 +4208,7 @@ mod tests { &[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23].to_byte_slice(), ); let expected_list_data = ArrayData::new( - DataType::List(Box::new(DataTypeContext::new(DataType::Int64, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Int64, true))), 12, None, None, @@ -4247,7 +4250,7 @@ mod tests { ]); let list_value_offsets = Buffer::from(&[0, 2, 3, 6].to_byte_slice()); let list_data = ArrayData::new( - DataType::List(Box::new(DataTypeContext::new(DataType::Utf8, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Utf8, true))), 3, None, None, @@ -4282,7 +4285,7 @@ mod tests { ]); let list_value_offsets = Buffer::from(&[0, 2, 2, 4, 5, 8, 9, 12].to_byte_slice()); let expected_list_data = ArrayData::new( - DataType::List(Box::new(DataTypeContext::new(DataType::Utf8, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Utf8, true))), 7, None, None, // is this correct? @@ -4371,7 +4374,7 @@ mod tests { ]); let expected_list_data = ArrayData::new( DataType::FixedSizeList( - Box::new(DataTypeContext::new(DataType::UInt16, true)), + Box::new(NullableDataType::new(DataType::UInt16, true)), 2, ), 12, @@ -4444,7 +4447,7 @@ mod tests { ]); let expected_list_data = ArrayData::new( DataType::FixedSizeList( - Box::new(DataTypeContext::new(DataType::UInt8, true)), + Box::new(NullableDataType::new(DataType::UInt8, true)), 2, ), 12, diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs index b8fbff756c5..96254c4e16d 100644 --- a/rust/arrow/src/compute/kernels/cast.rs +++ b/rust/arrow/src/compute/kernels/cast.rs @@ -1237,7 +1237,7 @@ mod tests { let array = Arc::new(a) as ArrayRef; let b = cast( &array, - &DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), + &DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), ) .unwrap(); assert_eq!(5, b.len()); @@ -1267,7 +1267,7 @@ mod tests { let array = Arc::new(a) as ArrayRef; let b = cast( &array, - &DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), + &DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), ) .unwrap(); assert_eq!(5, b.len()); @@ -1300,7 +1300,7 @@ mod tests { let array = array.slice(2, 4); let b = cast( &array, - &DataType::List(Box::new(DataTypeContext::new(DataType::Float64, true))), + &DataType::List(Box::new(NullableDataType::new(DataType::Float64, true))), ) .unwrap(); assert_eq!(4, b.len()); @@ -1377,7 +1377,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))); + DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -1387,7 +1387,7 @@ mod tests { let cast_array = cast( &list_array, - &DataType::List(Box::new(DataTypeContext::new(DataType::UInt16, true))), + &DataType::List(Box::new(NullableDataType::new(DataType::UInt16, true))), ) .unwrap(); // 3 negative values should get lost when casting to unsigned, @@ -1436,7 +1436,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))); + DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -1446,7 +1446,7 @@ mod tests { cast( &list_array, - &DataType::List(Box::new(DataTypeContext::new( + &DataType::List(Box::new(NullableDataType::new( DataType::Timestamp(TimeUnit::Microsecond, None), true, ))), @@ -2853,7 +2853,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))); + DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -2875,7 +2875,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::LargeList(Box::new(DataTypeContext::new(DataType::Int32, true))); + DataType::LargeList(Box::new(NullableDataType::new(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -2895,7 +2895,7 @@ mod tests { // Construct a fixed size list array from the above two let list_data_type = DataType::FixedSizeList( - Box::new(DataTypeContext::new(DataType::Int32, true)), + Box::new(NullableDataType::new(DataType::Int32, true)), 2, ); let list_data = ArrayData::builder(list_data_type) @@ -2988,12 +2988,12 @@ mod tests { LargeBinary, Utf8, LargeUtf8, - List(Box::new(DataTypeContext::new(DataType::Int8, true))), - List(Box::new(DataTypeContext::new(DataType::Utf8, true))), - FixedSizeList(Box::new(DataTypeContext::new(DataType::Int8, true)), 10), - FixedSizeList(Box::new(DataTypeContext::new(DataType::Utf8, false)), 10), - LargeList(Box::new(DataTypeContext::new(DataType::Int8, true))), - LargeList(Box::new(DataTypeContext::new(DataType::Utf8, false))), + List(Box::new(NullableDataType::new(DataType::Int8, true))), + List(Box::new(NullableDataType::new(DataType::Utf8, true))), + FixedSizeList(Box::new(NullableDataType::new(DataType::Int8, true)), 10), + FixedSizeList(Box::new(NullableDataType::new(DataType::Utf8, false)), 10), + LargeList(Box::new(NullableDataType::new(DataType::Int8, true))), + LargeList(Box::new(NullableDataType::new(DataType::Utf8, false))), Struct(vec![ Field::new("f1", DataType::Int32, false), Field::new("f2", DataType::Utf8, true), diff --git a/rust/arrow/src/compute/kernels/comparison.rs b/rust/arrow/src/compute/kernels/comparison.rs index b7ff52075e6..91877f86883 100644 --- a/rust/arrow/src/compute/kernels/comparison.rs +++ b/rust/arrow/src/compute/kernels/comparison.rs @@ -736,7 +736,7 @@ fn new_all_set_buffer(len: usize) -> Buffer { mod tests { use super::*; use crate::array::Int32Array; - use crate::datatypes::{DataTypeContext, Int8Type, ToByteSlice}; + use crate::datatypes::{Int8Type, NullableDataType, ToByteSlice}; #[test] fn test_primitive_array_eq() { @@ -1005,7 +1005,7 @@ mod tests { .data(); let value_offsets = Buffer::from(&[0i64, 3, 6, 6, 9].to_byte_slice()); let list_data_type = - DataType::LargeList(Box::new(DataTypeContext::new(DataType::Int32, true))); + DataType::LargeList(Box::new(NullableDataType::new(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(4) .add_buffer(value_offsets) diff --git a/rust/arrow/src/compute/kernels/filter.rs b/rust/arrow/src/compute/kernels/filter.rs index 31beb33ee2f..1f5443a982e 100644 --- a/rust/arrow/src/compute/kernels/filter.rs +++ b/rust/arrow/src/compute/kernels/filter.rs @@ -1085,7 +1085,7 @@ mod tests { let value_offsets = Buffer::from(&[0i64, 3, 6, 8, 8].to_byte_slice()); let list_data_type = - DataType::LargeList(Box::new(DataTypeContext::new(DataType::Int32, false))); + DataType::LargeList(Box::new(NullableDataType::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(4) .add_buffer(value_offsets) diff --git a/rust/arrow/src/compute/kernels/limit.rs b/rust/arrow/src/compute/kernels/limit.rs index 50d5747b4a0..5e182e6bc49 100644 --- a/rust/arrow/src/compute/kernels/limit.rs +++ b/rust/arrow/src/compute/kernels/limit.rs @@ -35,7 +35,7 @@ mod tests { use super::*; use crate::array::*; use crate::buffer::Buffer; - use crate::datatypes::{DataType, DataTypeContext, Field, ToByteSlice}; + use crate::datatypes::{DataType, Field, NullableDataType, ToByteSlice}; use crate::util::bit_util; use std::sync::Arc; @@ -110,7 +110,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); + DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs index 0c8af67f309..41259718f65 100644 --- a/rust/arrow/src/compute/kernels/take.rs +++ b/rust/arrow/src/compute/kernels/take.rs @@ -767,7 +767,7 @@ mod tests { let value_offsets = Buffer::from(&value_offsets.to_byte_slice()); // Construct a list array from the above two let list_data_type = DataType::$list_data_type(Box::new( - DataTypeContext::new(DataType::Int32, false), + NullableDataType::new(DataType::Int32, false), )); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) @@ -838,7 +838,7 @@ mod tests { let value_offsets = Buffer::from(&value_offsets.to_byte_slice()); // Construct a list array from the above two let list_data_type = DataType::$list_data_type(Box::new( - DataTypeContext::new(DataType::Int32, false), + NullableDataType::new(DataType::Int32, false), )); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) @@ -909,7 +909,7 @@ mod tests { let value_offsets = Buffer::from(&value_offsets.to_byte_slice()); // Construct a list array from the above two let list_data_type = DataType::$list_data_type(Box::new( - DataTypeContext::new(DataType::Int32, false), + NullableDataType::new(DataType::Int32, false), )); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) @@ -1001,7 +1001,7 @@ mod tests { let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice()); // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))); + DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs index 2112bdeeaa6..0fd0e64355a 100644 --- a/rust/arrow/src/compute/util.rs +++ b/rust/arrow/src/compute/util.rs @@ -321,7 +321,7 @@ mod tests { #[test] fn test_take_value_index_from_list() { let list = build_list( - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), vec![0i32, 2i32, 5i32, 10i32], ); @@ -337,7 +337,7 @@ mod tests { #[test] fn test_take_value_index_from_large_list() { let list = build_list( - DataType::LargeList(Box::new(DataTypeContext::new(DataType::Int32, false))), + DataType::LargeList(Box::new(NullableDataType::new(DataType::Int32, false))), Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), vec![0i64, 2i64, 5i64, 10i64], ); diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index a86aedc76d2..dc7ae77f344 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -125,11 +125,11 @@ pub enum DataType { /// A variable-length string in Unicode with UFT-8 encoding and 64-bit offsets. LargeUtf8, /// A list of some logical data type with variable length. - List(Box), + List(Box), /// A list of some logical data type with fixed length. - FixedSizeList(Box, i32), + FixedSizeList(Box, i32), /// A list of some logical data type with variable length and 64-bit offsets. - LargeList(Box), + LargeList(Box), /// A nested datatype that contains a number of sub-fields. Struct(Vec), /// A nested datatype that can represent slots of differing types. @@ -149,9 +149,9 @@ pub enum DataType { Decimal(usize, usize), } -/// Data type context that holds additional metadata +/// Extends data type with nullability #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub struct DataTypeContext { +pub struct NullableDataType { data_type: DataType, nullable: bool, } @@ -196,8 +196,7 @@ pub enum IntervalUnit { #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct Field { name: String, - data_type: DataType, - nullable: bool, + data_type: NullableDataType, dict_id: i64, dict_is_ordered: bool, } @@ -883,7 +882,7 @@ impl ToByteSlice for T { impl DataType { /// Parse a data type from a JSON representation pub(crate) fn from(json: &Value) -> Result { - let default_dt_ctx = DataTypeContext::new(DataType::Boolean, true); + let default_dt_ctx = NullableDataType::new(DataType::Boolean, true); match *json { Value::Object(ref map) => match map.get("name") { Some(s) if s == "null" => Ok(DataType::Null), @@ -1156,10 +1155,10 @@ impl DataType { } } -impl DataTypeContext { +impl NullableDataType { /// Creates a new data type context pub fn new(data_type: DataType, nullable: bool) -> Self { - DataTypeContext { + NullableDataType { data_type, nullable, } @@ -1183,9 +1182,7 @@ impl Field { pub fn new(name: &str, data_type: DataType, nullable: bool) -> Self { Field { name: name.to_string(), - //todo: combine data type and nullability in type context - data_type, - nullable, + data_type: NullableDataType::new(data_type, nullable), dict_id: 0, dict_is_ordered: false, } @@ -1201,8 +1198,7 @@ impl Field { ) -> Self { Field { name: name.to_string(), - data_type, - nullable, + data_type: NullableDataType::new(data_type, nullable), dict_id, dict_is_ordered, } @@ -1217,13 +1213,13 @@ impl Field { /// Returns an immutable reference to the `Field`'s data-type #[inline] pub const fn data_type(&self) -> &DataType { - &self.data_type + self.data_type.data_type() } /// Indicates whether this `Field` supports null values #[inline] pub const fn is_nullable(&self) -> bool { - self.nullable + self.data_type.nullable } /// Returns the dictionary ID @@ -1278,9 +1274,9 @@ impl Field { )); } let nested_field = Self::from(&values[0])?; - let nexted_dt_ctx = DataTypeContext::new( - nested_field.data_type, - nested_field.nullable, + let nexted_dt_ctx = NullableDataType::new( + nested_field.data_type.data_type, + nested_field.data_type.nullable, ); match data_type { DataType::List(_) => DataType::List(Box::new( @@ -1367,8 +1363,7 @@ impl Field { }; Ok(Field { name, - nullable, - data_type, + data_type: NullableDataType::new(data_type, nullable), dict_id, dict_is_ordered, }) @@ -1412,7 +1407,7 @@ impl Field { match self.data_type() { DataType::Dictionary(ref index_type, ref value_type) => json!({ "name": self.name, - "nullable": self.nullable, + "nullable": self.data_type.nullable, "type": value_type.to_json(), "children": children, "dictionary": { @@ -1423,8 +1418,8 @@ impl Field { }), _ => json!({ "name": self.name, - "nullable": self.nullable, - "type": self.data_type.to_json(), + "nullable": self.data_type.is_nullable(), + "type": self.data_type.data_type().to_json(), "children": children }), } @@ -1453,8 +1448,8 @@ impl Field { .to_string(), )); } - match &mut self.data_type { - DataType::Struct(nested_fields) => match &from.data_type { + match &mut self.data_type.data_type { + DataType::Struct(nested_fields) => match &from.data_type.data_type { DataType::Struct(from_nested_fields) => { for from_field in from_nested_fields { let mut is_new_field = true; @@ -1477,7 +1472,7 @@ impl Field { )); } }, - DataType::Union(nested_fields) => match &from.data_type { + DataType::Union(nested_fields) => match &from.data_type.data_type { DataType::Union(from_nested_fields) => { for from_field in from_nested_fields { let mut is_new_field = true; @@ -1529,7 +1524,7 @@ impl Field { | DataType::Utf8 | DataType::LargeUtf8 | DataType::Decimal(_, _) => { - if self.data_type != from.data_type { + if self.data_type.data_type != from.data_type.data_type { return Err(ArrowError::SchemaError( "Fail to merge schema Field due to conflicting datatype" .to_string(), @@ -1537,8 +1532,8 @@ impl Field { } } } - if from.nullable { - self.nullable = from.nullable; + if from.data_type.nullable { + self.data_type.nullable = from.data_type.nullable; } Ok(()) @@ -1547,7 +1542,7 @@ impl Field { impl fmt::Display for Field { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}: {:?}", self.name, self.data_type) + write!(f, "{}: {:?}", self.name, self.data_type.data_type) } } @@ -1867,12 +1862,12 @@ mod tests { assert_eq!( "{\"Struct\":[\ - {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ - {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ - {\"name\":\"address\",\"data_type\":{\"Struct\":\ - [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ - {\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false}\ - ]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false}]}", + {\"name\":\"first_name\",\"data_type\":{\"data_type\":\"Utf8\",\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false},\ + {\"name\":\"last_name\",\"data_type\":{\"data_type\":\"Utf8\",\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false},\ + {\"name\":\"address\",\"data_type\":{\"data_type\":{\"Struct\":\ + [{\"name\":\"street\",\"data_type\":{\"data_type\":\"Utf8\",\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false},\ + {\"name\":\"zip\",\"data_type\":{\"data_type\":\"UInt16\",\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false}\ + ]},\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false}]}", serialized ); @@ -2053,7 +2048,7 @@ mod tests { Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), Field::new( "c21", - DataType::List(Box::new(DataTypeContext::new( + DataType::List(Box::new(NullableDataType::new( DataType::Boolean, true, ))), @@ -2062,15 +2057,15 @@ mod tests { Field::new( "c22", DataType::FixedSizeList( - Box::new(DataTypeContext::new(DataType::Boolean, false)), + Box::new(NullableDataType::new(DataType::Boolean, false)), 5, ), false, ), Field::new( "c23", - DataType::List(Box::new(DataTypeContext::new( - DataType::List(Box::new(DataTypeContext::new( + DataType::List(Box::new(NullableDataType::new( + DataType::List(Box::new(NullableDataType::new( DataType::Struct(vec![]), true, ))), @@ -2106,8 +2101,8 @@ mod tests { Field::new("c33", DataType::LargeUtf8, true), Field::new( "c34", - DataType::LargeList(Box::new(DataTypeContext::new( - DataType::LargeList(Box::new(DataTypeContext::new( + DataType::LargeList(Box::new(NullableDataType::new( + DataType::LargeList(Box::new(NullableDataType::new( DataType::Struct(vec![]), false, ))), @@ -2566,8 +2561,8 @@ mod tests { assert_eq!(schema.to_string(), "first_name: Utf8, \ last_name: Utf8, \ address: Struct([\ - Field { name: \"street\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false }, \ - Field { name: \"zip\", data_type: UInt16, nullable: false, dict_id: 0, dict_is_ordered: false }])") + Field { name: \"street\", data_type: NullableDataType { data_type: Utf8, nullable: false }, dict_id: 0, dict_is_ordered: false }, \ + Field { name: \"zip\", data_type: NullableDataType { data_type: UInt16, nullable: false }, dict_id: 0, dict_is_ordered: false }])") } #[test] @@ -2804,11 +2799,11 @@ mod tests { #[test] fn test_compare_nested_types() { - let list_type_a = &DataType::List(Box::new(DataTypeContext::new( + let list_type_a = &DataType::List(Box::new(NullableDataType::new( DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), true, ))); - let list_type_b = &DataType::List(Box::new(DataTypeContext::new( + let list_type_b = &DataType::List(Box::new(NullableDataType::new( DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), true, ))); @@ -2818,11 +2813,11 @@ mod tests { #[test] fn test_compare_mismatching_types() { - let list_type_a = &DataType::LargeList(Box::new(DataTypeContext::new( + let list_type_a = &DataType::LargeList(Box::new(NullableDataType::new( DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), true, ))); - let list_type_b = &DataType::LargeList(Box::new(DataTypeContext::new( + let list_type_b = &DataType::LargeList(Box::new(NullableDataType::new( DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), false, ))); diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index 646e8eb6a0a..460c1b9c29c 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -18,7 +18,7 @@ //! Utilities for converting between IPC types and native Arrow types use crate::datatypes::{ - DataType, DataTypeContext, DateUnit, Field, IntervalUnit, Schema, TimeUnit, + DataType, DateUnit, Field, IntervalUnit, NullableDataType, Schema, TimeUnit, }; use crate::ipc; @@ -157,7 +157,7 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT .clone() } -fn get_data_type_context(field: ipc::Field, may_be_dictionary: bool) -> DataTypeContext { +fn get_data_type_context(field: ipc::Field, may_be_dictionary: bool) -> NullableDataType { if let Some(dictionary) = field.dictionary() { if may_be_dictionary { let int = dictionary.indexType().unwrap(); @@ -173,7 +173,7 @@ fn get_data_type_context(field: ipc::Field, may_be_dictionary: bool) -> DataType _ => panic!("Unexpected bitwidth and signed"), }; let value_type = get_data_type_context(field, false).data_type().clone(); - return DataTypeContext::new( + return NullableDataType::new( DataType::Dictionary(Box::new(index_type), Box::new(value_type)), // taking nullability from parent field field.nullable(), @@ -314,7 +314,7 @@ fn get_data_type_context(field: ipc::Field, may_be_dictionary: bool) -> DataType t => unimplemented!("Type {:?} not supported", t), }; - DataTypeContext::new(data_type, field.nullable()) + NullableDataType::new(data_type, field.nullable()) } pub(crate) struct FBFieldType<'b> { @@ -686,7 +686,7 @@ pub(crate) fn get_fb_dictionary<'a: 'b, 'b>( #[cfg(test)] mod tests { use super::*; - use crate::datatypes::{DataType, DataTypeContext, Field, Schema}; + use crate::datatypes::{DataType, Field, NullableDataType, Schema}; #[test] fn convert_schema_round_trip() { @@ -752,7 +752,7 @@ mod tests { Field::new("binary", DataType::Binary, false), Field::new( "list[u8]", - DataType::List(Box::new(DataTypeContext::new( + DataType::List(Box::new(NullableDataType::new( DataType::UInt8, false, ))), @@ -760,7 +760,7 @@ mod tests { ), Field::new( "list[struct]", - DataType::List(Box::new(DataTypeContext::new( + DataType::List(Box::new(NullableDataType::new( DataType::Struct(vec![ Field::new("float32", DataType::UInt8, false), Field::new("int32", DataType::Int32, true), @@ -776,7 +776,7 @@ mod tests { Field::new("int64", DataType::Int64, true), Field::new( "list[struct]>]", - DataType::List(Box::new(DataTypeContext::new( + DataType::List(Box::new(NullableDataType::new( DataType::Struct(vec![ Field::new( "date32", @@ -785,7 +785,7 @@ mod tests { ), Field::new( "list[struct<>]", - DataType::List(Box::new(DataTypeContext::new( + DataType::List(Box::new(NullableDataType::new( DataType::Struct(vec![]), false, ))), diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs index acf0c630f76..15f0717b2e1 100644 --- a/rust/arrow/src/json/reader.rs +++ b/rust/arrow/src/json/reader.rs @@ -64,16 +64,16 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { 1 => Ok(dt[0].clone()), 2 => { // there can be a case where a list and scalar both exist - if dt.contains(&&DataType::List(Box::new(DataTypeContext::new( + if dt.contains(&&DataType::List(Box::new(NullableDataType::new( DataType::Float64, true, - )))) || dt.contains(&&DataType::List(Box::new(DataTypeContext::new( + )))) || dt.contains(&&DataType::List(Box::new(NullableDataType::new( DataType::Int64, true, - )))) || dt.contains(&&DataType::List(Box::new(DataTypeContext::new( + )))) || dt.contains(&&DataType::List(Box::new(NullableDataType::new( DataType::Boolean, true, - )))) || dt.contains(&&DataType::List(Box::new(DataTypeContext::new( + )))) || dt.contains(&&DataType::List(Box::new(NullableDataType::new( DataType::Utf8, true, )))) { @@ -84,12 +84,12 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { match (dt[0], dt[1]) { (t1, DataType::List(e)) if e.data_type() == &DataType::Float64 => { if t1 == &DataType::Float64 { - Ok(DataType::List(Box::new(DataTypeContext::new( + Ok(DataType::List(Box::new(NullableDataType::new( DataType::Float64, true, )))) } else { - Ok(DataType::List(Box::new(DataTypeContext::new( + Ok(DataType::List(Box::new(NullableDataType::new( coerce_data_type(vec![t1, &DataType::Float64])?, true, )))) @@ -97,12 +97,12 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { } (t1, DataType::List(e)) if e.data_type() == &DataType::Int64 => { if t1 == &DataType::Int64 { - Ok(DataType::List(Box::new(DataTypeContext::new( + Ok(DataType::List(Box::new(NullableDataType::new( DataType::Int64, true, )))) } else { - Ok(DataType::List(Box::new(DataTypeContext::new( + Ok(DataType::List(Box::new(NullableDataType::new( coerce_data_type(vec![t1, &DataType::Int64])?, true, )))) @@ -110,12 +110,12 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { } (t1, DataType::List(e)) if e.data_type() == &DataType::Boolean => { if t1 == &DataType::Boolean { - Ok(DataType::List(Box::new(DataTypeContext::new( + Ok(DataType::List(Box::new(NullableDataType::new( DataType::Boolean, true, )))) } else { - Ok(DataType::List(Box::new(DataTypeContext::new( + Ok(DataType::List(Box::new(NullableDataType::new( coerce_data_type(vec![t1, &DataType::Boolean])?, true, )))) @@ -123,12 +123,12 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { } (t1, DataType::List(e)) if e.data_type() == &DataType::Utf8 => { if t1 == &DataType::Utf8 { - Ok(DataType::List(Box::new(DataTypeContext::new( + Ok(DataType::List(Box::new(NullableDataType::new( DataType::Utf8, true, )))) } else { - Ok(DataType::List(Box::new(DataTypeContext::new( + Ok(DataType::List(Box::new(NullableDataType::new( coerce_data_type(vec![t1, &DataType::Utf8])?, true, )))) @@ -148,7 +148,7 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result { _ => { // TODO(nevi_me) It's possible to have [float, int, list(float)], which should // return list(float). Will hash this out later - Ok(DataType::List(Box::new(DataTypeContext::new( + Ok(DataType::List(Box::new(NullableDataType::new( DataType::Utf8, true, )))) @@ -289,13 +289,13 @@ pub fn infer_json_schema( if values.contains_key(k) { let x = values.get_mut(k).unwrap(); x.insert(DataType::List(Box::new( - DataTypeContext::new(dt, true), + NullableDataType::new(dt, true), ))); } else { // create hashset and add value type let mut hs = HashSet::new(); hs.insert(DataType::List(Box::new( - DataTypeContext::new(dt, true), + NullableDataType::new(dt, true), ))); values.insert(k.to_string(), hs); } @@ -1373,12 +1373,12 @@ mod tests { assert_eq!(&DataType::Int64, a.1.data_type()); let b = schema.column_with_name("b").unwrap(); assert_eq!( - &DataType::List(Box::new(DataTypeContext::new(DataType::Float64, true))), + &DataType::List(Box::new(NullableDataType::new(DataType::Float64, true))), b.1.data_type() ); let c = schema.column_with_name("c").unwrap(); assert_eq!( - &DataType::List(Box::new(DataTypeContext::new(DataType::Boolean, true))), + &DataType::List(Box::new(NullableDataType::new(DataType::Boolean, true))), c.1.data_type() ); let d = schema.column_with_name("d").unwrap(); @@ -1431,35 +1431,35 @@ mod tests { use crate::datatypes::DataType::*; assert_eq!( - List(Box::new(DataTypeContext::new(Float64, true))), + List(Box::new(NullableDataType::new(Float64, true))), coerce_data_type(vec![ &Float64, - &List(Box::new(DataTypeContext::new(Float64, true))) + &List(Box::new(NullableDataType::new(Float64, true))) ]) .unwrap() ); assert_eq!( - List(Box::new(DataTypeContext::new(Float64, true))), + List(Box::new(NullableDataType::new(Float64, true))), coerce_data_type(vec![ &Float64, - &List(Box::new(DataTypeContext::new(Int64, true))) + &List(Box::new(NullableDataType::new(Int64, true))) ]) .unwrap() ); assert_eq!( - List(Box::new(DataTypeContext::new(Int64, true))), + List(Box::new(NullableDataType::new(Int64, true))), coerce_data_type(vec![ &Int64, - &List(Box::new(DataTypeContext::new(Int64, true))) + &List(Box::new(NullableDataType::new(Int64, true))) ]) .unwrap() ); // boolean and number are incompatible, return utf8 assert_eq!( - List(Box::new(DataTypeContext::new(Utf8, true))), + List(Box::new(NullableDataType::new(Utf8, true))), coerce_data_type(vec![ &Boolean, - &List(Box::new(DataTypeContext::new(Float64, true))) + &List(Box::new(NullableDataType::new(Float64, true))) ]) .unwrap() ); @@ -1492,17 +1492,17 @@ mod tests { assert_eq!(&DataType::Int64, a.1.data_type()); let b = schema.column_with_name("b").unwrap(); assert_eq!( - &DataType::List(Box::new(DataTypeContext::new(DataType::Float64, true))), + &DataType::List(Box::new(NullableDataType::new(DataType::Float64, true))), b.1.data_type() ); let c = schema.column_with_name("c").unwrap(); assert_eq!( - &DataType::List(Box::new(DataTypeContext::new(DataType::Boolean, true))), + &DataType::List(Box::new(NullableDataType::new(DataType::Boolean, true))), c.1.data_type() ); let d = schema.column_with_name("d").unwrap(); assert_eq!( - &DataType::List(Box::new(DataTypeContext::new(DataType::Utf8, true))), + &DataType::List(Box::new(NullableDataType::new(DataType::Utf8, true))), d.1.data_type() ); @@ -1694,7 +1694,7 @@ mod tests { fn test_list_of_string_dictionary_from_json() { let schema = Schema::new(vec![Field::new( "events", - List(Box::new(DataTypeContext::new( + List(Box::new(NullableDataType::new( Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), true, ))), @@ -1717,7 +1717,7 @@ mod tests { let events = schema.column_with_name("events").unwrap(); assert_eq!( - &List(Box::new(DataTypeContext::new( + &List(Box::new(NullableDataType::new( Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), true ))), @@ -1751,7 +1751,7 @@ mod tests { fn test_list_of_string_dictionary_from_json_with_nulls() { let schema = Schema::new(vec![Field::new( "events", - List(Box::new(DataTypeContext::new( + List(Box::new(NullableDataType::new( Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), true, ))), @@ -1776,7 +1776,7 @@ mod tests { let events = schema.column_with_name("events").unwrap(); assert_eq!( - &List(Box::new(DataTypeContext::new( + &List(Box::new(NullableDataType::new( Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), true ))), @@ -1917,17 +1917,17 @@ mod tests { Field::new("a", DataType::Int64, true), Field::new( "b", - DataType::List(Box::new(DataTypeContext::new(DataType::Float64, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Float64, true))), true, ), Field::new( "c", - DataType::List(Box::new(DataTypeContext::new(DataType::Boolean, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Boolean, true))), true, ), Field::new( "d", - DataType::List(Box::new(DataTypeContext::new(DataType::Utf8, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Utf8, true))), true, ), ]); diff --git a/rust/arrow/src/record_batch.rs b/rust/arrow/src/record_batch.rs index 92fe678b181..41cbd6dcfb0 100644 --- a/rust/arrow/src/record_batch.rs +++ b/rust/arrow/src/record_batch.rs @@ -302,7 +302,7 @@ mod tests { fn create_record_batch_with_matching_nested_type() { let schema = Schema::new(vec![Field::new( "list", - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), false, )]); @@ -319,7 +319,7 @@ mod tests { let offsets = UInt64Array::from(vec![0, 2, 4]); let array_data = Arc::new(ArrayData::new( - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), 3, None, None, diff --git a/rust/arrow/src/util/integration_util.rs b/rust/arrow/src/util/integration_util.rs index 9d64f5b775c..4e419968d7c 100644 --- a/rust/arrow/src/util/integration_util.rs +++ b/rust/arrow/src/util/integration_util.rs @@ -688,7 +688,7 @@ mod tests { Field::new("c3", DataType::Utf8, true), Field::new( "c4", - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, false))), + DataType::List(Box::new(NullableDataType::new(DataType::Int32, false))), true, ), ]); @@ -758,7 +758,7 @@ mod tests { Field::new("utf8s", DataType::Utf8, true), Field::new( "lists", - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), true, ), Field::new( @@ -835,7 +835,7 @@ mod tests { let value_data = Int32Array::from(vec![None, Some(2), None, None]); let value_offsets = Buffer::from(&[0, 3, 4, 4].to_byte_slice()); let list_data_type = - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))); + DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/rust/datafusion/src/physical_plan/distinct_expressions.rs b/rust/datafusion/src/physical_plan/distinct_expressions.rs index b7929a8456e..a441c0b76c2 100644 --- a/rust/datafusion/src/physical_plan/distinct_expressions.rs +++ b/rust/datafusion/src/physical_plan/distinct_expressions.rs @@ -22,7 +22,7 @@ use std::fmt::Debug; use std::hash::Hash; use std::sync::Arc; -use arrow::datatypes::{DataType, DataTypeContext, Field}; +use arrow::datatypes::{DataType, Field, NullableDataType}; use ahash::RandomState; use std::collections::HashSet; @@ -81,7 +81,7 @@ impl AggregateExpr for DistinctCount { .map(|data_type| { Field::new( &format_state_name(&self.name, "count distinct"), - DataType::List(Box::new(DataTypeContext::new( + DataType::List(Box::new(NullableDataType::new( data_type.clone(), true, ))), diff --git a/rust/datafusion/src/physical_plan/functions.rs b/rust/datafusion/src/physical_plan/functions.rs index d9367de02ab..e81615fe906 100644 --- a/rust/datafusion/src/physical_plan/functions.rs +++ b/rust/datafusion/src/physical_plan/functions.rs @@ -38,7 +38,7 @@ use crate::physical_plan::array_expressions; use crate::physical_plan::datetime_expressions; use crate::physical_plan::math_expressions; use crate::physical_plan::string_expressions; -use arrow::datatypes::DataTypeContext; +use arrow::datatypes::NullableDataType; use arrow::{ array::ArrayRef, compute::kernels::length::length, @@ -204,7 +204,7 @@ pub fn return_type( Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) } BuiltinScalarFunction::Array => Ok(DataType::FixedSizeList( - Box::new(DataTypeContext::new(arg_types[0].clone(), true)), + Box::new(NullableDataType::new(arg_types[0].clone(), true)), arg_types.len() as i32, )), _ => Ok(DataType::Float64), @@ -473,7 +473,7 @@ mod tests { expr.data_type(&schema)?, // type equals to a common coercion DataType::FixedSizeList( - Box::new(DataTypeContext::new(expected_type, true)), + Box::new(NullableDataType::new(expected_type, true)), 2 ) ); diff --git a/rust/datafusion/src/physical_plan/planner.rs b/rust/datafusion/src/physical_plan/planner.rs index 036da6624eb..aa1fc2ba25e 100644 --- a/rust/datafusion/src/physical_plan/planner.rs +++ b/rust/datafusion/src/physical_plan/planner.rs @@ -737,7 +737,7 @@ mod tests { }; let plan = planner.create_physical_plan(&logical_plan, &ctx_state); - let expected_error = "Extension planner for NoOp created an ExecutionPlan with mismatched schema. LogicalPlan schema: Schema { fields: [Field { name: \"a\", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false }], metadata: {} }, ExecutionPlan schema: Schema { fields: [Field { name: \"b\", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false }], metadata: {} }"; + let expected_error = "Extension planner for NoOp created an ExecutionPlan with mismatched schema. LogicalPlan schema: Schema { fields: [Field { name: \"a\", data_type: NullableDataType { data_type: Int32, nullable: false }, dict_id: 0, dict_is_ordered: false }], metadata: {} }, ExecutionPlan schema: Schema { fields: [Field { name: \"b\", data_type: NullableDataType { data_type: Int32, nullable: false }, dict_id: 0, dict_is_ordered: false }], metadata: {} }"; match plan { Ok(_) => assert!(false, "Expected planning failure"), diff --git a/rust/datafusion/src/scalar.rs b/rust/datafusion/src/scalar.rs index 943a0e893af..c64f1a2d02d 100644 --- a/rust/datafusion/src/scalar.rs +++ b/rust/datafusion/src/scalar.rs @@ -34,7 +34,7 @@ use arrow::{ }; use crate::error::{DataFusionError, Result}; -use arrow::datatypes::DataTypeContext; +use arrow::datatypes::NullableDataType; /// Represents a dynamically typed, nullable single value. /// This is the single-valued counter-part of arrow’s `Array`. @@ -134,7 +134,7 @@ impl ScalarValue { ScalarValue::Utf8(_) => DataType::Utf8, ScalarValue::LargeUtf8(_) => DataType::LargeUtf8, ScalarValue::List(_, data_type) => { - DataType::List(Box::new(DataTypeContext::new(data_type.clone(), true))) + DataType::List(Box::new(NullableDataType::new(data_type.clone(), true))) } ScalarValue::Date32(_) => DataType::Date32(DateUnit::Day), } diff --git a/rust/datafusion/tests/sql.rs b/rust/datafusion/tests/sql.rs index 480548b773e..e63b4b8972f 100644 --- a/rust/datafusion/tests/sql.rs +++ b/rust/datafusion/tests/sql.rs @@ -25,7 +25,7 @@ extern crate datafusion; use arrow::{array::*, datatypes::TimeUnit}; use arrow::{datatypes::Int32Type, datatypes::Int64Type, record_batch::RecordBatch}; use arrow::{ - datatypes::{DataType, DataTypeContext, Field, Schema, SchemaRef}, + datatypes::{DataType, Field, NullableDataType, Schema, SchemaRef}, util::display::array_value_to_string, }; @@ -142,12 +142,12 @@ async fn parquet_list_columns() { let schema = Arc::new(Schema::new(vec![ Field::new( "int64_list", - DataType::List(Box::new(DataTypeContext::new(DataType::Int64, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Int64, true))), true, ), Field::new( "utf8_list", - DataType::List(Box::new(DataTypeContext::new(DataType::Utf8, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Utf8, true))), true, ), ])); diff --git a/rust/parquet/src/arrow/array_reader.rs b/rust/parquet/src/arrow/array_reader.rs index 7bc0b55bd39..eeb71b0dc12 100644 --- a/rust/parquet/src/arrow/array_reader.rs +++ b/rust/parquet/src/arrow/array_reader.rs @@ -33,14 +33,14 @@ use arrow::array::{ use arrow::buffer::{Buffer, MutableBuffer}; use arrow::datatypes::{ ArrowPrimitiveType, BooleanType as ArrowBooleanType, DataType as ArrowType, - DataTypeContext, Date32Type as ArrowDate32Type, Date64Type as ArrowDate64Type, + Date32Type as ArrowDate32Type, Date64Type as ArrowDate64Type, DurationMicrosecondType as ArrowDurationMicrosecondType, DurationMillisecondType as ArrowDurationMillisecondType, DurationNanosecondType as ArrowDurationNanosecondType, DurationSecondType as ArrowDurationSecondType, Field, Float32Type as ArrowFloat32Type, Float64Type as ArrowFloat64Type, Int16Type as ArrowInt16Type, Int32Type as ArrowInt32Type, - Int64Type as ArrowInt64Type, Int8Type as ArrowInt8Type, Schema, + Int64Type as ArrowInt64Type, Int8Type as ArrowInt8Type, NullableDataType, Schema, Time32MillisecondType as ArrowTime32MillisecondType, Time32SecondType as ArrowTime32SecondType, Time64MicrosecondType as ArrowTime64MicrosecondType, @@ -1347,7 +1347,7 @@ impl<'a> TypeVisitor>, &'a ArrayReaderBuilderContext .ok() .map(|f| f.data_type().to_owned()) .unwrap_or_else(|| { - ArrowType::List(Box::new(DataTypeContext::new( + ArrowType::List(Box::new(NullableDataType::new( item_reader_type.clone(), list_type.is_optional(), ))) @@ -1626,9 +1626,9 @@ mod tests { StructArray, }; use arrow::datatypes::{ - ArrowPrimitiveType, DataType as ArrowType, DataTypeContext, - Date32Type as ArrowDate32, Field, Int32Type as ArrowInt32, - Int64Type as ArrowInt64, Time32MillisecondType as ArrowTime32MillisecondArray, + ArrowPrimitiveType, DataType as ArrowType, Date32Type as ArrowDate32, Field, + Int32Type as ArrowInt32, Int64Type as ArrowInt64, NullableDataType, + Time32MillisecondType as ArrowTime32MillisecondArray, Time64MicrosecondType as ArrowTime64MicrosecondArray, TimestampMicrosecondType as ArrowTimestampMicrosecondType, TimestampMillisecondType as ArrowTimestampMillisecondType, @@ -2310,7 +2310,7 @@ mod tests { let mut list_array_reader = ListArrayReader::::new( Box::new(item_array_reader), - ArrowType::List(Box::new(DataTypeContext::new(ArrowType::Int32, false))), + ArrowType::List(Box::new(NullableDataType::new(ArrowType::Int32, false))), ArrowType::Int32, 1, 1, @@ -2364,7 +2364,7 @@ mod tests { let mut list_array_reader = ListArrayReader::::new( Box::new(item_array_reader), - ArrowType::LargeList(Box::new(DataTypeContext::new(ArrowType::Int32, true))), + ArrowType::LargeList(Box::new(NullableDataType::new(ArrowType::Int32, true))), ArrowType::Int32, 1, 1, diff --git a/rust/parquet/src/arrow/arrow_writer.rs b/rust/parquet/src/arrow/arrow_writer.rs index 0e3320355a8..b97ebcb8997 100644 --- a/rust/parquet/src/arrow/arrow_writer.rs +++ b/rust/parquet/src/arrow/arrow_writer.rs @@ -674,7 +674,7 @@ mod tests { use arrow::array::*; use arrow::datatypes::{DataType, Field, Schema, UInt32Type, UInt8Type}; - use arrow::datatypes::{DataTypeContext, ToByteSlice}; + use arrow::datatypes::{NullableDataType, ToByteSlice}; use arrow::record_batch::RecordBatch; use crate::arrow::{ArrowReader, ParquetFileArrowReader}; @@ -712,7 +712,7 @@ mod tests { // define schema let schema = Schema::new(vec![Field::new( "a", - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), false, )]); @@ -726,7 +726,7 @@ mod tests { // Construct a list array from the above two let a_list_data = ArrayData::builder(DataType::List(Box::new( - DataTypeContext::new(DataType::Int32, true), + NullableDataType::new(DataType::Int32, true), ))) .len(5) .add_buffer(a_value_offsets) @@ -810,7 +810,7 @@ mod tests { let struct_field_f = Field::new("f", DataType::Float32, true); let struct_field_g = Field::new( "g", - DataType::List(Box::new(DataTypeContext::new(DataType::Int16, false))), + DataType::List(Box::new(NullableDataType::new(DataType::Int16, false))), false, ); let struct_field_e = Field::new( @@ -1235,7 +1235,7 @@ mod tests { let a_value_offsets = arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); let a_list_data = ArrayData::builder(DataType::List(Box::new( - DataTypeContext::new(DataType::Int32, true), + NullableDataType::new(DataType::Int32, true), ))) .len(5) .add_buffer(a_value_offsets) @@ -1258,7 +1258,7 @@ mod tests { let a_value_offsets = arrow::buffer::Buffer::from(&[0i64, 1, 3, 3, 6, 10].to_byte_slice()); let a_list_data = ArrayData::builder(DataType::LargeList(Box::new( - DataTypeContext::new(DataType::Int32, true), + NullableDataType::new(DataType::Int32, true), ))) .len(5) .add_buffer(a_value_offsets) diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs index 5fe76ea23d4..b0c3564ecb1 100644 --- a/rust/parquet/src/arrow/schema.rs +++ b/rust/parquet/src/arrow/schema.rs @@ -26,7 +26,7 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use arrow::datatypes::{DataType, DataTypeContext, DateUnit, Field, Schema, TimeUnit}; +use arrow::datatypes::{DataType, DateUnit, Field, NullableDataType, Schema, TimeUnit}; use arrow::ipc::writer; use crate::basic::{LogicalType, Repetition, Type as PhysicalType}; @@ -545,7 +545,7 @@ impl ParquetTypeConverter<'_> { if self.is_self_included() { self.to_primitive_type_inner().map(|dt| { if self.is_repeated() { - Some(DataType::List(Box::new(DataTypeContext::new( + Some(DataType::List(Box::new(NullableDataType::new( dt, self.is_nullable(), )))) @@ -644,7 +644,10 @@ impl ParquetTypeConverter<'_> { if self.is_repeated() { self.to_struct().map(|opt| { opt.map(|dt| { - DataType::List(Box::new(DataTypeContext::new(dt, self.is_nullable()))) + DataType::List(Box::new(NullableDataType::new( + dt, + self.is_nullable(), + ))) }) }) } else { @@ -733,7 +736,7 @@ impl ParquetTypeConverter<'_> { item_type.map(|opt| { opt.map(|dt| { - DataType::List(Box::new(DataTypeContext::new( + DataType::List(Box::new(NullableDataType::new( dt, list_item.is_optional(), ))) @@ -754,7 +757,7 @@ mod tests { use std::{collections::HashMap, convert::TryFrom, sync::Arc}; use arrow::datatypes::{ - DataType, DataTypeContext, DateUnit, Field, IntervalUnit, TimeUnit, + DataType, DateUnit, Field, IntervalUnit, NullableDataType, TimeUnit, }; use crate::file::{metadata::KeyValue, reader::SerializedFileReader}; @@ -914,7 +917,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(DataTypeContext::new(DataType::Utf8, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Utf8, true))), false, )); } @@ -928,7 +931,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(DataTypeContext::new(DataType::Utf8, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Utf8, true))), true, )); } @@ -947,10 +950,10 @@ mod tests { // } { let arrow_inner_list = - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))); + DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))); arrow_fields.push(Field::new( "array_of_arrays", - DataType::List(Box::new(DataTypeContext::new(arrow_inner_list, true))), + DataType::List(Box::new(NullableDataType::new(arrow_inner_list, true))), true, )); } @@ -964,7 +967,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(DataTypeContext::new(DataType::Utf8, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Utf8, true))), true, )); } @@ -976,7 +979,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), true, )); } @@ -995,7 +998,7 @@ mod tests { ]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(DataTypeContext::new(arrow_struct, true))), + DataType::List(Box::new(NullableDataType::new(arrow_struct, true))), true, )); } @@ -1012,7 +1015,7 @@ mod tests { DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(DataTypeContext::new(arrow_struct, true))), + DataType::List(Box::new(NullableDataType::new(arrow_struct, true))), true, )); } @@ -1029,7 +1032,7 @@ mod tests { DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(DataTypeContext::new(arrow_struct, true))), + DataType::List(Box::new(NullableDataType::new(arrow_struct, true))), true, )); } @@ -1039,7 +1042,7 @@ mod tests { { arrow_fields.push(Field::new( "name", - DataType::List(Box::new(DataTypeContext::new(DataType::Int32, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Int32, true))), true, )); } @@ -1205,7 +1208,7 @@ mod tests { let inner_group_list = Field::new( "innerGroup", - DataType::List(Box::new(DataTypeContext::new( + DataType::List(Box::new(NullableDataType::new( DataType::Struct(vec![Field::new("leaf3", DataType::Int32, true)]), true, ))), @@ -1214,7 +1217,7 @@ mod tests { let outer_group_list = Field::new( "outerGroup", - DataType::List(Box::new(DataTypeContext::new( + DataType::List(Box::new(NullableDataType::new( DataType::Struct(vec![ Field::new("leaf2", DataType::Int32, true), inner_group_list, @@ -1290,7 +1293,7 @@ mod tests { Field::new("string", DataType::Utf8, true), Field::new( "bools", - DataType::List(Box::new(DataTypeContext::new(DataType::Boolean, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Boolean, true))), true, ), Field::new("date", DataType::Date32(DateUnit::Day), true), @@ -1360,7 +1363,7 @@ mod tests { Field::new("string", DataType::Utf8, true), Field::new( "bools", - DataType::List(Box::new(DataTypeContext::new(DataType::Boolean, true))), + DataType::List(Box::new(NullableDataType::new(DataType::Boolean, true))), true, ), Field::new("date", DataType::Date32(DateUnit::Day), true), @@ -1383,7 +1386,7 @@ mod tests { Field::new("uint32", DataType::UInt32, false), Field::new( "int32", - DataType::List(Box::new(DataTypeContext::new( + DataType::List(Box::new(NullableDataType::new( DataType::Int32, true, ))), @@ -1496,7 +1499,7 @@ mod tests { Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), Field::new( "c21", - DataType::List(Box::new(DataTypeContext::new( + DataType::List(Box::new(NullableDataType::new( DataType::Boolean, true, ))), @@ -1594,7 +1597,7 @@ mod tests { vec![ Field::new( "c21", - DataType::List(Box::new(DataTypeContext::new( + DataType::List(Box::new(NullableDataType::new( DataType::Boolean, true, ))), @@ -1603,15 +1606,15 @@ mod tests { Field::new( "c22", DataType::FixedSizeList( - Box::new(DataTypeContext::new(DataType::Boolean, false)), + Box::new(NullableDataType::new(DataType::Boolean, false)), 5, ), false, ), Field::new( "c23", - DataType::List(Box::new(DataTypeContext::new( - DataType::LargeList(Box::new(DataTypeContext::new( + DataType::List(Box::new(NullableDataType::new( + DataType::LargeList(Box::new(NullableDataType::new( DataType::Struct(vec![ Field::new("a", DataType::Int16, true), Field::new("b", DataType::Float64, false), From 8264a667dffb623b7d9778bdd73ca104127ee54f Mon Sep 17 00:00:00 2001 From: Christoph Schulze Date: Tue, 24 Nov 2020 18:12:06 +0100 Subject: [PATCH 11/15] More renaming --- rust/arrow/src/datatypes.rs | 36 +++++----- rust/arrow/src/ipc/convert.rs | 66 +++++++++++-------- rust/arrow/src/ipc/reader.rs | 8 +-- .../src/bin/arrow-json-integration-test.rs | 18 ++--- rust/parquet/src/arrow/schema.rs | 10 +-- 5 files changed, 75 insertions(+), 63 deletions(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index dc7ae77f344..cecd96bea83 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -882,7 +882,7 @@ impl ToByteSlice for T { impl DataType { /// Parse a data type from a JSON representation pub(crate) fn from(json: &Value) -> Result { - let default_dt_ctx = NullableDataType::new(DataType::Boolean, true); + let default_data_type = NullableDataType::new(DataType::Boolean, true); match *json { Value::Object(ref map) => match map.get("name") { Some(s) if s == "null" => Ok(DataType::Null), @@ -1016,17 +1016,17 @@ impl DataType { }, Some(s) if s == "list" => { // return a list with any type as its child isn't defined in the map - Ok(DataType::List(Box::new(default_dt_ctx))) + Ok(DataType::List(Box::new(default_data_type))) } Some(s) if s == "largelist" => { // return a largelist with any type as its child isn't defined in the map - Ok(DataType::LargeList(Box::new(default_dt_ctx))) + Ok(DataType::LargeList(Box::new(default_data_type))) } Some(s) if s == "fixedsizelist" => { // return a list with any type as its child isn't defined in the map if let Some(Value::Number(size)) = map.get("listSize") { Ok(DataType::FixedSizeList( - Box::new(default_dt_ctx), + Box::new(default_data_type), size.as_i64().unwrap() as i32, )) } else { @@ -1156,7 +1156,7 @@ impl DataType { } impl NullableDataType { - /// Creates a new data type context + /// Creates a new nullable data type pub fn new(data_type: DataType, nullable: bool) -> Self { NullableDataType { data_type, @@ -1274,20 +1274,20 @@ impl Field { )); } let nested_field = Self::from(&values[0])?; - let nexted_dt_ctx = NullableDataType::new( + let nexted_data_type = NullableDataType::new( nested_field.data_type.data_type, nested_field.data_type.nullable, ); match data_type { DataType::List(_) => DataType::List(Box::new( - nexted_dt_ctx, + nexted_data_type, )), DataType::LargeList(_) => DataType::LargeList(Box::new( - nexted_dt_ctx, + nexted_data_type, )), DataType::FixedSizeList(_, int) => { DataType::FixedSizeList( - Box::new(nexted_dt_ctx), + Box::new(nexted_data_type), int, ) } @@ -1378,27 +1378,27 @@ impl Field { pub fn to_json(&self) -> Value { let children: Vec = match self.data_type() { DataType::Struct(fields) => fields.iter().map(|f| f.to_json()).collect(), - DataType::List(type_ctx) => { + DataType::List(data_type) => { let item = Field::new( "item", - type_ctx.data_type().clone(), - type_ctx.is_nullable(), + data_type.data_type().clone(), + data_type.is_nullable(), ); vec![item.to_json()] } - DataType::LargeList(type_ctx) => { + DataType::LargeList(data_type) => { let item = Field::new( "item", - type_ctx.data_type().clone(), - type_ctx.is_nullable(), + data_type.data_type().clone(), + data_type.is_nullable(), ); vec![item.to_json()] } - DataType::FixedSizeList(type_ctx, _) => { + DataType::FixedSizeList(data_type, _) => { let item = Field::new( "item", - type_ctx.data_type().clone(), - type_ctx.is_nullable(), + data_type.data_type().clone(), + data_type.is_nullable(), ); vec![item.to_json()] } diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index 460c1b9c29c..14d7f2f1d66 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -152,12 +152,15 @@ pub fn schema_from_bytes(bytes: &[u8]) -> Option { /// Get the Arrow data type from the flatbuffer Field table pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataType { - get_data_type_context(field, may_be_dictionary) + get_nullable_data_type(field, may_be_dictionary) .data_type() .clone() } -fn get_data_type_context(field: ipc::Field, may_be_dictionary: bool) -> NullableDataType { +fn get_nullable_data_type( + field: ipc::Field, + may_be_dictionary: bool, +) -> NullableDataType { if let Some(dictionary) = field.dictionary() { if may_be_dictionary { let int = dictionary.indexType().unwrap(); @@ -172,7 +175,7 @@ fn get_data_type_context(field: ipc::Field, may_be_dictionary: bool) -> Nullable (64, false) => DataType::UInt64, _ => panic!("Unexpected bitwidth and signed"), }; - let value_type = get_data_type_context(field, false).data_type().clone(); + let value_type = get_nullable_data_type(field, false).data_type().clone(); return NullableDataType::new( DataType::Dictionary(Box::new(index_type), Box::new(value_type)), // taking nullability from parent field @@ -279,7 +282,7 @@ fn get_data_type_context(field: ipc::Field, may_be_dictionary: bool) -> Nullable panic!("expect a list to have one child") } let child_field = children.get(0); - DataType::List(Box::new(get_data_type_context(child_field, false))) + DataType::List(Box::new(get_nullable_data_type(child_field, false))) } ipc::Type::LargeList => { let children = field.children().unwrap(); @@ -287,7 +290,7 @@ fn get_data_type_context(field: ipc::Field, may_be_dictionary: bool) -> Nullable panic!("expect a large list to have one child") } let child_field = children.get(0); - DataType::LargeList(Box::new(get_data_type_context(child_field, false))) + DataType::LargeList(Box::new(get_nullable_data_type(child_field, false))) } ipc::Type::FixedSizeList => { let children = field.children().unwrap(); @@ -297,7 +300,7 @@ fn get_data_type_context(field: ipc::Field, may_be_dictionary: bool) -> Nullable let fsl = field.type_as_fixed_size_list().unwrap(); let child_field = children.get(0); DataType::FixedSizeList( - Box::new(get_data_type_context(child_field, false)), + Box::new(get_nullable_data_type(child_field, false)), fsl.listSize(), ) } @@ -547,17 +550,20 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>( children: Some(fbb.create_vector(&empty_fields[..])), } } - List(ref type_ctx) => { - let nested_type = - get_fb_field_type(type_ctx.data_type(), type_ctx.is_nullable(), fbb); + List(ref nested_type) => { + let field_type = get_fb_field_type( + nested_type.data_type(), + nested_type.is_nullable(), + fbb, + ); let child = ipc::Field::create( fbb, &ipc::FieldArgs { name: None, - nullable: type_ctx.is_nullable(), - type_type: nested_type.type_type, - type_: Some(nested_type.type_), - children: nested_type.children, + nullable: nested_type.is_nullable(), + type_type: field_type.type_type, + type_: Some(field_type.type_), + children: field_type.children, dictionary: None, custom_metadata: None, }, @@ -568,18 +574,21 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>( children: Some(fbb.create_vector(&[child])), } } - LargeList(ref type_ctx) => { - let inner_types = - get_fb_field_type(type_ctx.data_type(), type_ctx.is_nullable(), fbb); + LargeList(ref nested_type) => { + let field_type = get_fb_field_type( + nested_type.data_type(), + nested_type.is_nullable(), + fbb, + ); let child = ipc::Field::create( fbb, &ipc::FieldArgs { name: None, - nullable: type_ctx.is_nullable(), - type_type: inner_types.type_type, - type_: Some(inner_types.type_), + nullable: nested_type.is_nullable(), + type_type: field_type.type_type, + type_: Some(field_type.type_), dictionary: None, - children: inner_types.children, + children: field_type.children, custom_metadata: None, }, ); @@ -589,18 +598,21 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>( children: Some(fbb.create_vector(&[child])), } } - FixedSizeList(ref type_ctx, len) => { - let inner_types = - get_fb_field_type(type_ctx.data_type(), type_ctx.is_nullable(), fbb); + FixedSizeList(ref nested_type, len) => { + let field_type = get_fb_field_type( + nested_type.data_type(), + nested_type.is_nullable(), + fbb, + ); let child = ipc::Field::create( fbb, &ipc::FieldArgs { name: None, - nullable: type_ctx.is_nullable(), - type_type: inner_types.type_type, - type_: Some(inner_types.type_), + nullable: nested_type.is_nullable(), + type_type: field_type.type_type, + type_: Some(field_type.type_), dictionary: None, - children: inner_types.children, + children: field_type.children, custom_metadata: None, }, ); diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs index d5a929f066a..b09c22dcbe4 100644 --- a/rust/arrow/src/ipc/reader.rs +++ b/rust/arrow/src/ipc/reader.rs @@ -89,7 +89,7 @@ fn create_array( buffer_index += 2; array } - List(ref type_ctx) | LargeList(ref type_ctx) => { + List(ref nested_type) | LargeList(ref nested_type) => { let list_node = &nodes[node_index]; let list_buffers: Vec = buffers[buffer_index..buffer_index + 2] .iter() @@ -99,7 +99,7 @@ fn create_array( buffer_index += 2; let triple = create_array( nodes, - type_ctx.data_type(), + nested_type.data_type(), data, buffers, dictionaries, @@ -111,7 +111,7 @@ fn create_array( create_list_array(list_node, data_type, &list_buffers[..], triple.0) } - FixedSizeList(ref list_field, _) => { + FixedSizeList(ref nested_type, _) => { let list_node = &nodes[node_index]; let list_buffers: Vec = buffers[buffer_index..=buffer_index] .iter() @@ -121,7 +121,7 @@ fn create_array( buffer_index += 1; let triple = create_array( nodes, - list_field.data_type(), + nested_type.data_type(), data, buffers, dictionaries, diff --git a/rust/integration-testing/src/bin/arrow-json-integration-test.rs b/rust/integration-testing/src/bin/arrow-json-integration-test.rs index 26f888b74a9..ff43e0b7dc7 100644 --- a/rust/integration-testing/src/bin/arrow-json-integration-test.rs +++ b/rust/integration-testing/src/bin/arrow-json-integration-test.rs @@ -418,13 +418,13 @@ fn array_from_json( } Ok(Arc::new(b.finish())) } - DataType::List(type_ctx) => { + DataType::List(data_type) => { let null_buf = create_null_buf(&json_col); let children = json_col.children.clone().unwrap(); let child_field = Field::new( "element", - type_ctx.data_type().clone(), - type_ctx.is_nullable(), + data_type.data_type().clone(), + data_type.is_nullable(), ); let child_array = array_from_json( &child_field, @@ -446,13 +446,13 @@ fn array_from_json( .build(); Ok(Arc::new(ListArray::from(list_data))) } - DataType::LargeList(type_ctx) => { + DataType::LargeList(data_type) => { let null_buf = create_null_buf(&json_col); let children = json_col.children.clone().unwrap(); let child_field = Field::new( "element", - type_ctx.data_type().clone(), - type_ctx.is_nullable(), + data_type.data_type().clone(), + data_type.is_nullable(), ); let child_array = array_from_json( &child_field, @@ -478,12 +478,12 @@ fn array_from_json( .build(); Ok(Arc::new(LargeListArray::from(list_data))) } - DataType::FixedSizeList(type_ctx, _) => { + DataType::FixedSizeList(data_type, _) => { let children = json_col.children.clone().unwrap(); let child_field = Field::new( "element", - type_ctx.data_type().clone(), - type_ctx.is_nullable(), + data_type.data_type().clone(), + data_type.is_nullable(), ); let child_array = array_from_json( &child_field, diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs index b0c3564ecb1..a5cccbef59d 100644 --- a/rust/parquet/src/arrow/schema.rs +++ b/rust/parquet/src/arrow/schema.rs @@ -412,16 +412,16 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_repetition(repetition) .build() } - DataType::List(type_ctx) - | DataType::FixedSizeList(type_ctx, _) - | DataType::LargeList(type_ctx) => Type::group_type_builder(name) + DataType::List(data_type) + | DataType::FixedSizeList(data_type, _) + | DataType::LargeList(data_type) => Type::group_type_builder(name) .with_fields(&mut vec![Arc::new( Type::group_type_builder("list") .with_fields(&mut vec![Arc::new({ let list_field = Field::new( "element", - type_ctx.data_type().clone(), - type_ctx.is_nullable(), + data_type.data_type().clone(), + data_type.is_nullable(), ); arrow_to_parquet_type(&list_field)? })]) From 28829a2bb8f4563d42d793116bc78a0441458028 Mon Sep 17 00:00:00 2001 From: Christoph Schulze Date: Mon, 30 Nov 2020 18:37:58 +0100 Subject: [PATCH 12/15] Update rust/arrow/src/datatypes.rs Co-authored-by: Andrew Lamb --- rust/arrow/src/datatypes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index dc7ae77f344..af8c688898f 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1156,7 +1156,7 @@ impl DataType { } impl NullableDataType { - /// Creates a new data type context + /// Creates a new `NullableDataType` pub fn new(data_type: DataType, nullable: bool) -> Self { NullableDataType { data_type, From 400eeef2145d50d7ef8197cc111f0db2ecb6b3c2 Mon Sep 17 00:00:00 2001 From: Christoph Schulze Date: Mon, 30 Nov 2020 23:26:55 +0100 Subject: [PATCH 13/15] Manual serialization and deserialization of fields --- rust/arrow/src/datatypes.rs | 107 +++++++++++++++++++++++++++++++++--- 1 file changed, 100 insertions(+), 7 deletions(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 13f148f30df..acb814bece7 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -41,6 +41,9 @@ use serde_json::{ use crate::error::{ArrowError, Result}; use crate::util::bit_util; +use serde::de::{MapAccess, Visitor}; +use serde::ser::SerializeStruct; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; /// The set of datatypes that are supported by this implementation of Apache Arrow. /// @@ -193,7 +196,7 @@ pub enum IntervalUnit { /// Contains the meta-data for a single relative type. /// /// The `Schema` object is an ordered collection of `Field` objects. -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct Field { name: String, data_type: NullableDataType, @@ -1546,6 +1549,96 @@ impl fmt::Display for Field { } } +impl Serialize for Field { + fn serialize(&self, serializer: S) -> core::result::Result + where + S: Serializer, + { + // 5 is the number of fields in the struct. + let mut state = serializer.serialize_struct("Field", 5)?; + state.serialize_field("name", &self.name)?; + state.serialize_field("data_type", &self.data_type.data_type)?; + state.serialize_field("nullable", &self.data_type.nullable)?; + state.serialize_field("dict_id", &self.dict_id)?; + state.serialize_field("dict_is_ordered", &self.dict_is_ordered)?; + state.end() + } +} + +impl<'de> Deserialize<'de> for Field { + fn deserialize(deserializer: D) -> core::result::Result + where + D: Deserializer<'de>, + { + const FIELDS: &'static [&'static str] = &[ + "name", + "data_type", + "nullable", + "dict_id", + "dict_is_ordered", + ]; + struct FieldVisitor; + impl<'de> Visitor<'de> for FieldVisitor { + type Value = Field; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("`name`, `data_type` and `nullable` fields") + } + + fn visit_map( + self, + mut map: V, + ) -> core::result::Result + where + V: MapAccess<'de>, + { + let mut name: Option<&str> = None; + let mut data_type: Option = None; + let mut nullable: Option = None; + let mut dict_id: Option = None; + let mut dict_is_ordered: Option = None; + + while let Some(key) = map.next_key::<&str>()? { + match key { + "name" => name = Some(map.next_value::<&str>()?), + "data_type" => data_type = Some(map.next_value::()?), + "nullable" => nullable = Some(map.next_value::()?), + "dict_id" => dict_id = Some(map.next_value::()?), + "dict_is_ordered" => { + dict_is_ordered = Some(map.next_value::()?) + } + _ => Err(serde::de::Error::unknown_field(key, FIELDS))?, + }; + } + + let name = name.ok_or_else(|| serde::de::Error::missing_field("name"))?; + let data_type = data_type + .ok_or_else(|| serde::de::Error::missing_field("data_type"))?; + let nullable = nullable + .ok_or_else(|| serde::de::Error::missing_field("nullable"))?; + + if dict_id.is_some() { + let dict_id = dict_id.unwrap(); + let dict_is_ordered = dict_is_ordered.ok_or_else(|| { + serde::de::Error::missing_field("dict_is_ordered") + })?; + return Ok(Field::new_dict( + name, + data_type, + nullable, + dict_id, + dict_is_ordered, + )); + } + + Ok(Field::new(name, data_type, nullable)) + } + } + + Ok(deserializer.deserialize_struct("Field", FIELDS, FieldVisitor)?) + } +} + /// Describes the meta-data of an ordered sequence of relative types. /// /// Note that this information is only part of the meta-data and not part of the physical @@ -1862,12 +1955,12 @@ mod tests { assert_eq!( "{\"Struct\":[\ - {\"name\":\"first_name\",\"data_type\":{\"data_type\":\"Utf8\",\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false},\ - {\"name\":\"last_name\",\"data_type\":{\"data_type\":\"Utf8\",\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false},\ - {\"name\":\"address\",\"data_type\":{\"data_type\":{\"Struct\":\ - [{\"name\":\"street\",\"data_type\":{\"data_type\":\"Utf8\",\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false},\ - {\"name\":\"zip\",\"data_type\":{\"data_type\":\"UInt16\",\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false}\ - ]},\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false}]}", + {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ + {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ + {\"name\":\"address\",\"data_type\":{\"Struct\":\ + [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ + {\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false}\ + ]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false}]}", serialized ); From 6228f04fdcf244d1cc20b0eaf87bcd4744730a91 Mon Sep 17 00:00:00 2001 From: Christoph Schulze Date: Thu, 3 Dec 2020 15:49:16 +0100 Subject: [PATCH 14/15] fix clippy errors --- rust/arrow/src/datatypes.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index acb814bece7..1e8619a1dd4 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1570,7 +1570,7 @@ impl<'de> Deserialize<'de> for Field { where D: Deserializer<'de>, { - const FIELDS: &'static [&'static str] = &[ + const FIELDS: &[&str] = &[ "name", "data_type", "nullable", @@ -1607,7 +1607,7 @@ impl<'de> Deserialize<'de> for Field { "dict_is_ordered" => { dict_is_ordered = Some(map.next_value::()?) } - _ => Err(serde::de::Error::unknown_field(key, FIELDS))?, + _ => return Err(serde::de::Error::unknown_field(key, FIELDS)), }; } @@ -1617,8 +1617,7 @@ impl<'de> Deserialize<'de> for Field { let nullable = nullable .ok_or_else(|| serde::de::Error::missing_field("nullable"))?; - if dict_id.is_some() { - let dict_id = dict_id.unwrap(); + if let Some(dict_id) = dict_id { let dict_is_ordered = dict_is_ordered.ok_or_else(|| { serde::de::Error::missing_field("dict_is_ordered") })?; From 2cccd08f8c12e626bb58ed55af9d67955dd702ff Mon Sep 17 00:00:00 2001 From: Christoph Schulze Date: Thu, 3 Dec 2020 17:19:36 +0100 Subject: [PATCH 15/15] Fixes after merging --- rust/arrow/src/datatypes.rs | 6 +++--- rust/parquet/src/arrow/arrow_writer.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index cc4cd793cb2..a0a429ff973 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1228,7 +1228,7 @@ impl Field { /// Returns the dictionary ID, if this is a dictionary type #[inline] pub const fn dict_id(&self) -> Option { - match self.data_type { + match self.data_type.data_type { DataType::Dictionary(_, _) => Some(self.dict_id), _ => None, } @@ -1237,7 +1237,7 @@ impl Field { /// Returns whether this `Field`'s dictionary is ordered, if this is a dictionary type #[inline] pub const fn dict_is_ordered(&self) -> Option { - match self.data_type { + match self.data_type.data_type { DataType::Dictionary(_, _) => Some(self.dict_is_ordered), _ => None, } @@ -2669,7 +2669,7 @@ mod tests { last_name: Utf8, \ address: Struct([\ Field { name: \"street\", data_type: NullableDataType { data_type: Utf8, nullable: false }, dict_id: 0, dict_is_ordered: false }, \ - Field { name: \"zip\", data_type: NullableDataType { data_type: UInt16, nullable: false }, dict_id: 0, dict_is_ordered: false }]),\ + Field { name: \"zip\", data_type: NullableDataType { data_type: UInt16, nullable: false }, dict_id: 0, dict_is_ordered: false }]), \ interests: Dictionary(Int32, Utf8)") } diff --git a/rust/parquet/src/arrow/arrow_writer.rs b/rust/parquet/src/arrow/arrow_writer.rs index 75caabd3840..21d49e4e08c 100644 --- a/rust/parquet/src/arrow/arrow_writer.rs +++ b/rust/parquet/src/arrow/arrow_writer.rs @@ -387,10 +387,10 @@ mod tests { use std::sync::Arc; use arrow::array::*; + use arrow::buffer::Buffer; use arrow::datatypes::{DataType, Field, Schema, UInt32Type, UInt8Type}; use arrow::datatypes::{NullableDataType, ToByteSlice}; use arrow::record_batch::RecordBatch; - use arrow::{array::*, buffer::Buffer}; use crate::arrow::{ArrowReader, ParquetFileArrowReader}; use crate::file::{reader::SerializedFileReader, writer::InMemoryWriteableCursor};