@@ -24,6 +24,34 @@ fn make_arrow_error(s: String) -> Error {
24
24
Error :: Arrow ( arrow_schema:: ArrowError :: InvalidArgumentError ( s) )
25
25
}
26
26
27
+ /// Capture the compatibility between two data-types, as passed to [`ensure_data_types`]
28
+ pub ( crate ) enum DataTypeCompat {
29
+ /// The two types are the same
30
+ Identical ,
31
+ /// What is read from parquet needs to be cast to the associated type
32
+ NeedsCast ( ArrowDataType ) ,
33
+ /// Types are compatible, but are nested types. This is used when comparing types where casting
34
+ /// is not desired (i.e. in the expression evaluator)
35
+ Nested ,
36
+ }
37
+
38
+ // Check if two types can be cast
39
+ fn check_cast_compat (
40
+ source_type : & ArrowDataType ,
41
+ target_type : ArrowDataType ,
42
+ ) -> DeltaResult < DataTypeCompat > {
43
+ match ( source_type, & target_type) {
44
+ ( & ArrowDataType :: Timestamp ( _, _) , & ArrowDataType :: Timestamp ( _, _) ) => {
45
+ // timestamps are able to be cast between each other
46
+ Ok ( DataTypeCompat :: NeedsCast ( target_type) )
47
+ }
48
+ _ => Err ( make_arrow_error ( format ! (
49
+ "Incorrect datatype. Expected {}, got {}" ,
50
+ target_type, source_type
51
+ ) ) ) , //| (DataType::Primitive(PrimitiveType::TimestampNtz), ArrowDataType::Timestamp(_, _)) => {
52
+ }
53
+ }
54
+
27
55
/// Ensure a kernel data type matches an arrow data type. This only ensures that the actual "type"
28
56
/// is the same, but does so recursively into structs, and ensures lists and maps have the correct
29
57
/// associated types as well. This returns an `Ok(())` if the types are compatible, or an error if
@@ -33,31 +61,28 @@ fn make_arrow_error(s: String) -> Error {
33
61
pub ( crate ) fn ensure_data_types (
34
62
kernel_type : & DataType ,
35
63
arrow_type : & ArrowDataType ,
36
- ) -> DeltaResult < ( ) > {
64
+ ) -> DeltaResult < DataTypeCompat > {
37
65
match ( kernel_type, arrow_type) {
38
66
( DataType :: Primitive ( _) , _) if arrow_type. is_primitive ( ) => {
39
67
let converted_type: ArrowDataType = kernel_type. try_into ( ) ?;
40
68
if & converted_type == arrow_type {
41
- Ok ( ( ) )
69
+ Ok ( DataTypeCompat :: Identical )
42
70
} else {
43
- Err ( make_arrow_error ( format ! (
44
- "Incorrect datatype. Expected {}, got {}" ,
45
- converted_type, arrow_type
46
- ) ) )
71
+ check_cast_compat ( arrow_type, converted_type)
47
72
}
48
73
}
49
74
( DataType :: Primitive ( PrimitiveType :: Boolean ) , ArrowDataType :: Boolean )
50
75
| ( DataType :: Primitive ( PrimitiveType :: String ) , ArrowDataType :: Utf8 )
51
76
| ( DataType :: Primitive ( PrimitiveType :: Binary ) , ArrowDataType :: Binary ) => {
52
77
// strings, bools, and binary aren't primitive in arrow
53
- Ok ( ( ) )
78
+ Ok ( DataTypeCompat :: Identical )
54
79
}
55
80
(
56
81
DataType :: Primitive ( PrimitiveType :: Decimal ( kernel_prec, kernel_scale) ) ,
57
82
ArrowDataType :: Decimal128 ( arrow_prec, arrow_scale) ,
58
83
) if arrow_prec == kernel_prec && * arrow_scale == * kernel_scale as i8 => {
59
84
// decimal isn't primitive in arrow. cast above is okay as we limit range
60
- Ok ( ( ) )
85
+ Ok ( DataTypeCompat :: Identical )
61
86
}
62
87
( DataType :: Array ( inner_type) , ArrowDataType :: List ( arrow_list_type) ) => {
63
88
let kernel_array_type = & inner_type. element_type ;
@@ -81,7 +106,7 @@ pub(crate) fn ensure_data_types(
81
106
"Arrow map struct didn't have a value type" . to_string ( ) ,
82
107
) ) ;
83
108
}
84
- Ok ( ( ) )
109
+ Ok ( DataTypeCompat :: Nested )
85
110
} else {
86
111
Err ( make_arrow_error (
87
112
"Arrow map type wasn't a struct." . to_string ( ) ,
@@ -111,7 +136,7 @@ pub(crate) fn ensure_data_types(
111
136
kernel_field_names, arrow_field_names,
112
137
) )
113
138
} ) ;
114
- Ok ( ( ) )
139
+ Ok ( DataTypeCompat :: Nested )
115
140
}
116
141
_ => Err ( make_arrow_error ( format ! (
117
142
"Incorrect datatype. Expected {}, got {}" ,
@@ -211,52 +236,65 @@ pub(crate) fn ensure_data_types(
211
236
*/
212
237
213
238
/// Reordering is specified as a tree. Each level is a vec of `ReorderIndex`s. Each element's index
214
- /// represents a column that will be in the read parquet data at that level and index. The `index() `
215
- /// of the element is the position that the column should appear in the final output. If it is a
216
- /// `Child` variant, then at that index there is a `Struct` whose ordering is specified by the
217
- /// values in the associated `Vec` according to these same rules .
239
+ /// represents a column that will be in the read parquet data at that level and index. The `index`
240
+ /// of the element is the position that the column should appear in the final output. The `transform`
241
+ /// indicates what, if any, transforms are needed. See the docs for [`ReorderIndexTransform`] for the
242
+ /// meaning .
218
243
#[ derive( Debug , PartialEq ) ]
219
244
pub ( crate ) struct ReorderIndex {
220
245
pub ( crate ) index : usize ,
221
- kind : ReorderIndexKind ,
246
+ transform : ReorderIndexTransform ,
222
247
}
223
248
224
249
#[ derive( Debug , PartialEq ) ]
225
- pub ( crate ) enum ReorderIndexKind {
250
+ pub ( crate ) enum ReorderIndexTransform {
251
+ /// For a non-nested type, indicates that we need to cast to the contained type
252
+ Cast ( ArrowDataType ) ,
253
+ /// Used for struct/list/map. Potentially transform child fields using contained reordering
226
254
Child ( Vec < ReorderIndex > ) ,
227
- Index ,
255
+ /// No work needed to transform this data
256
+ None ,
257
+ /// Data is missing, fill in with a null column
228
258
Missing ( ArrowFieldRef ) ,
229
259
}
230
260
231
261
impl ReorderIndex {
262
+ fn new_cast ( index : usize , target : ArrowDataType ) -> Self {
263
+ ReorderIndex {
264
+ index,
265
+ transform : ReorderIndexTransform :: Cast ( target) ,
266
+ }
267
+ }
268
+
232
269
fn new_child ( index : usize , children : Vec < ReorderIndex > ) -> Self {
233
270
ReorderIndex {
234
271
index,
235
- kind : ReorderIndexKind :: Child ( children) ,
272
+ transform : ReorderIndexTransform :: Child ( children) ,
236
273
}
237
274
}
238
275
239
276
fn new_index ( index : usize ) -> Self {
240
277
ReorderIndex {
241
278
index,
242
- kind : ReorderIndexKind :: Index ,
279
+ transform : ReorderIndexTransform :: None ,
243
280
}
244
281
}
245
282
246
283
fn new_missing ( index : usize , field : ArrowFieldRef ) -> Self {
247
284
ReorderIndex {
248
285
index,
249
- kind : ReorderIndexKind :: Missing ( field) ,
286
+ transform : ReorderIndexTransform :: Missing ( field) ,
250
287
}
251
288
}
252
289
253
- /// Check if this reordering contains a `Missing` variant anywhere. See comment below on
290
+ /// Check if this reordering requires a transformation anywhere. See comment below on
254
291
/// [`is_ordered`] to understand why this is needed.
255
- fn contains_missing ( & self ) -> bool {
256
- match self . kind {
257
- ReorderIndexKind :: Child ( ref children) => is_ordered ( children) ,
258
- ReorderIndexKind :: Index => true ,
259
- ReorderIndexKind :: Missing ( _) => false ,
292
+ fn needs_transform ( & self ) -> bool {
293
+ match self . transform {
294
+ ReorderIndexTransform :: Cast ( _) => true ,
295
+ ReorderIndexTransform :: Child ( ref children) => is_ordered ( children) ,
296
+ ReorderIndexTransform :: None => true ,
297
+ ReorderIndexTransform :: Missing ( _) => false ,
260
298
}
261
299
}
262
300
}
@@ -410,10 +448,18 @@ fn get_indices(
410
448
if let Some ( ( index, _, requested_field) ) =
411
449
requested_schema. fields . get_full ( field. name ( ) )
412
450
{
413
- ensure_data_types ( & requested_field. data_type , field. data_type ( ) ) ?;
451
+ match ensure_data_types ( & requested_field. data_type , field. data_type ( ) ) ? {
452
+ DataTypeCompat :: Identical =>
453
+ reorder_indices. push ( ReorderIndex :: new_index ( index) ) ,
454
+ DataTypeCompat :: NeedsCast ( target) =>
455
+ reorder_indices. push ( ReorderIndex :: new_cast ( index, target) ) ,
456
+ DataTypeCompat :: Nested => return
457
+ Err ( Error :: generic (
458
+ "Comparing nested types in get_indices. This is a kernel bug, please report"
459
+ ) )
460
+ }
414
461
found_fields. insert ( requested_field. name ( ) ) ;
415
462
mask_indices. push ( parquet_offset + parquet_index) ;
416
- reorder_indices. push ( ReorderIndex :: new_index ( index) ) ;
417
463
}
418
464
}
419
465
}
@@ -491,13 +537,13 @@ fn is_ordered(requested_ordering: &[ReorderIndex]) -> bool {
491
537
return true ;
492
538
}
493
539
// we have >=1 element. check that the first element is ordered
494
- if !requested_ordering[ 0 ] . contains_missing ( ) {
540
+ if !requested_ordering[ 0 ] . needs_transform ( ) {
495
541
return false ;
496
542
}
497
543
// now check that all elements are ordered wrt. each other, and are internally ordered
498
544
requested_ordering
499
545
. windows ( 2 )
500
- . all ( |ri| ( ri[ 0 ] . index < ri[ 1 ] . index ) && ri[ 1 ] . contains_missing ( ) )
546
+ . all ( |ri| ( ri[ 0 ] . index < ri[ 1 ] . index ) && ! ri[ 1 ] . needs_transform ( ) )
501
547
}
502
548
503
549
// we use this as a placeholder for an array and its associated field. We can fill in a Vec of None
@@ -524,8 +570,19 @@ pub(crate) fn reorder_struct_array(
524
570
for ( parquet_position, reorder_index) in requested_ordering. iter ( ) . enumerate ( ) {
525
571
// for each item, reorder_index.index() tells us where to put it, and its position in
526
572
// requested_ordering tells us where it is in the parquet data
527
- match & reorder_index. kind {
528
- ReorderIndexKind :: Child ( children) => {
573
+ match & reorder_index. transform {
574
+ ReorderIndexTransform :: Cast ( target) => {
575
+ let source_col = input_cols[ parquet_position] . as_ref ( ) ;
576
+ let new_col = Arc :: new ( arrow_cast:: cast:: cast ( source_col, target) ?) ;
577
+ let new_field = Arc :: new (
578
+ input_fields[ parquet_position]
579
+ . as_ref ( )
580
+ . clone ( )
581
+ . with_data_type ( new_col. data_type ( ) . clone ( ) ) ,
582
+ ) ;
583
+ final_fields_cols[ reorder_index. index ] = Some ( ( new_field, new_col) ) ;
584
+ }
585
+ ReorderIndexTransform :: Child ( children) => {
529
586
match input_cols[ parquet_position] . data_type ( ) {
530
587
ArrowDataType :: Struct ( _) => {
531
588
let struct_array = input_cols[ parquet_position] . as_struct ( ) . clone ( ) ;
@@ -563,13 +620,13 @@ pub(crate) fn reorder_struct_array(
563
620
}
564
621
}
565
622
}
566
- ReorderIndexKind :: Index => {
623
+ ReorderIndexTransform :: None => {
567
624
final_fields_cols[ reorder_index. index ] = Some ( (
568
625
input_fields[ parquet_position] . clone ( ) , // cheap Arc clone
569
626
input_cols[ parquet_position] . clone ( ) , // cheap Arc clone
570
627
) ) ;
571
628
}
572
- ReorderIndexKind :: Missing ( field) => {
629
+ ReorderIndexTransform :: Missing ( field) => {
573
630
let null_array = Arc :: new ( new_null_array ( field. data_type ( ) , num_rows) ) ;
574
631
let field = field. clone ( ) ; // cheap Arc clone
575
632
final_fields_cols[ reorder_index. index ] = Some ( ( field, null_array) ) ;
0 commit comments