-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Fix bug while writing parquet with empty lists of structs #1166
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -287,11 +287,18 @@ impl LevelInfo { | |
| .as_any() | ||
| .downcast_ref::<StructArray>() | ||
| .expect("Unable to get struct array"); | ||
| let struct_level = self.calculate_child_levels( | ||
| let mut struct_level = self.calculate_child_levels( | ||
| array_offsets, | ||
| array_mask, | ||
| LevelType::Struct(field.is_nullable()), | ||
| ); | ||
|
|
||
| // If the parent field is a list, calculate the children of the struct as if it | ||
| // were a list as well. | ||
| if matches!(self.level_type, LevelType::List(_)) { | ||
| struct_level.level_type = LevelType::List(false); | ||
| } | ||
|
|
||
| let mut struct_levels = vec![]; | ||
| struct_array | ||
| .columns() | ||
|
|
@@ -1675,4 +1682,95 @@ mod tests { | |
| }; | ||
| assert_eq!(list_level, &expected_level); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_list_of_struct() { | ||
| // define schema | ||
| let int_field = Field::new("a", DataType::Int32, true); | ||
| let item_field = | ||
| Field::new("item", DataType::Struct(vec![int_field.clone()]), true); | ||
| let list_field = Field::new("list", DataType::List(Box::new(item_field)), true); | ||
|
|
||
| let int_builder = Int32Builder::new(10); | ||
| let struct_builder = | ||
| StructBuilder::new(vec![int_field], vec![Box::new(int_builder)]); | ||
| let mut list_builder = ListBuilder::new(struct_builder); | ||
|
|
||
| // [{a: 1}], [], null, [null, null], [{a: null}], [{a: 2}] | ||
| // | ||
| // [{a: 1}] | ||
| let values = list_builder.values(); | ||
| values | ||
| .field_builder::<Int32Builder>(0) | ||
| .unwrap() | ||
| .append_value(1) | ||
| .unwrap(); | ||
| values.append(true).unwrap(); | ||
| list_builder.append(true).unwrap(); | ||
|
|
||
| // [] | ||
| list_builder.append(true).unwrap(); | ||
|
|
||
| // null | ||
| list_builder.append(false).unwrap(); | ||
|
|
||
| // [null, null] | ||
| let values = list_builder.values(); | ||
| values | ||
| .field_builder::<Int32Builder>(0) | ||
| .unwrap() | ||
| .append_null() | ||
| .unwrap(); | ||
| values.append(false).unwrap(); | ||
| values | ||
| .field_builder::<Int32Builder>(0) | ||
| .unwrap() | ||
| .append_null() | ||
| .unwrap(); | ||
| values.append(false).unwrap(); | ||
| list_builder.append(true).unwrap(); | ||
|
|
||
| // [{a: null}] | ||
| let values = list_builder.values(); | ||
| values | ||
| .field_builder::<Int32Builder>(0) | ||
| .unwrap() | ||
| .append_null() | ||
| .unwrap(); | ||
| values.append(true).unwrap(); | ||
| list_builder.append(true).unwrap(); | ||
|
|
||
| // [{a: 2}] | ||
| let values = list_builder.values(); | ||
| values | ||
| .field_builder::<Int32Builder>(0) | ||
| .unwrap() | ||
| .append_value(2) | ||
| .unwrap(); | ||
| values.append(true).unwrap(); | ||
| list_builder.append(true).unwrap(); | ||
|
|
||
| let array = Arc::new(list_builder.finish()); | ||
|
|
||
| let schema = Arc::new(Schema::new(vec![list_field])); | ||
|
|
||
| let rb = RecordBatch::try_new(schema, vec![array]).unwrap(); | ||
|
|
||
| let batch_level = LevelInfo::new(0, rb.num_rows()); | ||
| let list_level = | ||
| &batch_level.calculate_array_levels(rb.column(0), rb.schema().field(0))[0]; | ||
|
|
||
| let expected_level = LevelInfo { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it would be great if someone else who knew this better than I could double check this
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not super confident in this either--it would be great if someone with knowledge about the details of this code could chime in. The definition and repetition levels I compared with what the c++ parquet writer produces. I exported the above record batch and used the C++ parquet writer to generate a parquet file. I then used
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can confirm, these numbers check out 👍 |
||
| definition: vec![4, 1, 0, 2, 2, 3, 4], | ||
| repetition: Some(vec![0, 0, 0, 0, 1, 0, 0]), | ||
| array_offsets: vec![0, 1, 1, 1, 3, 4, 5], | ||
| array_mask: vec![true, true, false, false, false, false, true], | ||
| max_definition: 4, | ||
| level_type: LevelType::Primitive(true), | ||
| offset: 0, | ||
| length: 5, | ||
| }; | ||
|
|
||
| assert_eq!(list_level, &expected_level); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I double checked that the code matches the comments about what structure is intended to be created