-
Notifications
You must be signed in to change notification settings - Fork 96
GH-698: Improve and fix Avro read consumers #718
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 31 commits
1872d41
d07b417
d2b33ae
1766e8b
6d77139
47572c0
331dabc
5361009
c2bd7b3
3d9faf6
8897832
eaef6c8
0f94366
2cfb81a
f7aba15
dde2dca
8525421
1ac4fae
b9cf11b
c61cc3a
84ac2c7
0fd70de
737ca3b
4e6efdb
8350f40
6788d51
7f7c4e7
e0550da
0eb69a3
bccc14a
577f8aa
1a03b8a
141b6fd
3c173ec
f8aa184
bff71c4
74717c3
6c75d87
9aaf5cf
62d08a9
d6538e6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -332,7 +332,17 @@ private static <T> T buildBaseTypeSchema( | |
|
|
||
| case List: | ||
| case FixedSizeList: | ||
| return buildArraySchema(builder.array(), field, namespace); | ||
| // Arrow uses "$data$" as the field name for list items, that is not a valid Avro name | ||
| Field itemField = field.getChildren().get(0); | ||
| if (ListVector.DATA_VECTOR_NAME.equals(itemField.getName())) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we perhaps want to check for invalid names more generally and mangle/normalize them?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or just normalize all field names to something consistent in Avro?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, I think for list / map types using the constant defined names for children makes sense, with "item" instead of "$data$" for list items. More generally, we could normalise illegal chars to "_" to match the Avro name rules. Per my understanding similar rules are already enforced in C++, but are not part of the Arrow spec or Java implementation. Very happy to put the normalisation in, it's probably a more useful behaviour than throwing an error in the adapter. Would you like me to do it? |
||
| Field safeItemField = | ||
| new Field("item", itemField.getFieldType(), itemField.getChildren()); | ||
| Field safeListField = | ||
| new Field(field.getName(), field.getFieldType(), List.of(safeItemField)); | ||
| return buildArraySchema(builder.array(), safeListField, namespace); | ||
| } else { | ||
| return buildArraySchema(builder.array(), field, namespace); | ||
| } | ||
|
|
||
| case Map: | ||
| return buildMapSchema(builder.map(), field, namespace); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,6 +41,9 @@ public class AvroToArrowConfig { | |
| /** The field names which to skip when reading decoder values. */ | ||
| private final Set<String> skipFieldNames; | ||
|
|
||
| /** Interpret unions where one type is null as a nullable field. */ | ||
| private final boolean handleNullable; | ||
|
|
||
| /** | ||
| * Instantiate an instance. | ||
| * | ||
|
|
@@ -64,6 +67,36 @@ public class AvroToArrowConfig { | |
| this.targetBatchSize = targetBatchSize; | ||
| this.provider = provider; | ||
| this.skipFieldNames = skipFieldNames; | ||
|
|
||
| // Default values for optional parameters | ||
| handleNullable = false; // Match original behavior by default | ||
| } | ||
|
|
||
| /** | ||
| * Instantiate an instance. | ||
| * | ||
| * @param allocator The memory allocator to construct the Arrow vectors with. | ||
| * @param targetBatchSize The maximum rowCount to read each time when partially convert data. | ||
| * @param provider The dictionary provider used for enum type, adapter will update this provider. | ||
| * @param skipFieldNames Field names which to skip. | ||
| */ | ||
| AvroToArrowConfig( | ||
| BufferAllocator allocator, | ||
| int targetBatchSize, | ||
| DictionaryProvider.MapDictionaryProvider provider, | ||
| Set<String> skipFieldNames, | ||
| boolean handleNullable) { | ||
|
|
||
| Preconditions.checkArgument( | ||
| targetBatchSize == AvroToArrowVectorIterator.NO_LIMIT_BATCH_SIZE || targetBatchSize > 0, | ||
| "invalid targetBatchSize: %s", | ||
| targetBatchSize); | ||
|
|
||
| this.allocator = allocator; | ||
| this.targetBatchSize = targetBatchSize; | ||
| this.provider = provider; | ||
| this.skipFieldNames = skipFieldNames; | ||
| this.handleNullable = handleNullable; | ||
| } | ||
|
|
||
| public BufferAllocator getAllocator() { | ||
|
|
@@ -81,4 +114,8 @@ public DictionaryProvider.MapDictionaryProvider getProvider() { | |
| public Set<String> getSkipFieldNames() { | ||
| return skipFieldNames; | ||
| } | ||
|
|
||
| public boolean isHandleNullable() { | ||
|
||
| return handleNullable; | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The funny thing is, arrow-java shouldn't be doing that, it was just never corrected...