Spark: Fix nested struct pruning #2877
In StaticDataTask:

@@ -30,18 +30,26 @@
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
 import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.util.StructProjection;

 class StaticDataTask implements DataTask {

-  static <T> DataTask of(InputFile metadata, Iterable<T> values, Function<T, Row> transform) {
+  static <T> DataTask of(InputFile metadata, Iterable<T> values, Function<T, Row> transform,
+      Schema original, Schema projected) {
     return new StaticDataTask(metadata,
-        Lists.newArrayList(Iterables.transform(values, transform::apply)).toArray(new Row[0]));
+        Lists.newArrayList(Iterables.transform(values, transform::apply)).toArray(new Row[0]),
+        original,
+        projected);
   }

   private final DataFile metadataFile;
   private final StructLike[] rows;
+  private final Schema original;
+  private final Schema projectedSchema;

-  private StaticDataTask(InputFile metadata, StructLike[] rows) {
+  private StaticDataTask(InputFile metadata, StructLike[] rows, Schema original, Schema projectedSchema) {
+    this.original = original;
+    this.projectedSchema = projectedSchema;
     this.metadataFile = DataFiles.builder(PartitionSpec.unpartitioned())
         .withInputFile(metadata)
         .withRecordCount(rows.length)

@@ -57,7 +65,9 @@ public List<DeleteFile> deletes() {

   @Override
   public CloseableIterable<StructLike> rows() {
-    return CloseableIterable.withNoopClose(Arrays.asList(rows));
+    StructProjection projection = StructProjection.create(original, projectedSchema);
+    Iterable<StructLike> projectedRows = Iterables.transform(Arrays.asList(rows), projection::wrap);
+    return CloseableIterable.withNoopClose(projectedRows);
   }

   @Override
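The reworked rows() method delegates pruning to StructProjection. Below is a minimal, self-contained sketch of what that class does for a nested struct (not part of the PR; the schema, field names, and use of GenericRecord from the iceberg-data module are illustrative assumptions):

import org.apache.iceberg.Schema;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.StructProjection;

public class StructProjectionSketch {
  public static void main(String[] args) {
    // Full schema with a nested struct, similar to what a metadata table exposes.
    Schema original = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.required(2, "stats", Types.StructType.of(
            Types.NestedField.required(3, "record_count", Types.LongType.get()),
            Types.NestedField.required(4, "file_count", Types.IntegerType.get()))));

    // Projection that keeps only one field inside the nested struct.
    Schema projected = original.select("stats.record_count");

    GenericRecord stats = GenericRecord.create(original.findField("stats").type().asStructType());
    stats.setField("record_count", 100L);
    stats.setField("file_count", 5);

    GenericRecord row = GenericRecord.create(original.asStruct());
    row.setField("id", 1L);
    row.setField("stats", stats);

    // wrap() returns a StructLike view matching the projected schema, pruning
    // inside the nested struct -- the behavior the new rows() relies on.
    StructProjection projection = StructProjection.create(original, projected);
    StructLike prunedStats = projection.wrap(row).get(0, StructLike.class);
    System.out.println(prunedStats.get(0, Long.class));  // prints 100
  }
}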
In the Spark row reader:

@@ -186,28 +186,27 @@ private CloseableIterable<InternalRow> newOrcIterable(
   }

   private CloseableIterable<InternalRow> newDataIterable(DataTask task, Schema readSchema) {
-    StructInternalRow row = new StructInternalRow(tableSchema.asStruct());
+    StructInternalRow row = new StructInternalRow(readSchema.asStruct());
     CloseableIterable<InternalRow> asSparkRows = CloseableIterable.transform(
         task.asDataTask().rows(), row::setStruct);
-    return CloseableIterable.transform(
-        asSparkRows, APPLY_PROJECTION.bind(projection(readSchema, tableSchema))::invoke);
+    return asSparkRows;
   }

   private static UnsafeProjection projection(Schema finalSchema, Schema readSchema) {
-    StructType struct = SparkSchemaUtil.convert(readSchema);
+    StructType readStruct = SparkSchemaUtil.convert(readSchema);

-    List<AttributeReference> refs = JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava();
-    List<Attribute> attrs = Lists.newArrayListWithExpectedSize(struct.fields().length);
+    List<AttributeReference> readReferences = JavaConverters.seqAsJavaListConverter(readStruct.toAttributes()).asJava();
+    List<Attribute> attrs = Lists.newArrayListWithExpectedSize(readStruct.fields().length);
     List<org.apache.spark.sql.catalyst.expressions.Expression> exprs =
-        Lists.newArrayListWithExpectedSize(struct.fields().length);
+        Lists.newArrayListWithExpectedSize(readStruct.fields().length);

-    for (AttributeReference ref : refs) {
+    for (AttributeReference ref : readReferences) {
       attrs.add(ref.toAttribute());
     }

     for (Types.NestedField field : finalSchema.columns()) {
-      int indexInReadSchema = struct.fieldIndex(field.name());
-      exprs.add(refs.get(indexInReadSchema));
+      int indexInReadSchema = readStruct.fieldIndex(field.name());
+      exprs.add(readReferences.get(indexInReadSchema));
     }

     return UnsafeProjection.create(
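The reader-side change builds the StructInternalRow from readSchema instead of tableSchema and drops the trailing UnsafeProjection pass. The mismatch being fixed shows up when both schemas are converted to Spark types: once a nested struct is pruned, the two conversions disagree, so rows materialized against the full table schema no longer match the struct type Spark requested. A hedged, self-contained sketch (the schema and field names are made up for illustration):

import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.types.StructType;

public class NestedPruningSketch {
  public static void main(String[] args) {
    Schema tableSchema = new Schema(
        Types.NestedField.required(1, "partition", Types.StructType.of(
            Types.NestedField.required(2, "bucket", Types.IntegerType.get()),
            Types.NestedField.required(3, "day", Types.DateType.get()))));

    // The reader asks only for partition.bucket; the nested struct is pruned.
    Schema readSchema = tableSchema.select("partition.bucket");

    StructType full = SparkSchemaUtil.convert(tableSchema);
    StructType pruned = SparkSchemaUtil.convert(readSchema);

    // Roughly: struct<partition:struct<bucket:int,day:date>>
    //     vs.  struct<partition:struct<bucket:int>>
    System.out.println(full.catalogString());
    System.out.println(pruned.catalogString());
  }
}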
Review comment: What about allowing the projection if the fields are primitives, or if the entire struct is projected? That would cover the cases that are currently supported and avoid introducing a new pruning bug to replace the one you're fixing (where nested structs don't match the requested struct schema).
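One way to read that suggestion: keep the UnsafeProjection path, but only when re-ordering top-level columns is safe. A hypothetical guard (not in this PR; the method name and placement are assumptions) could look like:

// Hypothetical check sketching the reviewer's suggestion: the top-level
// re-ordering projection is only safe when every requested column is a
// primitive, or its struct is projected in full (no nested pruning).
private static boolean canApplyTopLevelProjection(Schema finalSchema, Schema readSchema) {
  for (Types.NestedField field : finalSchema.columns()) {
    Types.NestedField readField = readSchema.findField(field.fieldId());
    boolean primitive = field.type().isPrimitiveType();
    boolean fullyProjected = readField != null && field.type().equals(readField.type());
    if (!primitive && !fullyProjected) {
      return false;  // nested struct was pruned; a reordered row would not match
    }
  }
  return true;
}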