-
Notifications
You must be signed in to change notification settings - Fork 2.9k
Read all row group metadata only when position column is projected #1716
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,6 +27,7 @@ | |
| import java.util.Set; | ||
| import java.util.function.Function; | ||
| import java.util.stream.Collectors; | ||
| import org.apache.iceberg.MetadataColumns; | ||
| import org.apache.iceberg.Schema; | ||
| import org.apache.iceberg.exceptions.RuntimeIOException; | ||
| import org.apache.iceberg.expressions.Expression; | ||
|
|
@@ -58,7 +59,8 @@ class ReadConf<T> { | |
| private final long totalValues; | ||
| private final boolean reuseContainers; | ||
| private final Integer batchSize; | ||
| private final long[] startRowPositions; | ||
|
|
||
| private long[] startRowPositions; | ||
|
|
||
| // List of column chunk metadata for each row group | ||
| private final List<Map<ColumnPath, ColumnChunkMetaData>> columnChunkMetaDataForRowGroups; | ||
|
|
@@ -89,8 +91,11 @@ class ReadConf<T> { | |
| this.shouldSkip = new boolean[rowGroups.size()]; | ||
|
|
||
| // Fetch all row groups starting positions to compute the row offsets of the filtered row groups | ||
| Map<Long, Long> offsetToStartPos = generateOffsetToStartPos(); | ||
| this.startRowPositions = new long[rowGroups.size()]; | ||
| Map<Long, Long> offsetToStartPos = null; | ||
| if (expectedSchema.findField(MetadataColumns.ROW_POSITION.fieldId()) != null) { | ||
| offsetToStartPos = generateOffsetToStartPos(); | ||
| this.startRowPositions = new long[rowGroups.size()]; | ||
|
||
| } | ||
|
|
||
| ParquetMetricsRowGroupFilter statsFilter = null; | ||
| ParquetDictionaryRowGroupFilter dictFilter = null; | ||
|
|
@@ -102,7 +107,9 @@ class ReadConf<T> { | |
| long computedTotalValues = 0L; | ||
| for (int i = 0; i < shouldSkip.length; i += 1) { | ||
| BlockMetaData rowGroup = rowGroups.get(i); | ||
| startRowPositions[i] = offsetToStartPos.get(rowGroup.getStartingPos()); | ||
| if (offsetToStartPos != null) { | ||
| startRowPositions[i] = offsetToStartPos.get(rowGroup.getStartingPos()); | ||
|
||
| } | ||
| boolean shouldRead = filter == null || ( | ||
| statsFilter.shouldRead(typeWithIds, rowGroup) && | ||
| dictFilter.shouldRead(typeWithIds, rowGroup, reader.getDictionaryReader(rowGroup))); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `rowPosition` will be ignored if the position column is not projected.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this file should change. If the determination not to use correct start positions is made in
`ReadConf`, then it should just set the position to 0 there. This can use whatever `rowGroupStartRowPos` contains.