[HUDI-3258] HoodieData for metadata index records, bloom and colstats init #4848
```diff
@@ -24,6 +24,7 @@
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.BaseFile;
 import org.apache.hudi.common.model.FileSlice;
+import org.apache.hudi.common.model.HoodieColumnRangeMetadata;
 import org.apache.hudi.common.model.HoodieDeltaWriteStat;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieLogFile;
```
```diff
@@ -71,6 +72,9 @@
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.stream.Collectors;
 
+import static org.apache.hudi.metadata.HoodieTableMetadataUtil.accumulateColumnRanges;
+import static org.apache.hudi.metadata.HoodieTableMetadataUtil.aggregateColumnStats;
+
 /**
  * IO Operation to append data onto an existing file.
  */
```
```diff
@@ -320,7 +324,7 @@ private void updateWriteStatus(HoodieDeltaWriteStat stat, AppendResult result) {
     statuses.add(this.writeStatus);
   }
 
-  private void processAppendResult(AppendResult result) {
+  private void processAppendResult(AppendResult result, List<IndexedRecord> recordList) {
     HoodieDeltaWriteStat stat = (HoodieDeltaWriteStat) this.writeStatus.getStat();
 
     if (stat.getPath() == null) {
```
```diff
@@ -339,6 +343,19 @@ private void processAppendResult(AppendResult result) {
       updateWriteStatus(stat, result);
     }
 
+    if (config.isMetadataIndexColumnStatsForAllColumnsEnabled()) {
+      Map<String, HoodieColumnRangeMetadata<Comparable>> columnRangeMap = stat.getRecordsStats().isPresent()
+          ? stat.getRecordsStats().get().getStats() : new HashMap<>();
```
Contributor: @codope that's what I was referring to with my comments regarding increased complexity in respect to […]. Now, when reading this code, the reader actually needs to understand what this additional […] is.
```diff
+      final String filePath = stat.getPath();
+      // initialize map of column name to map of stats name to stats value
+      Map<String, Map<String, Object>> columnToStats = new HashMap<>();
+      writeSchemaWithMetaFields.getFields().forEach(field -> columnToStats.putIfAbsent(field.name(), new HashMap<>()));
+      // collect stats for columns at once per record and keep iterating through every record to eventually find col stats for all fields.
+      recordList.forEach(record -> aggregateColumnStats(record, writeSchemaWithMetaFields, columnToStats, config.isConsistentLogicalTimestampEnabled()));
```
Contributor: Can we, instead of placing iteration and aggregation into separate methods, consolidate them in […]?
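One possible reading of this suggestion, sketched below with illustrative (non-Hudi) types and names: aggregate while iterating, folding each record straight into the per-column range map in a single pass, instead of first building an intermediate column-to-stats map and then walking the schema fields a second time.

```java
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical single-pass variant: each record updates the per-column range directly,
// with no intermediate column -> stats map. Types and names are illustrative, not Hudi's.
final class SinglePassColumnRanges {

  // Minimal stand-in for a per-column range (min / max / null count).
  static final class Range {
    Comparable<Object> min;
    Comparable<Object> max;
    long nullCount;
  }

  @SuppressWarnings("unchecked")
  static Map<String, Range> collectRanges(List<Map<String, Object>> records, List<String> columns) {
    Map<String, Range> columnRangeMap = new HashMap<>();
    for (Map<String, Object> record : records) {      // single pass over the records
      for (String column : columns) {                 // aggregation happens inline per column
        Range range = columnRangeMap.computeIfAbsent(column, c -> new Range());
        Object value = record.get(column);
        if (value == null) {
          range.nullCount++;
          continue;
        }
        Comparable<Object> v = (Comparable<Object>) value;
        if (range.min == null || v.compareTo(range.min) < 0) {
          range.min = v;
        }
        if (range.max == null || v.compareTo(range.max) > 0) {
          range.max = v;
        }
      }
    }
    return columnRangeMap;
  }
}
```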
```diff
+      writeSchemaWithMetaFields.getFields().forEach(field -> accumulateColumnRanges(field, filePath, columnRangeMap, columnToStats));
+      stat.setRecordsStats(new HoodieDeltaWriteStat.RecordsStats<>(columnRangeMap));
+    }
+
     resetWriteCounts();
     assert stat.getRuntimeStats() != null;
     LOG.info(String.format("AppendHandle for partitionPath %s filePath %s, took %d ms.", partitionPath,
```
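For readers of this hunk, here is a minimal, self-contained sketch of the two-phase flow the new block follows: phase one folds every appended record into a per-column map of running stats, phase two turns each column's stats into a range entry keyed by the log file path. The helper bodies, stats keys, and record/range types below are simplified stand-ins for illustration only, not Hudi's actual `aggregateColumnStats` / `accumulateColumnRanges` implementations or `HoodieColumnRangeMetadata`.

```java
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Simplified two-phase column-stats flow, mirroring the structure of the added block.
public class ColumnStatsSketch {

  // Phase 1 (mirrors the aggregateColumnStats call): update running min/max/null count
  // for every column of a single record.
  @SuppressWarnings("unchecked")
  static void aggregateColumnStats(Map<String, Object> record,
                                   List<String> columns,
                                   Map<String, Map<String, Object>> columnToStats) {
    for (String col : columns) {
      Map<String, Object> stats = columnToStats.computeIfAbsent(col, k -> new HashMap<>());
      Object value = record.get(col);
      if (value == null) {
        stats.merge("nullCount", 1L, (a, b) -> (Long) a + (Long) b);
        continue;
      }
      Comparable<Object> v = (Comparable<Object>) value;
      Comparable<Object> min = (Comparable<Object>) stats.get("min");
      Comparable<Object> max = (Comparable<Object>) stats.get("max");
      if (min == null || v.compareTo(min) < 0) {
        stats.put("min", v);
      }
      if (max == null || v.compareTo(max) > 0) {
        stats.put("max", v);
      }
    }
  }

  // Phase 2 (mirrors the accumulateColumnRanges call): fold one column's running stats
  // into the per-column range map for the given file.
  static void accumulateColumnRanges(String column,
                                     String filePath,
                                     Map<String, String> columnRangeMap,
                                     Map<String, Map<String, Object>> columnToStats) {
    Map<String, Object> stats = columnToStats.getOrDefault(column, new HashMap<>());
    columnRangeMap.put(column, String.format("file=%s, min=%s, max=%s, nulls=%s",
        filePath, stats.get("min"), stats.get("max"), stats.getOrDefault("nullCount", 0L)));
  }

  public static void main(String[] args) {
    List<String> columns = Arrays.asList("rider", "fare");

    Map<String, Object> r1 = new HashMap<>();
    r1.put("rider", "rider-1");
    r1.put("fare", 12.5);
    Map<String, Object> r2 = new HashMap<>();
    r2.put("rider", "rider-2");
    r2.put("fare", 3.0);
    List<Map<String, Object>> recordList = Arrays.asList(r1, r2);

    // Same shape as the hunk: seed per-column stats, iterate the records once,
    // then walk the columns to build the per-file range map.
    Map<String, Map<String, Object>> columnToStats = new HashMap<>();
    columns.forEach(c -> columnToStats.putIfAbsent(c, new HashMap<>()));
    recordList.forEach(r -> aggregateColumnStats(r, columns, columnToStats));

    Map<String, String> columnRangeMap = new HashMap<>();
    columns.forEach(c -> accumulateColumnRanges(c, "partition/log-file-1", columnRangeMap, columnToStats));
    columnRangeMap.forEach((c, range) -> System.out.println(c + " -> " + range));
  }
}
```

In the real code the map is seeded from `stat.getRecordsStats()` when present, presumably so that ranges produced by earlier appends on the same handle are extended rather than discarded.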
```diff
@@ -376,7 +393,7 @@ protected void appendDataAndDeleteBlocks(Map<HeaderMetadataType, String> header)
 
     if (blocks.size() > 0) {
       AppendResult appendResult = writer.appendBlocks(blocks);
-      processAppendResult(appendResult);
+      processAppendResult(appendResult, recordList);
       recordList.clear();
       keysToDelete.clear();
     }
```
```diff
@@ -419,7 +436,7 @@ public List<WriteStatus> close() {
       // update final size, once for all log files
       // TODO we can actually deduce file size purely from AppendResult (based on offset and size
       // of the appended block)
-      for (WriteStatus status: statuses) {
+      for (WriteStatus status : statuses) {
         long logFileSize = FSUtils.getFileSize(fs, new Path(config.getBasePath(), status.getStat().getPath()));
         status.getStat().setFileSizeInBytes(logFileSize);
       }
```