-
Notifications
You must be signed in to change notification settings - Fork 2.5k
[HUDI-5434] Fix archival in metadata table to not rely on completed rollback or clean in data table #7580
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[HUDI-5434] Fix archival in metadata table to not rely on completed rollback or clean in data table #7580
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,13 +28,13 @@ | |
| import org.apache.hudi.common.fs.HoodieWrapperFileSystem; | ||
| import org.apache.hudi.common.fs.StorageSchemes; | ||
| import org.apache.hudi.common.model.HoodieArchivedLogFile; | ||
| import org.apache.hudi.common.model.HoodieAvroIndexedRecord; | ||
| import org.apache.hudi.common.model.HoodieAvroPayload; | ||
| import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; | ||
| import org.apache.hudi.common.model.HoodieAvroIndexedRecord; | ||
| import org.apache.hudi.common.model.HoodieLogFile; | ||
| import org.apache.hudi.common.model.HoodieRecord; | ||
| import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; | ||
| import org.apache.hudi.common.model.HoodieTableType; | ||
| import org.apache.hudi.common.model.HoodieRecord; | ||
| import org.apache.hudi.common.table.HoodieTableMetaClient; | ||
| import org.apache.hudi.common.table.log.HoodieLogFormat; | ||
| import org.apache.hudi.common.table.log.HoodieLogFormat.Writer; | ||
|
|
@@ -46,6 +46,7 @@ | |
| import org.apache.hudi.common.table.timeline.HoodieInstant; | ||
| import org.apache.hudi.common.table.timeline.HoodieTimeline; | ||
| import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; | ||
| import org.apache.hudi.common.table.timeline.TimelineUtils; | ||
| import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; | ||
| import org.apache.hudi.common.util.ClusteringUtils; | ||
| import org.apache.hudi.common.util.CollectionUtils; | ||
|
|
@@ -514,24 +515,27 @@ private Stream<HoodieInstant> getInstantsToArchive() throws IOException { | |
| .setBasePath(HoodieTableMetadata.getDatasetBasePath(config.getBasePath())) | ||
| .setConf(metaClient.getHadoopConf()) | ||
| .build(); | ||
| Option<HoodieInstant> earliestActiveDatasetCommit = dataMetaClient.getActiveTimeline().firstInstant(); | ||
|
|
||
| if (config.shouldArchiveBeyondSavepoint()) { | ||
| // There are chances that there could be holes in the timeline due to archival and savepoint interplay. | ||
| // So, the first non-savepoint commit in the data timeline is considered as beginning of the active timeline. | ||
| Option<HoodieInstant> firstNonSavepointCommit = dataMetaClient.getActiveTimeline().getFirstNonSavepointCommit(); | ||
| if (firstNonSavepointCommit.isPresent()) { | ||
| String firstNonSavepointCommitTime = firstNonSavepointCommit.get().getTimestamp(); | ||
| instants = instants.filter(instant -> | ||
| compareTimestamps(instant.getTimestamp(), LESSER_THAN, firstNonSavepointCommitTime)); | ||
| } | ||
| } else { | ||
| // Do not archive the commits that live in data set active timeline. | ||
| // This is required by metadata table, see HoodieTableMetadataUtil#processRollbackMetadata for details. | ||
| if (earliestActiveDatasetCommit.isPresent()) { | ||
| instants = instants.filter(instant -> | ||
| compareTimestamps(instant.getTimestamp(), HoodieTimeline.LESSER_THAN, earliestActiveDatasetCommit.get().getTimestamp())); | ||
| } | ||
| Option<HoodieInstant> qualifiedEarliestInstant = | ||
| TimelineUtils.getEarliestInstantForMetadataArchival( | ||
| dataMetaClient.getActiveTimeline(), config.shouldArchiveBeyondSavepoint()); | ||
|
|
||
| // Do not archive the instants after the earliest commit (COMMIT, DELTA_COMMIT, and | ||
| // REPLACE_COMMIT only, considering non-savepoint commit only if enabling archive | ||
| // beyond savepoint) and the earliest inflight instant (all actions). | ||
| // This is required by metadata table, see HoodieTableMetadataUtil#processRollbackMetadata | ||
| // for details. | ||
| // Note that we cannot blindly use the earliest instant of all actions, because CLEAN and | ||
| // ROLLBACK instants are archived separately apart from commits (check | ||
| // HoodieTimelineArchiver#getCleanInstantsToArchive). If we do so, a very old completed | ||
| // CLEAN or ROLLBACK instant can block the archive of metadata table timeline and causes | ||
| // the active timeline of metadata table to be extremely long, leading to performance issues | ||
| // for loading the timeline. | ||
| if (qualifiedEarliestInstant.isPresent()) { | ||
| instants = instants.filter(instant -> | ||
| compareTimestamps( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have created a Jira ticket to test archival beyond savepoints. I guess we should not be deleting the commits that are savepointed.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sounds good. This particular logic is for the metadata table's archival only. Nevertheless, we should still thoroughly test archival beyond savepoints, orthogonal to this PR. |
||
| instant.getTimestamp(), | ||
| HoodieTimeline.LESSER_THAN, | ||
| qualifiedEarliestInstant.get().getTimestamp())); | ||
| } | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.