-
Notifications
You must be signed in to change notification settings - Fork 2.5k
[HUDI-3782] Fixing table config when any of the index is disabled #5222
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -95,6 +95,7 @@ | |
| import static org.apache.hudi.metadata.HoodieTableMetadata.METADATA_TABLE_NAME_SUFFIX; | ||
| import static org.apache.hudi.metadata.HoodieTableMetadata.SOLO_COMMIT_TIMESTAMP; | ||
| import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getCompletedMetadataPartitions; | ||
| import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getInflightAndCompletedMetadataPartitions; | ||
| import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getInflightMetadataPartitions; | ||
|
|
||
| /** | ||
|
|
@@ -377,7 +378,25 @@ protected <T extends SpecificRecordBase> void initializeIfNeeded(HoodieTableMeta | |
| if (initializeFromFilesystem(dataMetaClient, inflightInstantTimestamp)) { | ||
| metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.INITIALIZE_STR, timer.endTimer())); | ||
| } | ||
| return; | ||
| } | ||
|
|
||
| // if metadata table exists, then check if any of the enabled partition types needs to be initialized | ||
| Set<String> inflightAndCompletedPartitions = getInflightAndCompletedMetadataPartitions(dataMetaClient.getTableConfig()); | ||
| List<MetadataPartitionType> partitionsToInit = this.enabledPartitionTypes.stream() | ||
| .filter(p -> !inflightAndCompletedPartitions.contains(p.getPartitionPath()) && !MetadataPartitionType.FILES.equals(p)) | ||
| .collect(Collectors.toList()); | ||
|
|
||
| // if there are no partitions to initialize or there is a pending operation, then don't initialize in this round | ||
| if (partitionsToInit.isEmpty() || anyPendingDataInstant(dataMetaClient, inflightInstantTimestamp)) { | ||
| return; | ||
| } | ||
|
|
||
| String createInstantTime = getInitialCommitInstantTime(dataMetaClient); | ||
nsivabalan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| initTableMetadata(); // re-init certain flags in BaseTableMetadata | ||
| initializeEnabledFileGroups(dataMetaClient, createInstantTime, partitionsToInit); | ||
| initialCommit(createInstantTime, partitionsToInit); | ||
| updateInitializedPartitionsInTableConfig(partitionsToInit); | ||
| } | ||
|
|
||
| private <T extends SpecificRecordBase> boolean metadataTableExists(HoodieTableMetaClient dataMetaClient, | ||
|
|
@@ -502,26 +521,11 @@ private <T extends SpecificRecordBase> boolean isCommitRevertedByInFlightAction( | |
| */ | ||
| private boolean initializeFromFilesystem(HoodieTableMetaClient dataMetaClient, | ||
| Option<String> inflightInstantTimestamp) throws IOException { | ||
| ValidationUtils.checkState(enabled, "Metadata table cannot be initialized as it is not enabled"); | ||
|
|
||
| // We can only initialize if there are no pending operations on the dataset | ||
| List<HoodieInstant> pendingDataInstant = dataMetaClient.getActiveTimeline() | ||
| .getInstants().filter(i -> !i.isCompleted()) | ||
| .filter(i -> !inflightInstantTimestamp.isPresent() || !i.getTimestamp().equals(inflightInstantTimestamp.get())) | ||
| .collect(Collectors.toList()); | ||
|
|
||
| if (!pendingDataInstant.isEmpty()) { | ||
| metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BOOTSTRAP_ERR_STR, 1)); | ||
| LOG.warn("Cannot initialize metadata table as operation(s) are in progress on the dataset: " | ||
| + Arrays.toString(pendingDataInstant.toArray())); | ||
| if (anyPendingDataInstant(dataMetaClient, inflightInstantTimestamp)) { | ||
| return false; | ||
| } | ||
|
|
||
| // If there is no commit on the dataset yet, use the SOLO_COMMIT_TIMESTAMP as the instant time for initial commit | ||
| // Otherwise, we use the timestamp of the latest completed action. | ||
| String createInstantTime = dataMetaClient.getActiveTimeline().filterCompletedInstants() | ||
| .getReverseOrderedInstants().findFirst().map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP); | ||
| LOG.info("Creating a new metadata table in " + metadataWriteConfig.getBasePath() + " at instant " + createInstantTime); | ||
| String createInstantTime = getInitialCommitInstantTime(dataMetaClient); | ||
|
|
||
| initializeMetaClient(dataWriteConfig.getMetadataConfig().populateMetaFields()); | ||
| initTableMetadata(); | ||
|
|
@@ -535,15 +539,38 @@ private boolean initializeFromFilesystem(HoodieTableMetaClient dataMetaClient, | |
| enabledPartitionTypes = this.enabledPartitionTypes; | ||
| } | ||
| initializeEnabledFileGroups(dataMetaClient, createInstantTime, enabledPartitionTypes); | ||
|
|
||
| // During cold startup, the list of files to be committed can be huge. So creating a HoodieCommitMetadata out | ||
| // of these large number of files and calling the existing update(HoodieCommitMetadata) function does not scale | ||
| // well. Hence, we have a special commit just for the initialization scenario. | ||
| initialCommit(createInstantTime, enabledPartitionTypes); | ||
| updateInitializedPartitionsInTableConfig(enabledPartitionTypes); | ||
| return true; | ||
| } | ||
|
|
||
| private String getInitialCommitInstantTime(HoodieTableMetaClient dataMetaClient) { | ||
| // Picks the instant time for the initial commit: SOLO_COMMIT_TIMESTAMP when the dataset's active timeline has no completed instant yet. | ||
| // Otherwise we use the timestamp of the first completed instant in reverse order (the latest completed action); the chosen time is logged, then returned. | ||
| String createInstantTime = dataMetaClient.getActiveTimeline().filterCompletedInstants() | ||
| .getReverseOrderedInstants().findFirst().map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP); | ||
| LOG.info("Creating a new metadata table in " + metadataWriteConfig.getBasePath() + " at instant " + createInstantTime); | ||
| return createInstantTime; | ||
| } | ||
|
|
||
| private boolean anyPendingDataInstant(HoodieTableMetaClient dataMetaClient, Option<String> inflightInstantTimestamp) { | ||
| ValidationUtils.checkState(enabled, "Metadata table cannot be initialized as it is not enabled"); | ||
|
|
||
| // We can only initialize if there are no pending operations on the dataset: collect every non-completed instant on the active timeline, skipping the one whose timestamp equals inflightInstantTimestamp (when present). If any remain, the bootstrap-error metric is updated, a warning is logged and true is returned; otherwise false. | ||
| List<HoodieInstant> pendingDataInstant = dataMetaClient.getActiveTimeline() | ||
| .getInstants().filter(i -> !i.isCompleted()) | ||
| .filter(i -> !inflightInstantTimestamp.isPresent() || !i.getTimestamp().equals(inflightInstantTimestamp.get())) | ||
| .collect(Collectors.toList()); | ||
|
|
||
| if (!pendingDataInstant.isEmpty()) { | ||
| metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BOOTSTRAP_ERR_STR, 1)); | ||
| LOG.warn("Cannot initialize metadata table as operation(s) are in progress on the dataset: " | ||
| + Arrays.toString(pendingDataInstant.toArray())); | ||
| return true; | ||
| } | ||
| return false; | ||
| } | ||
|
|
||
| private void updateInitializedPartitionsInTableConfig(List<MetadataPartitionType> partitionTypes) { | ||
| Set<String> completedPartitions = getCompletedMetadataPartitions(dataMetaClient.getTableConfig()); | ||
| completedPartitions.addAll(partitionTypes.stream().map(MetadataPartitionType::getPartitionPath).collect(Collectors.toSet())); | ||
|
|
@@ -973,8 +1000,12 @@ protected void cleanIfNecessary(BaseHoodieWriteClient writeClient, String instan | |
| } | ||
|
|
||
| /** | ||
| * This is invoked to initialize metadata table for a dataset. Bootstrap Commit has special handling mechanism due to its scale compared to | ||
| * other regular commits. | ||
| * This is invoked to initialize metadata table for a dataset. | ||
| * Initial commit has special handling mechanism due to its scale compared to other regular commits. | ||
| * During cold startup, the list of files to be committed can be huge. | ||
| * So creating a HoodieCommitMetadata out of these large number of files, | ||
| * and calling the existing update(HoodieCommitMetadata) function does not scale well. | ||
| * Hence, we have a special commit just for the initialization scenario. | ||
| */ | ||
| private void initialCommit(String createInstantTime, List<MetadataPartitionType> partitionTypes) { | ||
| // List all partitions in the basePath of the containing dataset | ||
|
|
@@ -992,18 +1023,17 @@ private void initialCommit(String createInstantTime, List<MetadataPartitionType> | |
| }).collect(Collectors.toMap(Pair::getKey, Pair::getValue)); | ||
| final Map<MetadataPartitionType, HoodieData<HoodieRecord>> partitionToRecordsMap = new HashMap<>(); | ||
|
|
||
| // Record which saves the list of all partitions | ||
| HoodieRecord allPartitionRecord = HoodieMetadataPayload.createPartitionListRecord(partitions); | ||
| if (partitions.isEmpty()) { | ||
| // in case of initializing of a fresh table, there won't be any partitions, but we need to make a bootstrap commit | ||
| final HoodieData<HoodieRecord> allPartitionRecordsRDD = engineContext.parallelize( | ||
| Collections.singletonList(allPartitionRecord), 1); | ||
| partitionToRecordsMap.put(MetadataPartitionType.FILES, allPartitionRecordsRDD); | ||
| commit(createInstantTime, partitionToRecordsMap, false); | ||
| return; | ||
| } | ||
|
|
||
| if (partitionTypes.contains(MetadataPartitionType.FILES)) { | ||
| // Record which saves the list of all partitions | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. May I know why we need this `if` condition? Can you help clarify?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This just saves some duplicate effort. There was no correctness issue in the absence of this `if` condition. For example, when column stats (colstats) is re-enabled, we reuse this method, and we don't really need to redo the files partition; hence this `if` condition. In the absence of this condition, |
||
| HoodieRecord allPartitionRecord = HoodieMetadataPayload.createPartitionListRecord(partitions); | ||
| if (partitions.isEmpty()) { | ||
| // in case of initializing of a fresh table, there won't be any partitions, but we need to make a bootstrap commit | ||
| final HoodieData<HoodieRecord> allPartitionRecordsRDD = engineContext.parallelize( | ||
| Collections.singletonList(allPartitionRecord), 1); | ||
| partitionToRecordsMap.put(MetadataPartitionType.FILES, allPartitionRecordsRDD); | ||
| commit(createInstantTime, partitionToRecordsMap, false); | ||
| return; | ||
| } | ||
| HoodieData<HoodieRecord> filesPartitionRecords = getFilesPartitionRecords(createInstantTime, partitionInfoList, allPartitionRecord); | ||
| ValidationUtils.checkState(filesPartitionRecords.count() == (partitions.size() + 1)); | ||
| partitionToRecordsMap.put(MetadataPartitionType.FILES, filesPartitionRecords); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.