diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java index 977575e042cd..8fb119a2a056 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java @@ -31,6 +31,7 @@ import org.apache.hudi.common.table.timeline.TimelineFactory; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.util.NumericUtils; + import org.apache.hudi.storage.HoodieStorage; import org.apache.hudi.storage.StoragePath; import org.apache.hudi.storage.StoragePathInfo; @@ -270,7 +271,7 @@ private HoodieTableFileSystemView buildFileSystemView(String globRegex, String m } instantsStream = instantsStream.filter(is -> predicate.test(maxInstant, is.requestedTime())); } - TimelineFactory timelineFactory = metaClient.getTimelineLayout().getTimelineFactory(); + TimelineFactory timelineFactory = metaClient.getTableFormat().getTimelineFactory(); HoodieTimeline filteredTimeline = timelineFactory.createDefaultTimeline(instantsStream, metaClient.getActiveTimeline()); return new HoodieTableFileSystemView(metaClient, filteredTimeline, pathInfoList); } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java index b75e5b9875d8..868ea40eb764 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java @@ -231,7 +231,7 @@ public String migratePartitionMeta( HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(HoodieCLI.conf); HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); List partitionPaths = - FSUtils.getAllPartitionPaths(engineContext, client.getStorage(), client.getBasePath(), false); + FSUtils.getAllPartitionPaths(engineContext, client, false); StoragePath basePath = client.getBasePath(); String[][] rows = new String[partitionPaths.size()][]; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieTableServiceClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieTableServiceClient.java index df8c574cdff3..cd06a875d1a6 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieTableServiceClient.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieTableServiceClient.java @@ -402,7 +402,7 @@ public void commitLogCompaction(String compactionInstantTime, HoodieWriteMetadat HoodieTable table = tableOpt.orElseGet(() -> createTable(config, context.getStorageConf())); completeLogCompaction(writeMetadata.getCommitMetadata().get(), table, compactionInstantTime, tableWriteStats.getMetadataTableWriteStats()); } - + /** * Schedules a new log compaction instant. 
 *
@@ -575,7 +575,8 @@ private void completeClustering(HoodieReplaceCommitMetadat
       LOG.info("Committing Clustering {} for table {}", clusteringCommitTime, table.getConfig().getBasePath());
-      ClusteringUtils.transitionClusteringOrReplaceInflightToComplete(false, clusteringInstant, replaceCommitMetadata, table.getActiveTimeline());
+      ClusteringUtils.transitionClusteringOrReplaceInflightToComplete(false, clusteringInstant, replaceCommitMetadata, table.getActiveTimeline(),
+          completedInstant -> table.getMetaClient().getTableFormat().commit(replaceCommitMetadata, completedInstant, table.getContext(), table.getMetaClient(), table.getViewManager()));
       LOG.debug("Clustering {} finished with result {}", clusteringCommitTime, replaceCommitMetadata);
     } catch (Exception e) {
       throw new HoodieClusteringException("unable to transition clustering inflight to complete: " + clusteringCommitTime, e);
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java
index 3cad79476da8..6496c89cdf9c 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java
@@ -316,7 +316,10 @@ protected void commit(HoodieTable table,
     }
     // update Metadata table
     writeToMetadataTable(skipStreamingWritesToMetadataTable, table, instantTime, tableWriteStats.getMetadataTableWriteStats(), metadata);
-    activeTimeline.saveAsComplete(false, table.getMetaClient().createNewInstant(HoodieInstant.State.INFLIGHT, commitActionType, instantTime), Option.of(metadata));
+    activeTimeline.saveAsComplete(false,
+        table.getMetaClient().createNewInstant(HoodieInstant.State.INFLIGHT, commitActionType, instantTime), Option.of(metadata),
+        completedInstant -> table.getMetaClient().getTableFormat().commit(metadata, completedInstant, getEngineContext(), table.getMetaClient(), table.getViewManager())
+    );
     // update cols to Index as applicable
     HoodieColumnStatsIndexUtils.updateColsToIndex(table, config, metadata, commitActionType,
         (Functions.Function2<HoodieTableMetaClient, List<String>, Void>) (metaClient, columnsToIndex) -> {
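This callback shape repeats across the PR: clustering, commit, rollback, savepoint, and clean transitions all gain a hook that fires only after the instant is completed on the Hudi timeline. A toy model of that ordering contract is sketched below; the class and method names are illustrative stand-ins, not Hudi APIs.

```java
import java.util.function.Consumer;

// Toy model of the ordering introduced above: the table-format hook runs strictly
// after the instant transitions to COMPLETED on the Hudi timeline, so an external
// format can never record a commit that the Hudi timeline does not have.
public class CompletionHookDemo {

  // Stand-in for HoodieActiveTimeline#saveAsComplete with the new callback argument.
  static void saveAsComplete(String inflightInstant, Consumer<String> tableFormatHook) {
    String completedInstant = inflightInstant + "_completed"; // 1. complete on the Hudi timeline
    tableFormatHook.accept(completedInstant);                 // 2. then notify the table format
  }

  public static void main(String[] args) {
    saveAsComplete("20240101120000", completed ->
        System.out.println("table format records " + completed));
  }
}
```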
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/timeline/versioning/v1/TimelineArchiverV1.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/timeline/versioning/v1/TimelineArchiverV1.java
index 8da69585b044..b8c94ab2804a 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/timeline/versioning/v1/TimelineArchiverV1.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/timeline/versioning/v1/TimelineArchiverV1.java
@@ -297,7 +297,7 @@ private List<HoodieInstant> getInstantsToArchive() throws IOException {
       // If metadata table is enabled, do not archive instants which are more recent than the last compaction on the
       // metadata table.
       if (config.isMetadataTableEnabled() && table.getMetaClient().getTableConfig().isMetadataTableAvailable()) {
-        try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(table.getContext(), table.getStorage(), config.getMetadataConfig(), config.getBasePath())) {
+        try (HoodieTableMetadata tableMetadata = table.refreshAndGetTableMetadata()) {
           Option<String> latestCompactionTime = tableMetadata.getLatestCompactionTime();
           if (!latestCompactionTime.isPresent()) {
             LOG.info("Not archiving as there is no compaction yet on the metadata table");
@@ -397,10 +397,10 @@ private boolean deleteArchivedInstants(List<HoodieInstant> archivedInstants, Hoo
       // Therefore, the concurrency of deleting completed instants is temporarily disabled,
       // and instants are deleted in ascending order to prevent the occurrence of such holes.
       // See HUDI-7207 and #10325.
-      completedInstants.stream()
-          .forEach(instant -> activeTimeline.deleteInstantFileIfExists(instant));
+      completedInstants.stream().forEach(activeTimeline::deleteInstantFileIfExists);
     }
-
+    // Call the table format's archive hook so the format can archive these instants on its side as well.
+    table.getMetaClient().getTableFormat().archive(() -> archivedInstants, table.getContext(), table.getMetaClient(), table.getViewManager());
     return true;
   }
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/timeline/versioning/v2/TimelineArchiverV2.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/timeline/versioning/v2/TimelineArchiverV2.java
index e18617232e51..9a39a45ead6d 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/timeline/versioning/v2/TimelineArchiverV2.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/timeline/versioning/v2/TimelineArchiverV2.java
@@ -56,6 +56,7 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.function.Consumer;
+import java.util.function.Supplier;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
@@ -117,6 +118,11 @@ public int archiveIfRequired(HoodieEngineContext context, boolean acquireLock) t
       deleteArchivedActions(instantsToArchive, context);
       // triggers compaction and cleaning only after archiving action
       this.timelineWriter.compactAndClean(context);
+      Supplier<List<HoodieInstant>> archivedInstants = () -> instantsToArchive.stream()
+          .flatMap(action -> Stream.concat(action.getCompletedInstants().stream(), action.getPendingInstants().stream()))
+          .collect(Collectors.toList());
+      // Call the table format's archive hook so the format can archive these instants on its side as well.
+      table.getMetaClient().getTableFormat().archive(archivedInstants, table.getContext(), table.getMetaClient(), table.getViewManager());
     } else {
       LOG.info("No Instants to archive");
     }
@@ -209,8 +215,7 @@ private List<HoodieInstant> getCommitInstantsToArchive() throws IOException {
       // 4. If metadata table is enabled, do not archive instants which are more recent than the last compaction on the
       // metadata table.
if (config.isMetadataTableEnabled() && table.getMetaClient().getTableConfig().isMetadataTableAvailable()) { - try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create( - table.getContext(), table.getStorage(), config.getMetadataConfig(), config.getBasePath())) { + try (HoodieTableMetadata tableMetadata = table.refreshAndGetTableMetadata()) { Option latestCompactionTime = tableMetadata.getLatestCompactionTime(); if (!latestCompactionTime.isPresent()) { LOG.info("Not archiving as there is no compaction yet on the metadata table"); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java index 25ca3984bdea..f94788197ad7 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java @@ -57,7 +57,7 @@ public HoodieGlobalBloomIndex(HoodieWriteConfig config, BaseHoodieBloomIndexHelp List> loadColumnRangesFromFiles(List partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) { HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); - List allPartitionPaths = FSUtils.getAllPartitionPaths(context, metaClient.getStorage(), config.getMetadataConfig(), metaClient.getBasePath()); + List allPartitionPaths = FSUtils.getAllPartitionPaths(context, metaClient, config.getMetadataConfig()); return super.loadColumnRangesFromFiles(allPartitionPaths, context, hoodieTable); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieGlobalSimpleIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieGlobalSimpleIndex.java index 4abc3cb1d87b..336f6376bcd0 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieGlobalSimpleIndex.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieGlobalSimpleIndex.java @@ -94,7 +94,7 @@ private HoodiePairData fetchRecordGlobalLoca private List> getAllBaseFilesInTable( final HoodieEngineContext context, final HoodieTable hoodieTable) { HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); - List allPartitionPaths = FSUtils.getAllPartitionPaths(context, metaClient.getStorage(), config.getMetadataConfig(), metaClient.getBasePath()); + List allPartitionPaths = FSUtils.getAllPartitionPaths(context, metaClient, config.getMetadataConfig()); // Obtain the latest data files from all the partitions. 
return getLatestBaseFilesForAllPartitions(allPartitionPaths, context, hoodieTable); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java index 70a3e8e34342..667e2a183a77 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java @@ -183,7 +183,7 @@ public HoodieTableVersion version() { protected abstract HoodieIndex getIndex(HoodieWriteConfig config, HoodieEngineContext context); - private synchronized FileSystemViewManager getViewManager() { + public synchronized FileSystemViewManager getViewManager() { if (null == viewManager) { viewManager = FileSystemViewManager.createViewManager(getContext(), config.getMetadataConfig(), config.getViewStorageConfig(), config.getCommonConfig(), unused -> getMetadataTable()); } @@ -1166,14 +1166,18 @@ private void clearMetadataTablePartitionsConfig(Option pa public HoodieTableMetadata getMetadataTable() { if (metadata == null) { - HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder() - .fromProperties(config.getMetadataConfig().getProps()) - .build(); - metadata = HoodieTableMetadata.create(context, metaClient.getStorage(), metadataConfig, config.getBasePath()); + metadata = refreshAndGetTableMetadata(); } return metadata; } + public HoodieTableMetadata refreshAndGetTableMetadata() { + HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder() + .fromProperties(config.getMetadataConfig().getProps()) + .build(); + return metaClient.getTableFormat().getMetadataFactory().create(context, metaClient.getStorage(), metadataConfig, config.getBasePath()); + } + /** * When {@link HoodieTableConfig#POPULATE_META_FIELDS} is enabled, * we need to track written records within WriteStatus in two cases: diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseTableServicePlanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseTableServicePlanActionExecutor.java index ad4edb2433de..f4ab580d6080 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseTableServicePlanActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseTableServicePlanActionExecutor.java @@ -102,8 +102,7 @@ public List getPartitions(Object strategy, TableServiceType type) { // get all partitions LOG.info("Start to fetch all partitions for " + type + ". 
Instant " + instantTime); - return FSUtils.getAllPartitionPaths(context, table.getMetaClient().getStorage(), - config.getMetadataConfig(), table.getMetaClient().getBasePath()); + return FSUtils.getAllPartitionPaths(context, table.getMetaClient(), config.getMetadataConfig()); } public Pair, Set> getIncrementalPartitions(TableServiceType type) { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java index 549afc7d90d6..d4e6eca9077d 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java @@ -234,7 +234,8 @@ protected void commit(HoodieWriteMetadata result, List write // cannot serialize maps with null values metadata.getExtraMetadata().entrySet().removeIf(entry -> entry.getValue() == null); activeTimeline.saveAsComplete(false, - table.getMetaClient().createNewInstant(State.INFLIGHT, actionType, instantTime), Option.of(metadata)); + table.getMetaClient().createNewInstant(State.INFLIGHT, actionType, instantTime), Option.of(metadata), + completedInstant -> table.getMetaClient().getTableFormat().commit(metadata, completedInstant, table.getContext(), table.getMetaClient(), table.getViewManager())); LOG.info("Committed " + instantTime); result.setCommitMetadata(Option.of(metadata)); // update cols to Index as applicable diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/plan/generators/BaseHoodieCompactionPlanGenerator.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/plan/generators/BaseHoodieCompactionPlanGenerator.java index 09638bebca36..53c55ba0bc8b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/plan/generators/BaseHoodieCompactionPlanGenerator.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/plan/generators/BaseHoodieCompactionPlanGenerator.java @@ -84,7 +84,7 @@ public HoodieCompactionPlan generateCompactionPlan(String compactionInstant) thr // TODO : check if maxMemory is not greater than JVM or executor memory // TODO - rollback any compactions in flight HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); - CompletionTimeQueryView completionTimeQueryView = metaClient.getTimelineLayout().getTimelineFactory().createCompletionTimeQueryView(metaClient); + CompletionTimeQueryView completionTimeQueryView = metaClient.getTableFormat().getTimelineFactory().createCompletionTimeQueryView(metaClient); List partitionPaths = getPartitions(); int allPartitionSize = partitionPaths.size(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java index 2c17de655bc1..246a7afd346b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java @@ -288,7 +288,8 @@ protected void finishRollback(HoodieInstant inflightInstant, HoodieRollbackMetad // NOTE: no need to lock here, since !skipTimelinePublish 
is always true, // when skipLocking is false, txnManager above-mentioned should lock it. // when skipLocking is true, the caller should have already held the lock. - table.getActiveTimeline().transitionRollbackInflightToComplete(false, inflightInstant, rollbackMetadata); + table.getActiveTimeline().transitionRollbackInflightToComplete(false, inflightInstant, rollbackMetadata, + completedInstant -> table.getMetaClient().getTableFormat().completedRollback(completedInstant, table.getContext(), table.getMetaClient(), table.getViewManager())); LOG.info("Rollback of Commits " + rollbackMetadata.getCommitsRollback() + " is complete"); } } finally { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/CopyOnWriteRollbackActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/CopyOnWriteRollbackActionExecutor.java index 7abb3e7a931a..bd619c85616c 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/CopyOnWriteRollbackActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/CopyOnWriteRollbackActionExecutor.java @@ -20,10 +20,12 @@ import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.common.HoodieRollbackStat; +import org.apache.hudi.common.NativeTableFormat; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; @@ -67,11 +69,23 @@ protected List executeRollback(HoodieRollbackPlan hoodieRoll if (instantToRollback.isCompleted()) { LOG.info("Unpublishing instant " + instantToRollback); + table.getMetaClient().getTableFormat().rollback(instantToRollback, table.getContext(), table.getMetaClient(), table.getViewManager()); + // Revert the completed instant to inflight in native format. resolvedInstant = activeTimeline.revertToInflight(instantToRollback); // reload meta-client to reflect latest timeline status table.getMetaClient().reloadActiveTimeline(); } + // If instant is inflight but marked as completed in native format, delete the completed instant from storage. 
+ if (instantToRollback.isInflight() && !table.getMetaClient().getTableFormat().getName().equals(NativeTableFormat.TABLE_FORMAT)) { + HoodieActiveTimeline activeTimelineForNativeFormat = table.getMetaClient().getActiveTimelineForNativeFormat(); + Option instantToRollbackInNativeFormat = activeTimelineForNativeFormat.filter(instant -> instant.requestedTime().equals(instantToRollback.requestedTime())).lastInstant(); + if (instantToRollbackInNativeFormat.isPresent() && instantToRollbackInNativeFormat.get().isCompleted()) { + resolvedInstant = activeTimelineForNativeFormat.revertToInflight(instantToRollbackInNativeFormat.get()); + table.getMetaClient().reloadActiveTimeline(); + } + } + // For Requested State (like failure during index lookup), there is nothing to do rollback other than // deleting the timeline file if (!resolvedInstant.isRequested()) { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackStrategy.java index d3c1f35b7d8c..28905f75f75d 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackStrategy.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackStrategy.java @@ -101,7 +101,7 @@ public List getRollbackRequests(HoodieInstant instantToRo HoodieTableMetaClient metaClient = table.getMetaClient(); boolean isTableVersionLessThanEight = metaClient.getTableConfig().getTableVersion().lesserThan(HoodieTableVersion.EIGHT); List partitionPaths = - FSUtils.getAllPartitionPaths(context, table.getStorage(), table.getMetaClient().getBasePath(), false); + FSUtils.getAllPartitionPaths(context, table.getMetaClient(), false); int numPartitions = Math.max(Math.min(partitionPaths.size(), config.getRollbackParallelism()), 1); context.setJobStatus(this.getClass().getSimpleName(), "Creating Listing Rollback Plan: " + config.getTableName()); @@ -285,7 +285,7 @@ private List listAllFilesSinceCommit(String commit, String partitionPath, HoodieTableMetaClient metaClient) throws IOException { LOG.info("Collecting files to be cleaned/rolledback up for path " + partitionPath + " and commit " + commit); - CompletionTimeQueryView completionTimeQueryView = metaClient.getTimelineLayout().getTimelineFactory().createCompletionTimeQueryView(metaClient); + CompletionTimeQueryView completionTimeQueryView = metaClient.getTableFormat().getTimelineFactory().createCompletionTimeQueryView(metaClient); StoragePathFilter filter = (path) -> { if (path.toString().contains(baseFileExtension)) { String fileCommitTime = FSUtils.getCommitTime(path.getName()); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/savepoint/SavepointActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/savepoint/SavepointActionExecutor.java index ca4def781bb6..56ce313ef8df 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/savepoint/SavepointActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/savepoint/SavepointActionExecutor.java @@ -122,8 +122,7 @@ public HoodieSavepointMetadata execute() { return latestFiles; })); } else { - List partitions = FSUtils.getAllPartitionPaths( - context, table.getStorage(), config.getMetadataConfig(), table.getMetaClient().getBasePath()); + List partitions = 
FSUtils.getAllPartitionPaths(context, table.getMetaClient(), config.getMetadataConfig()); latestFilesMap = context.mapToPair(partitions, partitionPath -> { // Scan all partitions files with this commit time LOG.info("Collecting latest files in partition path " + partitionPath); @@ -143,8 +142,9 @@ public HoodieSavepointMetadata execute() { table.getActiveTimeline().createNewInstant( instantGenerator.createNewInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.SAVEPOINT_ACTION, instantTime)); table.getActiveTimeline() - .saveAsComplete(instantGenerator.createNewInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.SAVEPOINT_ACTION, instantTime), - Option.of(metadata)); + .saveAsComplete( + true, instantGenerator.createNewInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.SAVEPOINT_ACTION, instantTime), Option.of(metadata), + savepointCompletedInstant -> table.getMetaClient().getTableFormat().savepoint(savepointCompletedInstant, table.getContext(), table.getMetaClient(), table.getViewManager())); LOG.info("Savepoint " + instantTime + " created"); return metadata; } catch (HoodieIOException e) { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/PartitionTTLStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/PartitionTTLStrategy.java index dadbf06b7c9f..9d6275d5f257 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/PartitionTTLStrategy.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/PartitionTTLStrategy.java @@ -66,8 +66,7 @@ protected List getPartitionPathsForTTL() { List partitionsForTTL; if (StringUtils.isNullOrEmpty(partitionSelected)) { // Return all partition paths. - partitionsForTTL = FSUtils.getAllPartitionPaths( - hoodieTable.getContext(), hoodieTable.getStorage(), writeConfig.getMetadataConfig(), writeConfig.getBasePath()); + partitionsForTTL = FSUtils.getAllPartitionPaths(hoodieTable.getContext(), hoodieTable.getMetaClient(), writeConfig.getMetadataConfig()); } else { partitionsForTTL = Arrays.asList(partitionSelected.split(",")); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/EightToSevenDowngradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/EightToSevenDowngradeHandler.java index 04cd07162be3..4bbfef242e02 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/EightToSevenDowngradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/EightToSevenDowngradeHandler.java @@ -342,12 +342,14 @@ static void downgradeMetadataPartitions(HoodieEngineContext context, // Get base path for metadata table. StoragePath metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(metaClient.getBasePath()); + HoodieTableMetaClient metadataMetaClient = + HoodieTableMetaClient.builder() + .setBasePath(metadataTableBasePath.toUri().toString()) + .setConf(hoodieStorage.getConf()) + .build(); // Fetch metadata partition paths. - List metadataPartitions = FSUtils.getAllPartitionPaths(context, - hoodieStorage, - metadataTableBasePath, - false); + List metadataPartitions = FSUtils.getAllPartitionPaths(context, metadataMetaClient, false); // Delete partitions. 
List validPartitionPaths = deleteMetadataPartition(context, metaClient, metadataPartitions); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngradeUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngradeUtils.java index 7d8ee567d0e3..3d39a47809b2 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngradeUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngradeUtils.java @@ -107,7 +107,7 @@ public static void runCompaction(HoodieTable table, HoodieEngineContext context, */ public static void syncCompactionRequestedFileToAuxiliaryFolder(HoodieTable table) { HoodieTableMetaClient metaClient = table.getMetaClient(); - TimelineFactory timelineFactory = metaClient.getTimelineLayout().getTimelineFactory(); + TimelineFactory timelineFactory = metaClient.getTableFormat().getTimelineFactory(); InstantFileNameGenerator instantFileNameGenerator = metaClient.getInstantFileNameGenerator(); HoodieTimeline compactionTimeline = timelineFactory.createActiveTimeline(metaClient, false).filterPendingCompactionTimeline() .filter(instant -> instant.getState() == HoodieInstant.State.REQUESTED); diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/WriteClientTestUtils.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/WriteClientTestUtils.java index c3dfc2ea1d41..376da22edc98 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/WriteClientTestUtils.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/WriteClientTestUtils.java @@ -58,7 +58,6 @@ public long generateTime(boolean skipLocking) { @Override public void consumeTime(boolean skipLocking, Consumer func) { - } } } diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/timeline/TestCompletionTimeQueryView.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/timeline/TestCompletionTimeQueryView.java index 9f290cf44979..944ae5678df8 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/timeline/TestCompletionTimeQueryView.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/timeline/TestCompletionTimeQueryView.java @@ -77,7 +77,7 @@ void testReadCompletionTime() throws Exception { HoodieTestUtils.getDefaultStorageConf(), tablePath, HoodieTableType.COPY_ON_WRITE, tableName); prepareTimeline(tablePath, metaClient); try (CompletionTimeQueryView view = - metaClient.getTimelineLayout().getTimelineFactory().createCompletionTimeQueryView(metaClient, String.format("%08d", 3))) { + metaClient.getTableFormat().getTimelineFactory().createCompletionTimeQueryView(metaClient, String.format("%08d", 3))) { // query completion time from LSM timeline for (int i = 3; i < 7; i++) { assertThat(view.getCompletionTime(String.format("%08d", i)).orElse(""), is(String.format("%08d", i + 1000))); @@ -108,7 +108,7 @@ void testReadStartTime() throws Exception { HoodieTestUtils.getDefaultStorageConf(), tablePath, HoodieTableType.COPY_ON_WRITE, tableName); prepareTimeline(tablePath, metaClient); try (CompletionTimeQueryView view = - metaClient.getTimelineLayout().getTimelineFactory().createCompletionTimeQueryView(metaClient, String.format("%08d", 3))) { + metaClient.getTableFormat().getTimelineFactory().createCompletionTimeQueryView(metaClient, String.format("%08d", 3))) { // query start time from 
LSM timeline assertThat(getInstantTimeSetFormattedString(view, 3 + 1000, 6 + 1000), is("00000003,00000004,00000005,00000006")); // query start time from active timeline diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/upgrade/TestEightToSevenDowngradeHandler.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/upgrade/TestEightToSevenDowngradeHandler.java index 1752e0e06b8c..7e0c85795afc 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/upgrade/TestEightToSevenDowngradeHandler.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/upgrade/TestEightToSevenDowngradeHandler.java @@ -134,16 +134,34 @@ void testDowngradeMetadataPartitions() { String baseTablePath = baseDir.toString(); HoodieStorage hoodieStorage = HoodieStorageUtils.getStorage(getDefaultStorageConf()); StoragePath basePath = new StoragePath(baseTablePath); + + HoodieTableConfig tableConfig = mock(HoodieTableConfig.class); when(metaClient.getBasePath()).thenReturn(basePath); + when(metaClient.getTableConfig()).thenReturn(tableConfig); + when(metaClient.getStorage()).thenReturn(hoodieStorage); Map tablePropsToAdd = new HashMap<>(); try (MockedStatic mockedFSUtils = mockStatic(FSUtils.class); - MockedStatic mockedMetadataUtils = mockStatic(HoodieTableMetadataUtil.class)) { + MockedStatic mockedStaticMetaClient = mockStatic(HoodieTableMetaClient.class)) { StoragePath mdtBasePath = HoodieTableMetadata.getMetadataTableBasePath(metaClient.getBasePath()); + + // Mock FSUtils.getAllPartitionPaths to return SAMPLE_METADATA_PATHS mockedFSUtils - .when(() -> FSUtils.getAllPartitionPaths(context, hoodieStorage, mdtBasePath, false)) + .when(() -> FSUtils.getAllPartitionPaths(context, metaClient, false)) .thenReturn(SAMPLE_METADATA_PATHS); + // Mock HoodieTableMetaClient.builder() to return a builder that returns a mock metaClient + HoodieTableMetaClient.Builder mockBuilder = mock(HoodieTableMetaClient.Builder.class); + when(mockBuilder.setBasePath(mdtBasePath.toUri().toString())).thenReturn(mockBuilder); + when(mockBuilder.setConf(hoodieStorage.getConf())).thenReturn(mockBuilder); + when(mockBuilder.build()).thenReturn(metaClient); + mockedStaticMetaClient.when(HoodieTableMetaClient::builder).thenReturn(mockBuilder); + + // Mock FSUtils.isTableExists to return true + mockedFSUtils + .when(() -> FSUtils.isTableExists(mdtBasePath.toString(), hoodieStorage)) + .thenReturn(true); + EightToSevenDowngradeHandler.downgradeMetadataPartitions(context, hoodieStorage, metaClient, tablePropsToAdd); assertTrue(tablePropsToAdd.containsKey(TABLE_METADATA_PARTITIONS)); diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/upgrade/TestSevenToEightUpgradeHandler.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/upgrade/TestSevenToEightUpgradeHandler.java index bcc9d897620f..7a6dea543d8b 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/upgrade/TestSevenToEightUpgradeHandler.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/upgrade/TestSevenToEightUpgradeHandler.java @@ -61,8 +61,6 @@ class TestSevenToEightUpgradeHandler { private HoodieWriteConfig config; @Mock private HoodieTableConfig tableConfig; - @Mock - private SupportsUpgradeDowngrade upgradeDowngradeHelper; private SevenToEightUpgradeHandler upgradeHandler; diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/HoodieWriterClientTestHarness.java 
b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/HoodieWriterClientTestHarness.java index 0f2a23eb5971..3fdad193655d 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/HoodieWriterClientTestHarness.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/HoodieWriterClientTestHarness.java @@ -1230,7 +1230,7 @@ protected void testUpsertsInternal(Function3 table.getMetaClient().getTableFormat().commit(metadata, completedInstant, table.getContext(), table.getMetaClient(), table.getViewManager())); } catch (HoodieIOException e) { throw new HoodieClusteringException( "Failed to commit " + table.getMetaClient().getBasePath() + " at time " + clusteringCommitTime, e); diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/HoodieFlinkWriteClient.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/HoodieFlinkWriteClient.java index 39e481dff109..e33d9c5acb59 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/HoodieFlinkWriteClient.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/HoodieFlinkWriteClient.java @@ -487,7 +487,7 @@ public Map> getPartitionToReplacedFileIds( case INSERT_OVERWRITE_TABLE: Map> partitionToExistingFileIds = new HashMap<>(); List partitionPaths = - FSUtils.getAllPartitionPaths(context, table.getStorage(), config.getMetadataConfig(), table.getMetaClient().getBasePath()); + FSUtils.getAllPartitionPaths(context, table.getMetaClient(), config.getMetadataConfig()); if (partitionPaths != null && partitionPaths.size() > 0) { context.setJobStatus(this.getClass().getSimpleName(), "Getting ExistingFileIds of all partitions: " + config.getTableName()); partitionToExistingFileIds = partitionPaths.stream().parallel() diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertOverwriteTableCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertOverwriteTableCommitActionExecutor.java index f65e8a3d8b3b..b10f6ebfdc7a 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertOverwriteTableCommitActionExecutor.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertOverwriteTableCommitActionExecutor.java @@ -50,8 +50,7 @@ protected List getAllExistingFileIds(String partitionPath) { @Override protected Map> getPartitionToReplacedFileIds(HoodieWriteMetadata> writeResult) { Map> partitionToExistingFileIds = new HashMap<>(); - List partitionPaths = FSUtils.getAllPartitionPaths(context, - table.getStorage(), table.getMetaClient().getBasePath(), config.isMetadataTableEnabled()); + List partitionPaths = FSUtils.getAllPartitionPaths(context, table.getMetaClient(), config.isMetadataTableEnabled()); if (partitionPaths != null && partitionPaths.size() > 0) { partitionToExistingFileIds = context.mapToPair(partitionPaths, diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/TestJavaHoodieBackedMetadata.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/TestJavaHoodieBackedMetadata.java index 5165ca5f3d71..00195ea86cbe 100644 --- a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/TestJavaHoodieBackedMetadata.java +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/TestJavaHoodieBackedMetadata.java @@ -2837,7 +2837,7 @@ private void 
validateMetadata(HoodieJavaWriteClient testClient, Option i // Metadata table has a fixed number of partitions // Cannot use FSUtils.getAllFoldersWithPartitionMetaFile for this as that function filters all directory // in the .hoodie folder. - List metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, storage, getMetadataTableBasePath(basePath), false); + List metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, metadataMetaClient, false); // check if the last instant is restore, then the metadata table should have only the partitions that are not deleted metaClient.reloadActiveTimeline().getReverseOrderedInstants().findFirst().ifPresent(instant -> { if (instant.getAction().equals(HoodieActiveTimeline.RESTORE_ACTION)) { diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/functional/TestHoodieJavaClientOnCopyOnWriteStorageForTestFormat.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/functional/TestHoodieJavaClientOnCopyOnWriteStorageForTestFormat.java new file mode 100644 index 000000000000..4d3c8326c2b9 --- /dev/null +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/functional/TestHoodieJavaClientOnCopyOnWriteStorageForTestFormat.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.functional; + +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.tableformat.TestTableFormat; + +import org.apache.hadoop.fs.FileSystem; +import org.junit.jupiter.api.AfterAll; + +import java.io.IOException; + +public class TestHoodieJavaClientOnCopyOnWriteStorageForTestFormat extends TestHoodieJavaClientOnCopyOnWriteStorage { + @Override + protected void initMetaClient() throws IOException { + if (basePath == null) { + initPath(); + } + storageConf.set(HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key(), "false"); + storageConf.set(HoodieTableConfig.TABLE_FORMAT.key(), "test-format"); + storageConf.set(HoodieMetadataConfig.ENABLE.key(), "false"); + metaClient = HoodieTestUtils.init(storageConf, basePath, HoodieTableType.COPY_ON_WRITE); + } + + @AfterAll + public static void tearDownAll() throws IOException { + TestTableFormat.tearDown(); + FileSystem.closeAll(); + } +} diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/testutils/HoodieJavaClientTestHarness.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/testutils/HoodieJavaClientTestHarness.java index 5595a1aa77bc..e4787f97a43c 100644 --- a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/testutils/HoodieJavaClientTestHarness.java +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/testutils/HoodieJavaClientTestHarness.java @@ -280,7 +280,7 @@ public void syncTableMetadata(HoodieWriteConfig writeConfig) { } protected HoodieTableMetadata metadata(HoodieWriteConfig clientConfig, HoodieEngineContext engineContext) { - return HoodieTableMetadata.create(engineContext, metaClient.getStorage(), clientConfig.getMetadataConfig(), clientConfig.getBasePath()); + return metaClient.getTableFormat().getMetadataFactory().create(engineContext, metaClient.getStorage(), clientConfig.getMetadataConfig(), clientConfig.getBasePath()); } @Override @@ -439,8 +439,7 @@ private void runFullValidation(HoodieWriteConfig writeConfig, // Metadata table has a fixed number of partitions // Cannot use FSUtils.getAllFoldersWithPartitionMetaFile for this as that function filters all directory // in the .hoodie folder. - List metadataTablePartitions = FSUtils.getAllPartitionPaths( - engineContext, storage, HoodieTableMetadata.getMetadataTableBasePath(basePath), false); + List metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, metadataMetaClient, false); // Metadata table should automatically compact and clean // versions are +1 as autoClean / compaction happens end of commits diff --git a/hudi-client/hudi-java-client/src/test/resources/META-INF/services/org.apache.hudi.common.HoodieTableFormat b/hudi-client/hudi-java-client/src/test/resources/META-INF/services/org.apache.hudi.common.HoodieTableFormat new file mode 100644 index 000000000000..361558b871c1 --- /dev/null +++ b/hudi-client/hudi-java-client/src/test/resources/META-INF/services/org.apache.hudi.common.HoodieTableFormat @@ -0,0 +1,18 @@ +########################################################################## +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +########################################################################## +org.apache.hudi.tableformat.TestTableFormat diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwriteTableCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwriteTableCommitActionExecutor.java index 776ec1dbf1ec..d300ea683a90 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwriteTableCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwriteTableCommitActionExecutor.java @@ -45,8 +45,7 @@ public SparkInsertOverwriteTableCommitActionExecutor(HoodieEngineContext context @Override protected Map> getPartitionToReplacedFileIds(HoodieWriteMetadata> writeMetadata) { - List partitionPaths = FSUtils.getAllPartitionPaths( - context, table.getStorage(), config.getMetadataConfig(), table.getMetaClient().getBasePath()); + List partitionPaths = FSUtils.getAllPartitionPaths(context, table.getMetaClient(), config.getMetadataConfig()); if (partitionPaths == null || partitionPaths.isEmpty()) { return Collections.emptyMap(); } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestClientRollback.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestClientRollback.java index 0ac5d652eb2e..375f3eb592ff 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestClientRollback.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestClientRollback.java @@ -139,9 +139,9 @@ public void testSavepointAndRollback(Boolean testFailedRestore, Boolean failedRe client.commit(newCommitTime, jsc.parallelize(statusList), Option.empty(), COMMIT_ACTION, Collections.emptyMap(), Option.empty()); HoodieWriteConfig config = getConfig(); - List partitionPaths = - FSUtils.getAllPartitionPaths(context, storage, config.getMetadataConfig(), cfg.getBasePath()); metaClient = HoodieTableMetaClient.reload(metaClient); + List partitionPaths = + FSUtils.getAllPartitionPaths(context, metaClient, config.getMetadataConfig()); HoodieSparkTable table = HoodieSparkTable.create(getConfig(), context, metaClient); final BaseFileOnlyView view1 = table.getBaseFileOnlyView(); @@ -322,9 +322,9 @@ public void testSavepointAndRollbackWithKeepLatestFileVersionPolicy() throws Exc assertNoWriteErrors(statusList); HoodieWriteConfig config = getConfig(); - List partitionPaths = - FSUtils.getAllPartitionPaths(context, storage, config.getMetadataConfig(), cfg.getBasePath()); metaClient = HoodieTableMetaClient.reload(metaClient); + List partitionPaths = + FSUtils.getAllPartitionPaths(context, metaClient, config.getMetadataConfig()); HoodieSparkTable table = HoodieSparkTable.create(getConfig(), context, metaClient); final BaseFileOnlyView view1 = 
table.getBaseFileOnlyView(); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestExternalPathHandling.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestExternalPathHandling.java index 651ca99d4c1a..9ed49a52960e 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestExternalPathHandling.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestExternalPathHandling.java @@ -216,7 +216,8 @@ private interface FileIdAndNameGenerator { } private void assertFileGroupCorrectness(String instantTime, String partitionPath, String filePath, String fileId, int expectedSize) { - HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(context, metaClient.getStorage(), writeConfig.getMetadataConfig(), metaClient.getBasePath().toString()); + HoodieTableMetadata tableMetadata = metaClient.getTableFormat().getMetadataFactory().create( + context, metaClient.getStorage(), writeConfig.getMetadataConfig(), metaClient.getBasePath().toString()); HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(tableMetadata, metaClient, metaClient.reloadActiveTimeline()); List fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList()); Assertions.assertEquals(expectedSize, fileGroups.size()); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieSparkClientTestHarness.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieSparkClientTestHarness.java index 5278c0f7f418..49af02e05808 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieSparkClientTestHarness.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieSparkClientTestHarness.java @@ -574,7 +574,7 @@ public HoodieBackedTableMetadataWriter metadataWriter(HoodieWriteConfig clientCo public HoodieTableMetadata metadata(HoodieWriteConfig clientConfig, HoodieEngineContext hoodieEngineContext) { - return HoodieTableMetadata.create( + return metaClient.getTableFormat().getMetadataFactory().create( hoodieEngineContext, storage, clientConfig.getMetadataConfig(), clientConfig.getBasePath()); } @@ -651,8 +651,7 @@ private void runFullValidation(HoodieMetadataConfig metadataConfig, // Metadata table has a fixed number of partitions // Cannot use FSUtils.getAllFoldersWithPartitionMetaFile for this as that function filters all directory // in the .hoodie folder. 
- List metadataTablePartitions = FSUtils.getAllPartitionPaths( - engineContext, storage, HoodieTableMetadata.getMetadataTableBasePath(basePath), false); + List metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, metadataMetaClient, false); // Metadata table should automatically compact and clean // versions are +1 as autoClean / compaction happens end of commits diff --git a/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java b/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java index 9d6aa0316c55..9919e0bde3cb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java +++ b/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java @@ -48,6 +48,7 @@ import org.apache.hudi.internal.schema.Types; import org.apache.hudi.metadata.HoodieTableMetadata; import org.apache.hudi.metadata.HoodieTableMetadataUtil; +import org.apache.hudi.metadata.TableMetadataFactory; import org.apache.hudi.storage.HoodieStorage; import org.apache.hudi.storage.StoragePath; import org.apache.hudi.storage.StoragePathInfo; @@ -436,7 +437,7 @@ private List listPartitionPathFiles(List partiti private void doRefresh() { HoodieTimer timer = HoodieTimer.start(); - resetTableMetadata(createMetadataTable(engineContext, metaClient.getStorage(), metadataConfig, basePath)); + resetTableMetadata(createMetadataTable(engineContext, metaClient.getStorage(), metaClient.getTableFormat().getMetadataFactory(), metadataConfig, basePath)); // Make sure we reload active timeline metaClient.reloadActiveTimeline(); @@ -535,10 +536,11 @@ private void resetTableMetadata(HoodieTableMetadata newTableMetadata) { private static HoodieTableMetadata createMetadataTable( HoodieEngineContext engineContext, HoodieStorage storage, + TableMetadataFactory metadataFactory, HoodieMetadataConfig metadataConfig, StoragePath basePath ) { - HoodieTableMetadata newTableMetadata = HoodieTableMetadata.create( + HoodieTableMetadata newTableMetadata = metadataFactory.create( engineContext, storage, metadataConfig, basePath.toString(), true); return newTableMetadata; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/HoodieTableFormat.java b/hudi-common/src/main/java/org/apache/hudi/common/HoodieTableFormat.java new file mode 100644 index 000000000000..01b018af9b1b --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/HoodieTableFormat.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hudi.common;
+
+import org.apache.hudi.avro.model.HoodieCleanMetadata;
+import org.apache.hudi.common.engine.HoodieEngineContext;
+import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.table.timeline.TimelineFactory;
+import org.apache.hudi.common.table.view.FileSystemViewManager;
+import org.apache.hudi.metadata.TableMetadataFactory;
+
+import java.io.Serializable;
+import java.util.List;
+import java.util.Properties;
+import java.util.function.Supplier;
+
+/**
+ * Interface that an external table format needs to implement.
+ */
+public interface HoodieTableFormat extends Serializable {
+
+  /**
+   * Returns the name of the table format.
+   */
+  String getName();
+
+  /**
+   * Initializes the table format implementation with the properties supplied from {@link org.apache.hudi.common.table.HoodieTableConfig}.
+   */
+  default void init(Properties properties) {
+  }
+
+  /**
+   * Called just after the write action is marked complete in the hoodie timeline. Implementations are expected to save any
+   * additional state they need in extraMetadata.
+   *
+   * @param commitMetadata   HoodieCommitMetadata for the commit or clustering action.
+   * @param completedInstant completed instant in the hoodie timeline.
+   * @param engineContext    engine context used for execution (local, Spark, Flink, etc.).
+   * @param metaClient       metaClient from HoodieTable.
+   * @param viewManager      viewManager from HoodieTable.
+   */
+  default void commit(
+      HoodieCommitMetadata commitMetadata,
+      HoodieInstant completedInstant,
+      HoodieEngineContext engineContext,
+      HoodieTableMetaClient metaClient,
+      FileSystemViewManager viewManager) {
+  }
+
+  /**
+   * Called after the clean action is marked complete in the hoodie timeline.
+   *
+   * @param cleanMetadata    HoodieCleanMetadata for the clean action.
+   * @param completedInstant completed instant in the hoodie timeline.
+   * @param engineContext    engine context used for execution (local, Spark, Flink, etc.).
+   * @param metaClient       metaClient from HoodieTable.
+   * @param viewManager      viewManager from HoodieTable.
+   */
+  default void clean(
+      HoodieCleanMetadata cleanMetadata,
+      HoodieInstant completedInstant,
+      HoodieEngineContext engineContext,
+      HoodieTableMetaClient metaClient,
+      FileSystemViewManager viewManager) {
+  }
+
+  /**
+   * Called after instants are archived from the hoodie timeline.
+   *
+   * @param archivedInstants supplier of the instants archived from the hoodie timeline.
+   * @param engineContext    engine context used for execution (local, Spark, Flink, etc.).
+   * @param metaClient       metaClient from HoodieTable.
+   * @param viewManager      viewManager from HoodieTable.
+   */
+  default void archive(
+      Supplier<List<HoodieInstant>> archivedInstants,
+      HoodieEngineContext engineContext,
+      HoodieTableMetaClient metaClient,
+      FileSystemViewManager viewManager) {
+  }
+
+  /**
+   * Called before a completed instant is rolled back in the hoodie timeline.
+   *
+   * @param completedInstant the completed instant that is about to be rolled back.
+   * @param engineContext    engine context used for execution (local, Spark, Flink, etc.).
+   * @param metaClient       metaClient from HoodieTable.
+   * @param viewManager      viewManager from HoodieTable.
+   */
+  default void rollback(
+      HoodieInstant completedInstant,
+      HoodieEngineContext engineContext,
+      HoodieTableMetaClient metaClient,
+      FileSystemViewManager viewManager) {
+  }
+
+  /**
+   * Called after a rollback action is marked complete in the hoodie timeline.
+   *
+   * @param rollbackInstant  the completed rollback instant in the hoodie timeline.
+   * @param engineContext    engine context used for execution (local, Spark, Flink, etc.).
+   * @param metaClient       metaClient from HoodieTable.
+   * @param viewManager      viewManager from HoodieTable.
+   */
+  default void completedRollback(
+      HoodieInstant rollbackInstant,
+      HoodieEngineContext engineContext,
+      HoodieTableMetaClient metaClient,
+      FileSystemViewManager viewManager) {
+  }
+
+  /**
+   * Called after a completed write action is marked as a "savepoint" in the hoodie timeline.
+   *
+   * @param savepointInstant the completed instant to be marked as a savepoint.
+   * @param engineContext    engine context used for execution (local, Spark, Flink, etc.).
+   * @param metaClient       metaClient from HoodieTable.
+   * @param viewManager      viewManager from HoodieTable.
+   */
+  default void savepoint(
+      HoodieInstant savepointInstant,
+      HoodieEngineContext engineContext,
+      HoodieTableMetaClient metaClient,
+      FileSystemViewManager viewManager) {
+  }
+
+  /**
+   * Returns the timeline factory for the table format.
+   */
+  TimelineFactory getTimelineFactory();
+
+  /**
+   * Returns the table metadata factory for the table format.
+   */
+  TableMetadataFactory getMetadataFactory();
+}
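To make the contract concrete, here is a minimal sketch of a third-party implementation of this interface. The class name AcmeTableFormat and its package are hypothetical; the interface methods and the TimelineLayout/NativeTableMetadataFactory helpers are the ones this patch introduces. A plugin like this is discovered through the standard Java ServiceLoader mechanism, as the META-INF/services/org.apache.hudi.common.HoodieTableFormat test resource earlier in this diff shows, and selected per table via HoodieTableConfig.TABLE_FORMAT.

```java
// Hypothetical plugin package; not part of this patch.
package com.acme.hudi;

import org.apache.hudi.common.HoodieTableFormat;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.TimelineFactory;
import org.apache.hudi.common.table.timeline.TimelineLayout;
import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
import org.apache.hudi.common.table.view.FileSystemViewManager;
import org.apache.hudi.metadata.NativeTableMetadataFactory;
import org.apache.hudi.metadata.TableMetadataFactory;

import java.util.Properties;

public class AcmeTableFormat implements HoodieTableFormat {

  private Properties props;

  @Override
  public String getName() {
    return "acme"; // the value users would set in HoodieTableConfig.TABLE_FORMAT
  }

  @Override
  public void init(Properties properties) {
    this.props = properties; // table config properties, per the init() contract above
  }

  @Override
  public void commit(HoodieCommitMetadata commitMetadata, HoodieInstant completedInstant,
                     HoodieEngineContext engineContext, HoodieTableMetaClient metaClient,
                     FileSystemViewManager viewManager) {
    // Hypothetical: mirror the completed commit into the external format's own metadata.
  }

  @Override
  public TimelineFactory getTimelineFactory() {
    // Reuse Hudi's native timeline; how a plugin chooses the layout version is not
    // prescribed by this patch, so VERSION_2 here is purely illustrative.
    return TimelineLayout.fromVersion(new TimelineLayoutVersion(TimelineLayoutVersion.VERSION_2))
        .getTimelineFactory();
  }

  @Override
  public TableMetadataFactory getMetadataFactory() {
    return NativeTableMetadataFactory.getInstance();
  }
}
```

Registration is then a single line in META-INF/services/org.apache.hudi.common.HoodieTableFormat containing the fully qualified class name, exactly as this PR's test resource registers org.apache.hudi.tableformat.TestTableFormat.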
+ */ + +package org.apache.hudi.common; + +import org.apache.hudi.common.table.timeline.TimelineFactory; +import org.apache.hudi.common.table.timeline.TimelineLayout; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.metadata.NativeTableMetadataFactory; +import org.apache.hudi.metadata.TableMetadataFactory; + +public class NativeTableFormat implements HoodieTableFormat { + public static final String TABLE_FORMAT = "native"; + private final TimelineLayoutVersion timelineLayoutVersion; + + public NativeTableFormat(TimelineLayoutVersion timelineLayoutVersion) { + this.timelineLayoutVersion = timelineLayoutVersion; + } + + @Override + public String getName() { + return NativeTableFormat.TABLE_FORMAT; + } + + @Override + public TimelineFactory getTimelineFactory() { + return TimelineLayout.fromVersion(timelineLayoutVersion).getTimelineFactory(); + } + + @Override + public TableMetadataFactory getMetadataFactory() { + return NativeTableMetadataFactory.getInstance(); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java index 451da6c179ba..a84d518bfccc 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java @@ -238,13 +238,13 @@ public static void processFiles(HoodieStorage storage, String basePathStr, Funct } public static List getAllPartitionPaths(HoodieEngineContext engineContext, - HoodieStorage storage, - String basePathStr, + HoodieTableMetaClient metaClient, boolean useFileListingFromMetadata) { HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder() .enable(useFileListingFromMetadata) .build(); - try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(engineContext, storage, metadataConfig, basePathStr)) { + try (HoodieTableMetadata tableMetadata = metaClient.getTableFormat().getMetadataFactory() + .create(engineContext, metaClient.getStorage(), metadataConfig, metaClient.getBasePath().toString())) { return tableMetadata.getAllPartitionPaths(); } catch (Exception e) { throw new HoodieException("Error fetching partition paths from metadata table", e); @@ -252,38 +252,22 @@ public static List getAllPartitionPaths(HoodieEngineContext engineContex } public static List getAllPartitionPaths(HoodieEngineContext engineContext, - HoodieStorage storage, - HoodieMetadataConfig metadataConfig, - String basePathStr) { - try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(engineContext, storage, metadataConfig, - basePathStr)) { + HoodieTableMetaClient metaClient, + HoodieMetadataConfig metadataConfig) { + try (HoodieTableMetadata tableMetadata = metaClient.getTableFormat().getMetadataFactory() + .create(engineContext, metaClient.getStorage(), metadataConfig, metaClient.getBasePath().toString())) { return tableMetadata.getAllPartitionPaths(); } catch (Exception e) { throw new HoodieException("Error fetching partition paths from metadata table", e); } } - public static List getAllPartitionPaths(HoodieEngineContext engineContext, - HoodieStorage storage, - StoragePath basePath, - boolean useFileListingFromMetadata) { - return getAllPartitionPaths(engineContext, storage, basePath.toString(), useFileListingFromMetadata); - } - - public static List getAllPartitionPaths(HoodieEngineContext engineContext, - HoodieStorage storage, - HoodieMetadataConfig metadataConfig, - StoragePath basePath) { - return getAllPartitionPaths(engineContext, 
storage, metadataConfig, basePath.toString()); - } - public static Map<String, List<StoragePathInfo>> getFilesInPartitions(HoodieEngineContext engineContext, - HoodieStorage storage, + HoodieTableMetaClient metaClient, HoodieMetadataConfig metadataConfig, - String basePathStr, String[] partitionPaths) { - try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(engineContext, storage, metadataConfig, - basePathStr)) { + try (HoodieTableMetadata tableMetadata = metaClient.getTableFormat().getMetadataFactory() + .create(engineContext, metaClient.getStorage(), metadataConfig, metaClient.getBasePath().toString())) { return tableMetadata.getAllFilesInPartitions(Arrays.asList(partitionPaths)); } catch (Exception ex) { throw new HoodieException("Error get files in partitions: " + String.join(",", partitionPaths), ex); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java index 03db7a56739f..6cd6f1b1b230 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java @@ -18,6 +18,8 @@ package org.apache.hudi.common.table; +import org.apache.hudi.common.HoodieTableFormat; +import org.apache.hudi.common.NativeTableFormat; import org.apache.hudi.common.bootstrap.index.hfile.HFileBootstrapIndex; import org.apache.hudi.common.config.ConfigClassProperty; import org.apache.hudi.common.config.ConfigGroups; @@ -72,6 +74,7 @@ import java.util.List; import java.util.Map; import java.util.Properties; +import java.util.ServiceLoader; import java.util.Set; import java.util.function.BiConsumer; import java.util.function.Function; @@ -198,7 +201,12 @@ public class HoodieTableConfig extends HoodieConfig { .key("hoodie.timeline.layout.version") .noDefaultValue() .withDocumentation("Version of timeline used, by the table."); - + + public static final ConfigProperty TABLE_FORMAT = ConfigProperty + .key("hoodie.table.format") + .defaultValue(NativeTableFormat.TABLE_FORMAT) + .withDocumentation("Table format name used when writing to the table."); + public static final ConfigProperty RECORD_MERGE_MODE = ConfigProperty .key("hoodie.record.merge.mode") .defaultValue((RecordMergeMode) null, @@ -720,6 +728,20 @@ public Option getTimelineLayoutVersion() { : Option.empty(); } + public HoodieTableFormat getTableFormat(TimelineLayoutVersion layoutVersion) { + String tableFormat = getStringOrDefault(TABLE_FORMAT); + if (!tableFormat.equals(NativeTableFormat.TABLE_FORMAT)) { + ServiceLoader loader = ServiceLoader.load(HoodieTableFormat.class); + for (HoodieTableFormat tableFormatImpl : loader) { + if (tableFormat.equals(tableFormatImpl.getName())) { + tableFormatImpl.init(props); + return tableFormatImpl; + } + } + } + return new NativeTableFormat(layoutVersion); + } + /** * @return the hoodie.table.version from hoodie.properties file. 
*/ diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java index a6c868768f5e..b527a292dab3 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java @@ -18,6 +18,8 @@ package org.apache.hudi.common.table; +import org.apache.hudi.common.HoodieTableFormat; +import org.apache.hudi.common.NativeTableFormat; import org.apache.hudi.common.config.ConfigProperty; import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.config.HoodieMetaserverConfig; @@ -165,6 +167,7 @@ public class HoodieTableMetaClient implements Serializable { protected HoodieMetaserverConfig metaserverConfig; private HoodieTimeGeneratorConfig timeGeneratorConfig; private Option indexMetadataOpt = Option.empty(); + private HoodieTableFormat tableFormat; /** * Instantiate HoodieTableMetaClient. @@ -195,6 +198,7 @@ protected HoodieTableMetaClient(HoodieStorage storage, String basePath, boolean throw new TableNotFoundException("Table does not exist"); } this.timelineLayoutVersion = layoutVersion.orElseGet(tableConfigVersion::get); + this.tableFormat = tableConfig.getTableFormat(timelineLayoutVersion); this.timelineLayout = TimelineLayout.fromVersion(timelineLayoutVersion); this.timelinePath = timelineLayout.getTimelinePathProvider().getTimelinePath(tableConfig, this.basePath); this.timelineHistoryPath = timelineLayout.getTimelinePathProvider().getTimelineHistoryPath(tableConfig, this.basePath); @@ -357,6 +361,10 @@ public StoragePath getTimelinePath() { return timelinePath; } + public HoodieTableFormat getTableFormat() { + return tableFormat; + } + /** * @return schema folder path */ @@ -501,7 +509,7 @@ public StorageConfiguration getStorageConf() { */ public synchronized HoodieActiveTimeline getActiveTimeline() { if (activeTimeline == null) { - activeTimeline = timelineLayout.getTimelineFactory().createActiveTimeline(this); + activeTimeline = tableFormat.getTimelineFactory().createActiveTimeline(this); } return activeTimeline; } @@ -514,13 +522,22 @@ public synchronized void reload() { reloadTableConfig(); } + /** + * Get the active instants as a timeline in native format. + * + * @return Active instants timeline + */ + public synchronized HoodieActiveTimeline getActiveTimelineForNativeFormat() { + return new NativeTableFormat(activeTimeline.getTimelineLayoutVersion()).getTimelineFactory().createActiveTimeline(this); + } + /** * Reload ActiveTimeline and cache. 
* * @return Active instants timeline */ public synchronized HoodieActiveTimeline reloadActiveTimeline() { - activeTimeline = timelineLayout.getTimelineFactory().createActiveTimeline(this); + activeTimeline = tableFormat.getTimelineFactory().createActiveTimeline(this); return activeTimeline; } @@ -538,6 +555,7 @@ public synchronized void reloadTableConfig() { private void reloadTimelineLayoutAndPath() { this.timelineLayoutVersion = tableConfig.getTimelineLayoutVersion().get(); this.timelineLayout = TimelineLayout.fromVersion(timelineLayoutVersion); + this.tableFormat = tableConfig.getTableFormat(timelineLayoutVersion); this.timelinePath = timelineLayout.getTimelinePathProvider().getTimelinePath(tableConfig, basePath); this.timelineHistoryPath = timelineLayout.getTimelinePathProvider().getTimelineHistoryPath(tableConfig, basePath); } @@ -619,8 +637,8 @@ public HoodieArchivedTimeline getArchivedTimeline(String startTs, boolean useCac private HoodieArchivedTimeline instantiateArchivedTimeline(String startTs) { return StringUtils.isNullOrEmpty(startTs) - ? timelineLayout.getTimelineFactory().createArchivedTimeline(this) - : timelineLayout.getTimelineFactory().createArchivedTimeline(this, startTs); + ? tableFormat.getTimelineFactory().createArchivedTimeline(this) + : tableFormat.getTimelineFactory().createArchivedTimeline(this, startTs); } public static void createTableLayoutOnStorage(StorageConfiguration storageConf, @@ -1035,6 +1053,7 @@ public static class TableBuilder { private Boolean multipleBaseFileFormatsEnabled; private String indexDefinitionPath; + private String tableFormat; /** * Persist the configs that is written at the first time, and should not be changed. @@ -1234,6 +1253,11 @@ public TableBuilder setIndexDefinitionPath(String indexDefinitionPath) { return this; } + public TableBuilder setTableFormat(String tableFormat) { + this.tableFormat = tableFormat; + return this; + } + public TableBuilder set(Map props) { for (ConfigProperty configProperty : HoodieTableConfig.PERSISTED_CONFIG_LIST) { if (containsConfigProperty(props, configProperty)) { @@ -1250,6 +1274,7 @@ public TableBuilder fromMetaClient(HoodieTableMetaClient metaClient) { return setTableType(metaClient.getTableType()) .setTableName(metaClient.getTableConfig().getTableName()) .setTableVersion(metaClient.getTableConfig().getTableVersion()) + .setTableFormat(metaClient.getTableConfig().getTableFormat(metaClient.getTimelineLayoutVersion()).getName()) .setTimelinePath(metaClient.getTableConfig().getTimelinePath()) .setArchiveLogFolder(metaClient.getTableConfig().getTimelineHistoryPath()) .setRecordMergeMode(metaClient.getTableConfig().getRecordMergeMode()) @@ -1280,6 +1305,10 @@ public TableBuilder fromProperties(Properties properties) { setTableVersion(hoodieConfig.getInt(VERSION)); } + if (hoodieConfig.contains(HoodieTableConfig.TABLE_FORMAT)) { + setTableFormat(hoodieConfig.getString(HoodieTableConfig.TABLE_FORMAT)); + } + if (hoodieConfig.contains(TIMELINE_PATH)) { setTimelinePath(hoodieConfig.getString(TIMELINE_PATH)); } @@ -1515,6 +1544,9 @@ public Properties build() { if (null != indexDefinitionPath) { tableConfig.setValue(HoodieTableConfig.RELATIVE_INDEX_DEFINITION_PATH, indexDefinitionPath); } + if (null != tableFormat) { + tableConfig.setValue(HoodieTableConfig.TABLE_FORMAT, tableFormat); + } return tableConfig.getProps(); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/read/IncrementalQueryAnalyzer.java 
b/hudi-common/src/main/java/org/apache/hudi/common/table/read/IncrementalQueryAnalyzer.java index 9f22586a560d..b144eed94e74 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/read/IncrementalQueryAnalyzer.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/read/IncrementalQueryAnalyzer.java @@ -160,7 +160,7 @@ public static Builder builder() { * @return An incremental query context including the instant time range info. */ public QueryContext analyze() { - try (CompletionTimeQueryView completionTimeQueryView = metaClient.getTimelineLayout().getTimelineFactory().createCompletionTimeQueryView(this.metaClient)) { + try (CompletionTimeQueryView completionTimeQueryView = metaClient.getTableFormat().getTimelineFactory().createCompletionTimeQueryView(this.metaClient)) { if (completionTimeQueryView.isEmptyTable()) { // no dataset committed in the table return QueryContext.EMPTY; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/BaseHoodieTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/BaseHoodieTimeline.java index d09a4f9bed28..0eaedda3492d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/BaseHoodieTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/BaseHoodieTimeline.java @@ -18,15 +18,19 @@ package org.apache.hudi.common.table.timeline; +import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant.State; import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.io.InputStream; import java.io.Serializable; import java.security.MessageDigest; @@ -115,6 +119,14 @@ protected void appendInstants(List newInstants) { clearState(); } + protected List getInstantsFromFileSystem(HoodieTableMetaClient metaClient, Set includedExtensions, boolean applyLayoutFilters) { + try { + return metaClient.scanHoodieInstantsFromFileSystem(metaClient.getTimelinePath(), includedExtensions, applyLayoutFilters); + } catch (IOException e) { + throw new HoodieIOException("Failed to scan metadata", e); + } + } + @Override public HoodieTimeline filterInflights() { return factory.createDefaultTimeline(getInstantsAsStream().filter(HoodieInstant::isInflight), getInstantReader()); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java index 619cc402dbb3..7c48720ba978 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java @@ -69,7 +69,7 @@ public interface HoodieActiveTimeline extends HoodieTimeline { * @param instant Instant to be saved. * @param metadata metadata to write into the instant file */ - void saveAsComplete(HoodieInstant instant, Option metadata); + HoodieInstant saveAsComplete(HoodieInstant instant, Option metadata); /** * Save Completed instant in active timeline. @@ -77,7 +77,20 @@ public interface HoodieActiveTimeline extends HoodieTimeline { * @param instant Instant to be saved. 
* @param metadata metadata to write into the instant file */ - void saveAsComplete(boolean shouldLock, HoodieInstant instant, Option metadata); + HoodieInstant saveAsComplete(boolean shouldLock, HoodieInstant instant, Option metadata); + + /** + * Save Completed instant in active timeline with table format completion actions. + * + * @param shouldLock Lock before writing to timeline. + * @param instant Instant to be saved. + * @param metadata metadata to write into the instant file + * @param tableFormatCompletionAction functional interface to perform table format specific completion actions. + * @return The completed hoodie instant + * @param <T> + */ + HoodieInstant saveAsComplete(boolean shouldLock, HoodieInstant instant, Option metadata, TableFormatCompletionAction tableFormatCompletionAction); + /** * Save Completed instant in active timeline with an optional completion time. For version 8 tables, completion times are generated just before wrapping up the commit and serialized as part of @@ -88,7 +101,7 @@ public interface HoodieActiveTimeline extends HoodieTimeline { * @param completionTimeOpt an optional instance of completion time. * @param <T> */ - void saveAsComplete(boolean shouldLock, HoodieInstant instant, Option metadata, Option completionTimeOpt); + HoodieInstant saveAsComplete(boolean shouldLock, HoodieInstant instant, Option metadata, Option completionTimeOpt); /** * Delete Compaction requested instant file from timeline. @@ -227,6 +240,8 @@ public interface HoodieActiveTimeline extends HoodieTimeline { */ HoodieInstant transitionCleanInflightToComplete(boolean shouldLock, HoodieInstant inflightInstant, Option metadata); + HoodieInstant transitionCleanInflightToComplete(boolean shouldLock, HoodieInstant inflightInstant, Option metadata, TableFormatCompletionAction tableFormatCompletionAction); + /** * Transition Clean State from requested to inflight. * @@ -245,6 +260,8 @@ public interface HoodieActiveTimeline extends HoodieTimeline { */ HoodieInstant transitionRollbackInflightToComplete(boolean shouldLock, HoodieInstant inflightInstant, HoodieRollbackMetadata metadata); + HoodieInstant transitionRollbackInflightToComplete(boolean shouldLock, HoodieInstant inflightInstant, HoodieRollbackMetadata metadata, TableFormatCompletionAction tableFormatCompletionAction); + /** * Transition Rollback State from requested to inflight. * @@ -289,6 +306,8 @@ public interface HoodieActiveTimeline extends HoodieTimeline { */ HoodieInstant transitionReplaceInflightToComplete(boolean shouldLock, HoodieInstant inflightInstant, HoodieReplaceCommitMetadata metadata); + HoodieInstant transitionReplaceInflightToComplete(boolean shouldLock, HoodieInstant inflightInstant, HoodieReplaceCommitMetadata metadata, TableFormatCompletionAction tableFormatCompletionAction); + /** * Transition cluster inflight to replace committed. * @@ -299,6 +318,8 @@ public interface HoodieActiveTimeline extends HoodieTimeline { */ HoodieInstant transitionClusterInflightToComplete(boolean shouldLock, HoodieInstant inflightInstant, HoodieReplaceCommitMetadata metadata); + HoodieInstant transitionClusterInflightToComplete(boolean shouldLock, HoodieInstant inflightInstant, HoodieReplaceCommitMetadata metadata, TableFormatCompletionAction tableFormatCompletionAction); + /** * Save Restore requested instant with metadata. * @param commitType Instant type. 
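A minimal caller sketch of the new completion-action overloads (illustrative only; it assumes a metaClient, engineContext, viewManager, an inflight instant, and its commitMetadata are already in scope, and mirrors the signatures declared above):

HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
HoodieInstant completed = timeline.saveAsComplete(
    true,                      // shouldLock: guard the timeline write with the table lock
    inflightInstant,           // the INFLIGHT instant being completed
    Option.of(commitMetadata), // metadata serialized into the completed instant file
    completedInstant -> metaClient.getTableFormat().commit(
        commitMetadata, completedInstant, engineContext, metaClient, viewManager));

The callback receives the completed instant (including its completion time), so the external format records exactly what the hoodie timeline committed.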
@@ -416,4 +437,4 @@ public interface HoodieActiveTimeline extends HoodieTimeline { * @return */ Set getValidExtensions(); -} +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TableFormatCompletionAction.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TableFormatCompletionAction.java new file mode 100644 index 000000000000..b2ad0eb5a235 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TableFormatCompletionAction.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.table.timeline; + + +/** + * Functional Interface for executing table format actions. + */ +@FunctionalInterface +public interface TableFormatCompletionAction { + /** + * Execute the table format action with the given completed instant. + */ + void execute(HoodieInstant completedInstant); +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineUtils.java index 5fc87292c75e..8ad6239d4ef9 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineUtils.java @@ -582,7 +582,7 @@ public static HoodieInstant getInflightInstant(final HoodieInstant instant, fina } else if (instant.getAction().equals(DELTA_COMMIT_ACTION)) { // Deltacommit is used by both ingestion and logcompaction. // So, distinguish both of them check for the inflight file being present. 
- HoodieActiveTimeline rawActiveTimeline = metaClient.getTimelineLayout().getTimelineFactory().createActiveTimeline(metaClient, false); + HoodieActiveTimeline rawActiveTimeline = metaClient.getTableFormat().getTimelineFactory().createActiveTimeline(metaClient, false); Option logCompactionInstant = Option.fromJavaOptional(rawActiveTimeline.getInstantsAsStream() .filter(hoodieInstant -> hoodieInstant.requestedTime().equals(instant.requestedTime()) && LOG_COMPACTION_ACTION.equals(hoodieInstant.getAction())).findFirst()); @@ -603,7 +603,7 @@ public enum HollowCommitHandling { */ public static HoodieTimeline concatTimeline(HoodieTimeline timeline1, HoodieTimeline timeline2, HoodieTableMetaClient metaClient) { - return metaClient.getTimelineLayout().getTimelineFactory().createDefaultTimeline( + return metaClient.getTableFormat().getTimelineFactory().createDefaultTimeline( Stream.concat(timeline1.getInstantsAsStream(), timeline2.getInstantsAsStream()).sorted(), metaClient.getActiveTimeline()); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/InstantDTO.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/InstantDTO.java index 006daf88a538..662c27ee9c1f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/InstantDTO.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/InstantDTO.java @@ -37,6 +37,12 @@ public class InstantDTO { @JsonProperty("state") String state; + @JsonProperty("requestedTime") + String requestedTime; + + @JsonProperty("completionTime") + String completionTime; + public static InstantDTO fromInstant(HoodieInstant instant) { if (null == instant) { return null; @@ -45,6 +51,8 @@ public static InstantDTO fromInstant(HoodieInstant instant) { InstantDTO dto = new InstantDTO(); dto.action = instant.getAction(); dto.timestamp = instant.requestedTime(); + dto.requestedTime = instant.requestedTime(); + dto.completionTime = instant.getCompletionTime(); dto.state = instant.getState().toString(); return dto; } @@ -54,6 +62,6 @@ public static HoodieInstant toInstant(InstantDTO dto, InstantGenerator factory) return null; } - return factory.createNewInstant(HoodieInstant.State.valueOf(dto.state), dto.action, dto.timestamp); + return factory.createNewInstant(HoodieInstant.State.valueOf(dto.state), dto.action, dto.requestedTime, dto.completionTime); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/TimelineDTO.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/TimelineDTO.java index f4a7deb4c034..5822832af616 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/TimelineDTO.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/TimelineDTO.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.table.timeline.dto; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.InstantGenerator; import org.apache.hudi.common.table.timeline.TimelineFactory; @@ -44,11 +45,21 @@ public static TimelineDTO fromTimeline(HoodieTimeline timeline) { return dto; } + public static TimelineDTO fromInstants(List instants) { + TimelineDTO dto = new TimelineDTO(); + dto.instants = instants.stream().map(InstantDTO::fromInstant).collect(Collectors.toList()); + return dto; + } + public static HoodieTimeline toTimeline(TimelineDTO dto, 
HoodieTableMetaClient metaClient) { InstantGenerator instantGenerator = metaClient.getInstantGenerator(); - TimelineFactory factory = metaClient.getTimelineLayout().getTimelineFactory(); + TimelineFactory factory = metaClient.getTableFormat().getTimelineFactory(); // TODO: For Now, we will assume, only active-timeline will be transferred. return factory.createDefaultTimeline(dto.instants.stream().map(d -> InstantDTO.toInstant(d, instantGenerator)), metaClient.getActiveTimeline()); } + + public static HoodieTimeline toTimeline(TimelineDTO dto, TimelineFactory factory, HoodieTimeline timeline, InstantGenerator instantGenerator) { + return factory.createDefaultTimeline(dto.instants.stream().map(d -> InstantDTO.toInstant(d, instantGenerator)), timeline.getInstantReader()); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/v1/ActiveTimelineV1.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/v1/ActiveTimelineV1.java index a3537f2a9463..cd5d8d7664be 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/v1/ActiveTimelineV1.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/v1/ActiveTimelineV1.java @@ -35,6 +35,7 @@ import org.apache.hudi.common.table.timeline.HoodieInstantReader; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.InstantFileNameGenerator; +import org.apache.hudi.common.table.timeline.TableFormatCompletionAction; import org.apache.hudi.common.table.timeline.TimelineUtils; import org.apache.hudi.common.util.FileIOUtils; import org.apache.hudi.common.util.Option; @@ -84,12 +85,7 @@ protected ActiveTimelineV1(HoodieTableMetaClient metaClient, Set include boolean applyLayoutFilters) { // Filter all the filter in the metapath and include only the extensions passed and // convert them into HoodieInstant - try { - this.setInstants(metaClient.scanHoodieInstantsFromFileSystem(metaClient.getTimelinePath(), - includedExtensions, applyLayoutFilters)); - } catch (IOException e) { - throw new HoodieIOException("Failed to scan metadata", e); - } + this.setInstants(getInstantsFromFileSystem(metaClient, includedExtensions, applyLayoutFilters)); this.metaClient = metaClient; // multiple casts will make this lambda serializable - // http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 @@ -151,22 +147,31 @@ public HoodieInstant createRequestedCommitWithReplaceMetadata(String instantTime } @Override - public void saveAsComplete(HoodieInstant instant, Option metadata) { - LOG.info("Marking instant complete {}", instant); + public HoodieInstant saveAsComplete(HoodieInstant instant, Option metadata) { + LOG.info("Marking instant complete {}", instant); ValidationUtils.checkArgument(instant.isInflight(), "Could not mark an already completed instant as complete again " + instant); - transitionState(instant, instantGenerator.createNewInstant(HoodieInstant.State.COMPLETED, instant.getAction(), instant.requestedTime()), metadata); + HoodieInstant completedInstant = instantGenerator.createNewInstant(HoodieInstant.State.COMPLETED, instant.getAction(), instant.requestedTime()); + transitionState(instant, completedInstant, metadata); LOG.info("Completed {}", instant); + return completedInstant; } @Override - public void saveAsComplete(boolean shouldLock, HoodieInstant instant, Option metadata) { - saveAsComplete(instant, metadata); + public HoodieInstant saveAsComplete(boolean shouldLock, 
HoodieInstant instant, Option metadata) { + return saveAsComplete(instant, metadata); } @Override - public void saveAsComplete(boolean shouldLock, HoodieInstant instant, Option metadata, Option completionTimeOpt) { - saveAsComplete(instant, metadata); + public HoodieInstant saveAsComplete(boolean shouldLock, HoodieInstant instant, Option metadata, Option completionTimeOpt) { + return saveAsComplete(instant, metadata); + } + + @Override + public HoodieInstant saveAsComplete(boolean shouldLock, HoodieInstant instant, Option metadata, TableFormatCompletionAction tableFormatCompletionAction) { + HoodieInstant completedInstant = saveAsComplete(shouldLock, instant, metadata); + tableFormatCompletionAction.execute(completedInstant); + return completedInstant; } @Override @@ -401,6 +406,14 @@ public HoodieInstant transitionCleanInflightToComplete(boolean shouldLock, Hoodi return commitInstant; } + @Override + public HoodieInstant transitionCleanInflightToComplete(boolean shouldLock, HoodieInstant inflightInstant, Option metadata, + TableFormatCompletionAction tableFormatCompletionAction) { + HoodieInstant completedInstant = transitionCleanInflightToComplete(shouldLock, inflightInstant, metadata); + tableFormatCompletionAction.execute(completedInstant); + return completedInstant; + } + @Override public HoodieInstant transitionCleanRequestedToInflight(HoodieInstant requestedInstant) { ValidationUtils.checkArgument(requestedInstant.getAction().equals(HoodieTimeline.CLEAN_ACTION)); @@ -420,6 +433,14 @@ public HoodieInstant transitionRollbackInflightToComplete(boolean shouldLock, Ho return commitInstant; } + @Override + public HoodieInstant transitionRollbackInflightToComplete(boolean shouldLock, HoodieInstant inflightInstant, HoodieRollbackMetadata metadata, + TableFormatCompletionAction tableFormatCompletionAction) { + HoodieInstant completedInstant = transitionRollbackInflightToComplete(shouldLock, inflightInstant, metadata); + tableFormatCompletionAction.execute(completedInstant); + return completedInstant; + } + @Override public HoodieInstant transitionRollbackRequestedToInflight(HoodieInstant requestedInstant) { ValidationUtils.checkArgument(requestedInstant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION)); @@ -466,12 +487,28 @@ public HoodieInstant transitionReplaceInflightToComplete( return commitInstant; } + @Override + public HoodieInstant transitionReplaceInflightToComplete(boolean shouldLock, HoodieInstant inflightInstant, HoodieReplaceCommitMetadata metadata, + TableFormatCompletionAction tableFormatCompletionAction) { + HoodieInstant completedInstant = transitionReplaceInflightToComplete(shouldLock, inflightInstant, metadata); + tableFormatCompletionAction.execute(completedInstant); + return completedInstant; + } + @Override public HoodieInstant transitionClusterInflightToComplete(boolean shouldLock, HoodieInstant inflightInstant, HoodieReplaceCommitMetadata metadata) { // In 0.x, no separate clustering action, reuse replace action. 
return transitionReplaceInflightToComplete(shouldLock, inflightInstant, metadata); } + @Override + public HoodieInstant transitionClusterInflightToComplete(boolean shouldLock, HoodieInstant inflightInstant, HoodieReplaceCommitMetadata metadata, + TableFormatCompletionAction tableFormatCompletionAction) { + HoodieInstant completedInstant = transitionClusterInflightToComplete(shouldLock, inflightInstant, metadata); + tableFormatCompletionAction.execute(completedInstant); + return completedInstant; + } + private void transitionState(HoodieInstant fromInstant, HoodieInstant toInstant, Option metadata) { transitionState(fromInstant, toInstant, metadata, false); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/v2/ActiveTimelineV2.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/v2/ActiveTimelineV2.java index 9da142c2b3d1..b823c53580dd 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/v2/ActiveTimelineV2.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/v2/ActiveTimelineV2.java @@ -36,6 +36,7 @@ import org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.InstantFileNameGenerator; +import org.apache.hudi.common.table.timeline.TableFormatCompletionAction; import org.apache.hudi.common.table.timeline.TimeGenerator; import org.apache.hudi.common.table.timeline.TimeGenerators; import org.apache.hudi.common.table.timeline.TimelineUtils; @@ -61,6 +62,7 @@ import java.util.HashSet; import java.util.Objects; import java.util.Set; +import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Stream; import static org.apache.hudi.common.table.timeline.TimelineUtils.getHoodieInstantWriterOption; @@ -89,12 +91,7 @@ private ActiveTimelineV2(HoodieTableMetaClient metaClient, Set includedE boolean applyLayoutFilters) { // Filter all the filter in the metapath and include only the extensions passed and // convert them into HoodieInstant - try { - this.setInstants(metaClient.scanHoodieInstantsFromFileSystem(metaClient.getTimelinePath(), - includedExtensions, applyLayoutFilters)); - } catch (IOException e) { - throw new HoodieIOException("Failed to scan metadata", e); - } + this.setInstants(getInstantsFromFileSystem(metaClient, includedExtensions, applyLayoutFilters)); this.metaClient = metaClient; // multiple casts will make this lambda serializable - // http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 @@ -156,23 +153,31 @@ public HoodieInstant createRequestedCommitWithReplaceMetadata(String instantTime } @Override - public void saveAsComplete(HoodieInstant instant, Option metadata) { - saveAsComplete(true, instant, metadata); + public HoodieInstant saveAsComplete(HoodieInstant instant, Option metadata) { + return saveAsComplete(true, instant, metadata); } @Override - public void saveAsComplete(boolean shouldLock, HoodieInstant instant, Option metadata) { - saveAsComplete(shouldLock, instant, metadata, Option.empty()); + public HoodieInstant saveAsComplete(boolean shouldLock, HoodieInstant instant, Option metadata) { + return saveAsComplete(shouldLock, instant, metadata, Option.empty()); } @Override - public void saveAsComplete(boolean shouldLock, HoodieInstant instant, Option metadata, Option completionTimeOpt) { + public HoodieInstant saveAsComplete(boolean shouldLock, HoodieInstant instant, Option metadata, 
Option completionTimeOpt) { LOG.info("Marking instant complete {}", instant); ValidationUtils.checkArgument(instant.isInflight(), "Could not mark an already completed instant as complete again " + instant); HoodieInstant commitInstant = instantGenerator.createNewInstant(HoodieInstant.State.COMPLETED, instant.getAction(), instant.requestedTime()); - transitionStateToComplete(shouldLock, instant, commitInstant, metadata, completionTimeOpt); + HoodieInstant completedInstant = transitionStateToComplete(shouldLock, instant, commitInstant, metadata, completionTimeOpt); LOG.info("Completed " + instant); + return completedInstant; + } + + @Override + public HoodieInstant saveAsComplete(boolean shouldLock, HoodieInstant instant, Option metadata, TableFormatCompletionAction tableFormatCompletionAction) { + HoodieInstant completedInstant = saveAsComplete(shouldLock, instant, metadata); + tableFormatCompletionAction.execute(completedInstant); + return completedInstant; } @Override @@ -413,8 +418,15 @@ public HoodieInstant transitionCleanInflightToComplete(boolean shouldLock, Hoodi ValidationUtils.checkArgument(inflightInstant.isInflight()); HoodieInstant commitInstant = instantGenerator.createNewInstant(HoodieInstant.State.COMPLETED, CLEAN_ACTION, inflightInstant.requestedTime()); // Then write to timeline - transitionStateToComplete(shouldLock, inflightInstant, commitInstant, metadata); - return commitInstant; + return transitionStateToComplete(shouldLock, inflightInstant, commitInstant, metadata); + } + + @Override + public HoodieInstant transitionCleanInflightToComplete(boolean shouldLock, HoodieInstant inflightInstant, Option metadata, + TableFormatCompletionAction tableFormatCompletionAction) { + HoodieInstant completedInstant = transitionCleanInflightToComplete(shouldLock, inflightInstant, metadata); + tableFormatCompletionAction.execute(completedInstant); + return completedInstant; } @Override @@ -432,8 +444,15 @@ public HoodieInstant transitionRollbackInflightToComplete(boolean shouldLock, Ho ValidationUtils.checkArgument(inflightInstant.isInflight()); HoodieInstant commitInstant = instantGenerator.createNewInstant(HoodieInstant.State.COMPLETED, ROLLBACK_ACTION, inflightInstant.requestedTime()); // Then write to timeline - transitionStateToComplete(shouldLock, inflightInstant, commitInstant, Option.of(metadata)); - return commitInstant; + return transitionStateToComplete(shouldLock, inflightInstant, commitInstant, Option.of(metadata)); + } + + @Override + public HoodieInstant transitionRollbackInflightToComplete(boolean shouldLock, HoodieInstant inflightInstant, HoodieRollbackMetadata metadata, + TableFormatCompletionAction tableFormatCompletionAction) { + HoodieInstant completedInstant = transitionRollbackInflightToComplete(shouldLock, inflightInstant, metadata); + tableFormatCompletionAction.execute(completedInstant); + return completedInstant; } @Override @@ -482,8 +501,15 @@ public HoodieInstant transitionReplaceInflightToComplete( ValidationUtils.checkArgument(inflightInstant.isInflight()); HoodieInstant commitInstant = instantGenerator.createNewInstant(HoodieInstant.State.COMPLETED, REPLACE_COMMIT_ACTION, inflightInstant.requestedTime()); // Then write to timeline - transitionStateToComplete(shouldLock, inflightInstant, commitInstant, Option.of(metadata)); - return commitInstant; + return transitionStateToComplete(shouldLock, inflightInstant, commitInstant, Option.of(metadata)); + } + + @Override + public HoodieInstant transitionReplaceInflightToComplete(boolean shouldLock, HoodieInstant 
inflightInstant, HoodieReplaceCommitMetadata metadata, + TableFormatCompletionAction tableFormatCompletionAction) { + HoodieInstant completedInstant = transitionReplaceInflightToComplete(shouldLock, inflightInstant, metadata); + tableFormatCompletionAction.execute(completedInstant); + return completedInstant; } @Override @@ -493,20 +519,27 @@ public HoodieInstant transitionClusterInflightToComplete( ValidationUtils.checkArgument(inflightInstant.isInflight()); HoodieInstant commitInstant = instantGenerator.createNewInstant(HoodieInstant.State.COMPLETED, REPLACE_COMMIT_ACTION, inflightInstant.requestedTime()); // Then write to timeline - transitionStateToComplete(shouldLock, inflightInstant, commitInstant, Option.of(metadata), Option.empty()); - return commitInstant; + return transitionStateToComplete(shouldLock, inflightInstant, commitInstant, Option.of(metadata), Option.empty()); + } + + @Override + public HoodieInstant transitionClusterInflightToComplete(boolean shouldLock, HoodieInstant inflightInstant, HoodieReplaceCommitMetadata metadata, + TableFormatCompletionAction tableFormatCompletionAction) { + HoodieInstant completedInstant = transitionClusterInflightToComplete(shouldLock, inflightInstant, metadata); + tableFormatCompletionAction.execute(completedInstant); + return completedInstant; } private void transitionPendingState(HoodieInstant fromInstant, HoodieInstant toInstant, Option metadata) { transitionPendingState(fromInstant, toInstant, metadata, false); } - protected void transitionStateToComplete(boolean shouldLock, HoodieInstant fromInstant, HoodieInstant toInstant, Option metadata) { - transitionStateToComplete(shouldLock, fromInstant, toInstant, metadata, Option.empty()); + protected HoodieInstant transitionStateToComplete(boolean shouldLock, HoodieInstant fromInstant, HoodieInstant toInstant, Option metadata) { + return transitionStateToComplete(shouldLock, fromInstant, toInstant, metadata, Option.empty()); } - protected void transitionStateToComplete(boolean shouldLock, HoodieInstant fromInstant, HoodieInstant toInstant, Option metadata, - Option completionTimeOpt) { + protected HoodieInstant transitionStateToComplete(boolean shouldLock, HoodieInstant fromInstant, HoodieInstant toInstant, Option metadata, + Option completionTimeOpt) { ValidationUtils.checkArgument(fromInstant.requestedTime().equals(toInstant.requestedTime()), String.format("%s and %s are not consistent when transition state.", fromInstant, toInstant)); String fromInstantFileName = instantFileNameGenerator.getFileName(fromInstant); try { @@ -524,12 +557,14 @@ protected void transitionStateToComplete(boolean shouldLock, HoodieInstant f throw new HoodieIOException( "Could not rename " + fromInstantPath + " to " + toInstantPath); } + return instantWithCompletionTime; } else { // Ensures old state exists in timeline ValidationUtils.checkArgument( metaClient.getStorage().exists(getInstantFileNamePath(fromInstantFileName)), "File " + getInstantFileNamePath(fromInstantFileName) + " does not exist!"); - createCompleteFileInMetaPath(shouldLock, toInstant, metadata); + String completionTime = createCompleteFileInMetaPath(shouldLock, toInstant, metadata); + return new HoodieInstant(toInstant.getState(), toInstant.getAction(), toInstant.requestedTime(), completionTime, instantComparator.requestedTimeOrderedComparator()); } } catch (IOException e) { throw new HoodieIOException("Could not complete " + fromInstant, e); @@ -708,10 +743,11 @@ public void createFileInMetaPath(String filename, Option metadata, boolea } } - 
protected void createCompleteFileInMetaPath(boolean shouldLock, HoodieInstant instant, Option metadata) { + protected String createCompleteFileInMetaPath(boolean shouldLock, HoodieInstant instant, Option metadata) { Option writerOption = getHoodieInstantWriterOption(this, metadata); TimeGenerator timeGenerator = TimeGenerators .getTimeGenerator(metaClient.getTimeGeneratorConfig(), metaClient.getStorageConf()); + final AtomicReference completionTimeRef = new AtomicReference<>(); timeGenerator.consumeTime(!shouldLock, currentTimeMillis -> { String completionTime = HoodieInstantTimeGenerator.formatDateBasedOnTimeZone(new Date(currentTimeMillis)); String fileName = instantFileNameGenerator.getFileName(completionTime, instant); @@ -721,8 +757,10 @@ protected void createCompleteFileInMetaPath(boolean shouldLock, HoodieInstan } else { metaClient.getStorage().createImmutableFileInPath(fullPath, writerOption); } - LOG.info("Created new file for toInstant ?{}", fullPath); + completionTimeRef.set(completionTime); + LOG.info("Created new file for toInstant: {}", fullPath); }); + return completionTimeRef.get(); } protected Option readDataFromPath(StoragePath detailPath) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/v2/CompletionTimeQueryViewV2.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/v2/CompletionTimeQueryViewV2.java index 8005614c6fdc..1f67469eae97 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/v2/CompletionTimeQueryViewV2.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/v2/CompletionTimeQueryViewV2.java @@ -288,7 +288,7 @@ private void loadCompletionTimeIncrementally(String startTime) { // This operation is resource costly. synchronized (this) { if (InstantComparison.compareTimestamps(startTime, LESSER_THAN, this.cursorInstant)) { - metaClient.getTimelineLayout().getTimelineFactory().createArchivedTimelineLoader().loadInstants(metaClient, + metaClient.getTableFormat().getTimelineFactory().createArchivedTimelineLoader().loadInstants(metaClient, new HoodieArchivedTimeline.ClosedOpenTimeRangeFilter(startTime, this.cursorInstant), HoodieArchivedTimeline.LoadMode.TIME, r -> true, @@ -310,7 +310,7 @@ private void load() { .filterCompletedInstants().getInstantsAsStream() .forEach(instant -> setCompletionTime(instant.requestedTime(), instant.getCompletionTime())); // then load the archived instants. 
- metaClient.getTimelineLayout().getTimelineFactory().createArchivedTimelineLoader().loadInstants(metaClient, + metaClient.getTableFormat().getTimelineFactory().createArchivedTimelineLoader().loadInstants(metaClient, new HoodieArchivedTimeline.StartTsFilter(this.cursorInstant), HoodieArchivedTimeline.LoadMode.TIME, r -> true, diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java index b0b39b815453..5c0128930d62 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java @@ -124,7 +124,7 @@ protected AbstractTableFileSystemView(HoodieTableMetadata tableMetadata) { */ protected void init(HoodieTableMetaClient metaClient, HoodieTimeline visibleActiveTimeline) { this.metaClient = metaClient; - this.completionTimeQueryView = metaClient.getTimelineLayout().getTimelineFactory().createCompletionTimeQueryView(metaClient); + this.completionTimeQueryView = metaClient.getTableFormat().getTimelineFactory().createCompletionTimeQueryView(metaClient); refreshTimeline(visibleActiveTimeline); resetFileGroupsReplaced(visibleCommitsAndCompactionTimeline); this.bootstrapIndex = BootstrapIndex.getBootstrapIndex(metaClient); @@ -152,7 +152,7 @@ protected void refreshTimeline(HoodieTimeline visibleActiveTimeline) { * Refresh the completion time query view. */ protected void refreshCompletionTimeQueryView() { - this.completionTimeQueryView = metaClient.getTimelineLayout().getTimelineFactory().createCompletionTimeQueryView(metaClient); + this.completionTimeQueryView = metaClient.getTableFormat().getTimelineFactory().createCompletionTimeQueryView(metaClient); } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewManager.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewManager.java index 0c7495fa294e..d1baad849516 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewManager.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewManager.java @@ -206,7 +206,7 @@ public static HoodieTableFileSystemView createInMemoryFileSystemViewWithTimeline HoodieTimeline timeline) { LOG.info("Creating InMemory based view for basePath {}.", metaClient.getBasePath()); HoodieTableMetadata tableMetadata = getTableMetadata(engineContext, metaClient, metadataConfig.isEnabled(), - unused -> HoodieTableMetadata.create(engineContext, metaClient.getStorage(), metadataConfig, metaClient.getBasePath().toString())); + unused -> metaClient.getTableFormat().getMetadataFactory().create(engineContext, metaClient.getStorage(), metadataConfig, metaClient.getBasePath().toString())); if (metaClient.getMetaserverConfig().isMetaserverEnabled()) { return (HoodieTableFileSystemView) ReflectionUtils.loadClass(HOODIE_METASERVER_FILE_SYSTEM_VIEW_CLASS, @@ -236,7 +236,7 @@ public static FileSystemViewManager createViewManagerWithTableMetadata( final FileSystemViewStorageConfig config, final HoodieCommonConfig commonConfig) { return createViewManager(context, metadataConfig, config, commonConfig, - metaClient -> HoodieTableMetadata.create(context, metaClient.getStorage(), metadataConfig, metaClient.getBasePath().toString(), true)); + metaClient -> metaClient.getTableFormat().getMetadataFactory().create(context, metaClient.getStorage(), 
metadataConfig, metaClient.getBasePath().toString(), true)); } public static FileSystemViewManager createViewManager(final HoodieEngineContext context, diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ClusteringUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ClusteringUtils.java index c6212f5d1f39..a964301bfdae 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ClusteringUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ClusteringUtils.java @@ -40,6 +40,7 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.InstantGenerator; +import org.apache.hudi.common.table.timeline.TableFormatCompletionAction; import org.apache.hudi.common.table.timeline.TimelineUtils; import org.apache.hudi.common.table.timeline.versioning.v2.InstantGeneratorV2; import org.apache.hudi.common.util.collection.Pair; @@ -124,11 +125,12 @@ public static Option getRequestedClusteringInstant(String timesta * action type. After HUDI-7905, the new clustering commits are written with clustering action. */ public static void transitionClusteringOrReplaceInflightToComplete(boolean shouldLock, HoodieInstant clusteringInstant, - HoodieReplaceCommitMetadata metadata, HoodieActiveTimeline activeTimeline) { + HoodieReplaceCommitMetadata metadata, HoodieActiveTimeline activeTimeline, + TableFormatCompletionAction tableFormatCompletionAction) { if (clusteringInstant.getAction().equals(HoodieTimeline.CLUSTERING_ACTION)) { - activeTimeline.transitionClusterInflightToComplete(shouldLock, clusteringInstant, metadata); + activeTimeline.transitionClusterInflightToComplete(shouldLock, clusteringInstant, metadata, tableFormatCompletionAction); } else { - activeTimeline.transitionReplaceInflightToComplete(shouldLock, clusteringInstant, metadata); + activeTimeline.transitionReplaceInflightToComplete(shouldLock, clusteringInstant, metadata, tableFormatCompletionAction); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java index ec3ce260102f..3c2ac8884be5 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java @@ -20,9 +20,7 @@ import org.apache.hudi.avro.model.HoodieMetadataColumnStats; import org.apache.hudi.common.bloom.BloomFilter; -import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.data.HoodieData; -import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordGlobalLocation; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -31,7 +29,6 @@ import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.expression.Expression; import org.apache.hudi.internal.schema.Types; -import org.apache.hudi.storage.HoodieStorage; import org.apache.hudi.storage.StoragePath; import org.apache.hudi.storage.StoragePathInfo; @@ -121,44 +118,6 @@ static boolean isMetadataTable(StoragePath basePath) { return isMetadataTable(basePath.toString()); } - static HoodieTableMetadata create(HoodieEngineContext engineContext, - HoodieStorage storage, - HoodieMetadataConfig metadataConfig, - String datasetBasePath) { - return create(engineContext, storage, metadataConfig, datasetBasePath, false); - } 
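The static HoodieTableMetadata.create(...) helpers being removed here give way to the per-format factory. A sketch of the replacement call path, mirroring the FSUtils changes above (engineContext, metaClient, and metadataConfig are assumed to be in scope):

try (HoodieTableMetadata tableMetadata = metaClient.getTableFormat().getMetadataFactory()
    .create(engineContext, metaClient.getStorage(), metadataConfig, metaClient.getBasePath().toString())) {
  return tableMetadata.getAllPartitionPaths();
} catch (Exception e) {
  throw new HoodieException("Error fetching partition paths from metadata table", e);
}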
- - static HoodieTableMetadata create(HoodieEngineContext engineContext, - HoodieStorage storage, - HoodieMetadataConfig metadataConfig, - String datasetBasePath, - boolean reuse) { - if (metadataConfig.isEnabled()) { - HoodieBackedTableMetadata metadata = createHoodieBackedTableMetadata(engineContext, storage, metadataConfig, datasetBasePath, reuse); - // If the MDT is not initialized then we fallback to FSBackedTableMetadata - if (metadata.isMetadataTableInitialized()) { - return metadata; - } - LOG.warn("Falling back to FileSystemBackedTableMetadata as metadata table is not initialized"); - } - return createFSBackedTableMetadata(engineContext, storage, datasetBasePath); - } - - static FileSystemBackedTableMetadata createFSBackedTableMetadata(HoodieEngineContext engineContext, - HoodieStorage storage, - String datasetBasePath) { - return new FileSystemBackedTableMetadata( - engineContext, storage, datasetBasePath); - } - - static HoodieBackedTableMetadata createHoodieBackedTableMetadata(HoodieEngineContext engineContext, - HoodieStorage storage, - HoodieMetadataConfig metadataConfig, - String datasetBasePath, - boolean reuse) { - return new HoodieBackedTableMetadata(engineContext, storage, metadataConfig, datasetBasePath, reuse); - } - /** * Fetch all the files at the given partition path, per the latest snapshot of the metadata. */ diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java index 13dd2caf2a3b..09fe6ba31558 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java @@ -1419,7 +1419,7 @@ public static HoodieTableFileSystemView getFileSystemViewForMetadataTable(Hoodie // default FileSystemView will not return any file slices even // though we may have initialized them. HoodieTimeline timeline = metaClient.getActiveTimeline(); - TimelineFactory factory = metaClient.getTimelineLayout().getTimelineFactory(); + TimelineFactory factory = metaClient.getTableFormat().getTimelineFactory(); if (timeline.empty()) { final HoodieInstant instant = metaClient.createNewInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, HoodieInstantTimeGenerator.getCurrentInstantTimeStr()); diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/NativeTableMetadataFactory.java b/hudi-common/src/main/java/org/apache/hudi/metadata/NativeTableMetadataFactory.java new file mode 100644 index 000000000000..106b6b5c758b --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/NativeTableMetadataFactory.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.storage.HoodieStorage; + +public class NativeTableMetadataFactory extends TableMetadataFactory { + private static final NativeTableMetadataFactory INSTANCE = new NativeTableMetadataFactory(); + + public static NativeTableMetadataFactory getInstance() { + return INSTANCE; + } + + @Override + public HoodieTableMetadata create(HoodieEngineContext engineContext, HoodieStorage storage, + HoodieMetadataConfig metadataConfig, String datasetBasePath, boolean reuse) { + if (metadataConfig.isEnabled()) { + HoodieBackedTableMetadata metadata = createHoodieBackedTableMetadata(engineContext, storage, metadataConfig, datasetBasePath, reuse); + // If the MDT is not initialized then we fallback to FSBackedTableMetadata + if (metadata.isMetadataTableInitialized()) { + return metadata; + } + LOG.warn("Falling back to FileSystemBackedTableMetadata as metadata table is not initialized"); + } + return createFSBackedTableMetadata(engineContext, storage, datasetBasePath); + } + + private FileSystemBackedTableMetadata createFSBackedTableMetadata(HoodieEngineContext engineContext, + HoodieStorage storage, + String datasetBasePath) { + return new FileSystemBackedTableMetadata(engineContext, storage, datasetBasePath); + } + + private HoodieBackedTableMetadata createHoodieBackedTableMetadata(HoodieEngineContext engineContext, + HoodieStorage storage, + HoodieMetadataConfig metadataConfig, + String datasetBasePath, + boolean reuse) { + return new HoodieBackedTableMetadata(engineContext, storage, metadataConfig, datasetBasePath, reuse); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/TableMetadataFactory.java b/hudi-common/src/main/java/org/apache/hudi/metadata/TableMetadataFactory.java new file mode 100644 index 000000000000..4d113fdb2cba --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/TableMetadataFactory.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.storage.HoodieStorage; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Factory for {@link HoodieTableMetadata} instances; each table format supplies its own implementation. + */ +public abstract class TableMetadataFactory { + + protected static final Logger LOG = LoggerFactory.getLogger(TableMetadataFactory.class); + + // Convenience overload that defaults reuse to false. + public HoodieTableMetadata create(HoodieEngineContext engineContext, + HoodieStorage storage, + HoodieMetadataConfig metadataConfig, + String datasetBasePath) { + return create(engineContext, storage, metadataConfig, datasetBasePath, false); + } + + public abstract HoodieTableMetadata create(HoodieEngineContext engineContext, + HoodieStorage storage, + HoodieMetadataConfig metadataConfig, + String datasetBasePath, + boolean reuse); + +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/read/TestHoodieFileGroupReaderBase.java b/hudi-common/src/test/java/org/apache/hudi/common/table/read/TestHoodieFileGroupReaderBase.java index b35d6ce9c84d..af0889c289b7 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/read/TestHoodieFileGroupReaderBase.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/read/TestHoodieFileGroupReaderBase.java @@ -52,7 +52,6 @@ import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; -import org.apache.hudi.metadata.HoodieTableMetadata; import org.apache.hudi.storage.StorageConfiguration; import com.fasterxml.jackson.databind.ObjectMapper; @@ -347,13 +346,11 @@ private List<FileSlice> getFileSlicesToRead(StorageConfiguration<?> storageConf, metadataConfig, FileSystemViewStorageConfig.newBuilder().build(), HoodieCommonConfig.newBuilder().build(), - mc -> HoodieTableMetadata.create( + mc -> metaClient.getTableFormat().getMetadataFactory().create( engineContext, mc.getStorage(), metadataConfig, tablePath)); HoodieTableFileSystemView fsView = (HoodieTableFileSystemView) viewManager.getFileSystemView(metaClient); - List<String> relativePartitionPathList = FSUtils.getAllPartitionPaths( - engineContext, metaClient.getStorage(), - metadataConfig, metaClient.getBasePath().toString()); + List<String> relativePartitionPathList = FSUtils.getAllPartitionPaths(engineContext, metaClient, metadataConfig); List<FileSlice> fileSlices = relativePartitionPathList.stream().flatMap(fsView::getAllFileSlices) .collect(Collectors.toList()); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java index 4a35c5c7a192..79b32c2b1544 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java @@ -236,6 +236,9 @@ public static HoodieTableMetaClient.TableBuilder getMetaClientBuilder(HoodieTabl if (properties.containsKey("hoodie.write.table.version")) { builder.setTableVersion(Integer.parseInt(properties.getProperty("hoodie.write.table.version"))); } + if (properties.containsKey(HoodieTableConfig.TABLE_FORMAT.key())) { + builder.setTableFormat(properties.getProperty(HoodieTableConfig.TABLE_FORMAT.key())); + } String keyGen = properties.getProperty("hoodie.datasource.write.keygenerator.class"); if (!Objects.equals(keyGen, "org.apache.hudi.keygen.NonpartitionedKeyGenerator") diff --git
a/hudi-common/src/test/java/org/apache/hudi/tableformat/TestActiveTimeline.java b/hudi-common/src/test/java/org/apache/hudi/tableformat/TestActiveTimeline.java new file mode 100644 index 000000000000..bd3106a68e11 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/tableformat/TestActiveTimeline.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.tableformat; + +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.versioning.v2.ActiveTimelineV2; +import org.apache.hudi.common.table.timeline.versioning.v2.InstantComparatorV2; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * An active timeline for the test table format that merges the Hoodie timeline with the instants + * recorded by test-format, treating test-format as the source of truth. + */ +public class TestActiveTimeline extends ActiveTimelineV2 { + + public TestActiveTimeline( + HoodieTableMetaClient metaClient, + Set<String> includedExtensions, + boolean applyLayoutFilters) { + this.setInstants(getInstantsFromFileSystem(metaClient, includedExtensions, applyLayoutFilters)); + this.metaClient = metaClient; + } + + public TestActiveTimeline(HoodieTableMetaClient metaClient) { + this(metaClient, Collections.unmodifiableSet(VALID_EXTENSIONS_IN_ACTIVE_TIMELINE), true); + } + + public TestActiveTimeline(HoodieTableMetaClient metaClient, boolean applyLayoutFilters) { + this( + metaClient, + Collections.unmodifiableSet(VALID_EXTENSIONS_IN_ACTIVE_TIMELINE), + applyLayoutFilters); + } + + // No-arg variant used by TestTimelineFactory#createActiveTimeline(). + public TestActiveTimeline() { + } + + @Override + protected List<HoodieInstant> getInstantsFromFileSystem( + HoodieTableMetaClient metaClient, + Set<String> includedExtensions, + boolean applyLayoutFilters) { + Map<String, HoodieInstant> instantsInTestTableFormat = TestTableFormat.getRecordedInstants(metaClient.getBasePath().toString()) + .stream() + .collect(Collectors.toMap(HoodieInstant::requestedTime, instant -> instant)); + List<HoodieInstant> instantsFromHoodieTimeline = + super.getInstantsFromFileSystem(metaClient, includedExtensions, applyLayoutFilters); + // Instants the Hoodie timeline considers completed but test-format has not recorded are downgraded to inflight. + List<HoodieInstant> inflightInstantsInTestTableFormat = + instantsFromHoodieTimeline.stream() + .filter( + hoodieInstant -> !instantsInTestTableFormat.containsKey(hoodieInstant.requestedTime())) + .map( + instant -> { + if (instant.isCompleted()) { + return new HoodieInstant( + HoodieInstant.State.INFLIGHT, + instant.getAction(), + instant.requestedTime(), + instant.getCompletionTime(), + InstantComparatorV2.REQUESTED_TIME_BASED_COMPARATOR); + } + return instant; + }) + .collect(Collectors.toList()); + // Instants recorded by test-format count as completed only if the Hoodie timeline also knows them. + List<HoodieInstant> completedInstantsInTestTableFormat = + instantsInTestTableFormat.values().stream()
+ .filter(instantsFromHoodieTimeline::contains) + .collect(Collectors.toList()); + return Stream.concat(completedInstantsInTestTableFormat.stream(), inflightInstantsInTestTableFormat.stream()) + .sorted(InstantComparatorV2.REQUESTED_TIME_BASED_COMPARATOR) + .collect(Collectors.toList()); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/tableformat/TestFormatBackedTableMetadata.java b/hudi-common/src/test/java/org/apache/hudi/tableformat/TestFormatBackedTableMetadata.java new file mode 100644 index 000000000000..dfdf494f9414 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/tableformat/TestFormatBackedTableMetadata.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.tableformat; + +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.metadata.FileSystemBackedTableMetadata; +import org.apache.hudi.storage.HoodieStorage; + +/** + * A test extension of {@link FileSystemBackedTableMetadata} used by test-format. + */ +public class TestFormatBackedTableMetadata extends FileSystemBackedTableMetadata { + + public TestFormatBackedTableMetadata( + HoodieEngineContext engineContext, HoodieStorage storage, String datasetBasePath) { + super(engineContext, storage, datasetBasePath); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/tableformat/TestTableFormat.java b/hudi-common/src/test/java/org/apache/hudi/tableformat/TestTableFormat.java new file mode 100644 index 000000000000..43d45af8ad14 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/tableformat/TestTableFormat.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.hudi.tableformat; + +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.common.HoodieTableFormat; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.TimelineFactory; +import org.apache.hudi.common.table.view.FileSystemViewManager; +import org.apache.hudi.metadata.TableMetadataFactory; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.function.Supplier; + +/** + * Test implementation of {@link HoodieTableFormat} that records all Hoodie instants in memory. + * Used for functional testing of HoodieTableFormat. + */ +public class TestTableFormat implements HoodieTableFormat { + + private static final Map<String, List<HoodieInstant>> RECORDED_INSTANTS = new ConcurrentHashMap<>(); + + public TestTableFormat() { + } + + public static List<HoodieInstant> getRecordedInstants(String basePath) { + return RECORDED_INSTANTS.getOrDefault(basePath, Collections.emptyList()); + } + + public static void tearDown() { + RECORDED_INSTANTS.clear(); + } + + @Override + public String getName() { + return "test-format"; + } + + @Override + public void commit(HoodieCommitMetadata commitMetadata, HoodieInstant completedInstant, + HoodieEngineContext engineContext, HoodieTableMetaClient metaClient, + FileSystemViewManager viewManager) { + RECORDED_INSTANTS.computeIfAbsent(metaClient.getBasePath().toString(), k -> new CopyOnWriteArrayList<>()).add(completedInstant); + } + + @Override + public void clean(HoodieCleanMetadata cleanMetadata, HoodieInstant completedInstant, + HoodieEngineContext engineContext, HoodieTableMetaClient metaClient, + FileSystemViewManager viewManager) { + // computeIfAbsent guards against a clean arriving before any commit was recorded for this table. + RECORDED_INSTANTS.computeIfAbsent(metaClient.getBasePath().toString(), k -> new CopyOnWriteArrayList<>()).add(completedInstant); + } + + @Override + public void archive(Supplier<List<HoodieInstant>> archivedInstants, HoodieEngineContext engineContext, + HoodieTableMetaClient metaClient, FileSystemViewManager viewManager) { + List<HoodieInstant> recorded = RECORDED_INSTANTS.get(metaClient.getBasePath().toString()); + if (recorded != null) { + recorded.removeAll(archivedInstants.get()); + } + } + + @Override + public void rollback(HoodieInstant completedInstant, HoodieEngineContext engineContext, + HoodieTableMetaClient metaClient, FileSystemViewManager viewManager) { + // No-op.
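+ // Intentionally empty: the test format only tracks completed instants, so an inflight rollback + // leaves the recorded state untouched; completedRollback(...) records the rollback instant instead.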
+ } + + @Override + public void completedRollback(HoodieInstant rollbackInstant, HoodieEngineContext engineContext, + HoodieTableMetaClient metaClient, FileSystemViewManager viewManager) { + RECORDED_INSTANTS.computeIfAbsent(metaClient.getBasePath().toString(), k -> new CopyOnWriteArrayList<>()).add(rollbackInstant); + } + + @Override + public void savepoint(HoodieInstant savepointInstant, HoodieEngineContext engineContext, + HoodieTableMetaClient metaClient, FileSystemViewManager viewManager) { + RECORDED_INSTANTS.computeIfAbsent(metaClient.getBasePath().toString(), k -> new CopyOnWriteArrayList<>()).add(savepointInstant); + } + + @Override + public TimelineFactory getTimelineFactory() { + // The test factory ignores its HoodieConfig argument, so null is acceptable here. + return new TestTimelineFactory(null); + } + + @Override + public TableMetadataFactory getMetadataFactory() { + return TestTableMetadataFactory.getInstance(); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/tableformat/TestTableMetadataFactory.java b/hudi-common/src/test/java/org/apache/hudi/tableformat/TestTableMetadataFactory.java new file mode 100644 index 000000000000..033c0f6cd8d4 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/tableformat/TestTableMetadataFactory.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.tableformat; + +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.TableMetadataFactory; +import org.apache.hudi.storage.HoodieStorage; + +/** + * The test implementation of {@link TableMetadataFactory} used by test-format. + */ +public class TestTableMetadataFactory extends TableMetadataFactory { + private static final TestTableMetadataFactory INSTANCE = new TestTableMetadataFactory(); + + public static TestTableMetadataFactory getInstance() { + return INSTANCE; + } + + @Override + public HoodieTableMetadata create( + HoodieEngineContext engineContext, + HoodieStorage storage, + HoodieMetadataConfig metadataConfig, + String datasetBasePath, + boolean reuse) { + return new TestFormatBackedTableMetadata(engineContext, storage, datasetBasePath); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/tableformat/TestTimelineFactory.java b/hudi-common/src/test/java/org/apache/hudi/tableformat/TestTimelineFactory.java new file mode 100644 index 000000000000..87a1b967b46d --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/tableformat/TestTimelineFactory.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.tableformat; + +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.ArchivedTimelineLoader; +import org.apache.hudi.common.table.timeline.CompletionTimeQueryView; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieInstantReader; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineFactory; +import org.apache.hudi.common.table.timeline.versioning.v2.ArchivedTimelineLoaderV2; +import org.apache.hudi.common.table.timeline.versioning.v2.ArchivedTimelineV2; +import org.apache.hudi.common.table.timeline.versioning.v2.BaseTimelineV2; +import org.apache.hudi.common.table.timeline.versioning.v2.CompletionTimeQueryViewV2; + +import java.util.stream.Stream; + +/** + * The test implementation of {@link TimelineFactory} used for functional testing. + */ +public class TestTimelineFactory extends TimelineFactory { + + public TestTimelineFactory(HoodieConfig config) { + // The HoodieConfig parameter keeps the signature expected by reflective instantiation; the config itself is unused.
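+ // In these tests, instances are typically obtained via TestTableFormat#getTimelineFactory() rather than reflection.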
+ } + + @Override + public HoodieTimeline createDefaultTimeline(Stream<HoodieInstant> instants, HoodieInstantReader instantReader) { + return new BaseTimelineV2(instants, instantReader); + } + + @Override + public HoodieActiveTimeline createActiveTimeline() { + return new TestActiveTimeline(); + } + + @Override + public HoodieArchivedTimeline createArchivedTimeline(HoodieTableMetaClient metaClient) { + return new ArchivedTimelineV2(metaClient); + } + + @Override + public HoodieArchivedTimeline createArchivedTimeline(HoodieTableMetaClient metaClient, String startTs) { + return new ArchivedTimelineV2(metaClient, startTs); + } + + @Override + public ArchivedTimelineLoader createArchivedTimelineLoader() { + return new ArchivedTimelineLoaderV2(); + } + + @Override + public HoodieActiveTimeline createActiveTimeline(HoodieTableMetaClient metaClient) { + return new TestActiveTimeline(metaClient); + } + + @Override + public HoodieActiveTimeline createActiveTimeline(HoodieTableMetaClient metaClient, boolean applyLayoutFilter) { + return new TestActiveTimeline(metaClient, applyLayoutFilter); + } + + @Override + public CompletionTimeQueryView createCompletionTimeQueryView(HoodieTableMetaClient metaClient) { + return new CompletionTimeQueryViewV2(metaClient); + } + + @Override + public CompletionTimeQueryView createCompletionTimeQueryView(HoodieTableMetaClient metaClient, String eagerInstant) { + return new CompletionTimeQueryViewV2(metaClient, eagerInstant); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java index d191bc9a8250..4b5287280530 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java @@ -420,6 +420,13 @@ private FlinkOptions() { .defaultValue(HoodieWriteConfig.WRITE_TABLE_VERSION.defaultValue()) .withDescription("Table version produced by this writer."); + + @AdvancedConfig + public static final ConfigOption<String> WRITE_TABLE_FORMAT = ConfigOptions + .key(HoodieTableConfig.TABLE_FORMAT.key()) + .stringType() + .defaultValue(HoodieTableConfig.TABLE_FORMAT.defaultValue()) + .withDescription("Table format produced by this writer."); + + /** * Flag to indicate whether to drop duplicates before insert/upsert. * By default false to gain extra performance.
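For context, a minimal sketch of how a Flink writer could opt into a pluggable table format through the option added above; the table path and the "test-format" value are illustrative placeholders, not part of this patch:

    import org.apache.flink.configuration.Configuration;
    import org.apache.hudi.configuration.FlinkOptions;

    Configuration flinkConf = new Configuration();
    // Existing option: base path of the Hudi table (illustrative location).
    flinkConf.set(FlinkOptions.PATH, "file:///tmp/hudi_demo");
    // New option added above; its key resolves to HoodieTableConfig.TABLE_FORMAT.
    flinkConf.set(FlinkOptions.WRITE_TABLE_FORMAT, "test-format");
    // The chosen format is persisted in the table config at creation time; readers then resolve the
    // format's TimelineFactory and TableMetadataFactory via HoodieTableMetaClient#getTableFormat().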
diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java index 8ab6547a6236..a44ce02272d7 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java @@ -145,8 +145,7 @@ protected void preLoadIndexRecords() throws Exception { StoragePath basePath = hoodieTable.getMetaClient().getBasePath(); int taskID = getRuntimeContext().getIndexOfThisSubtask(); LOG.info("Start loading records in table {} into the index state, taskId = {}", basePath, taskID); - for (String partitionPath : FSUtils.getAllPartitionPaths( - new HoodieFlinkEngineContext(hadoopConf), hoodieTable.getStorage(), metadataConfig(conf), basePath)) { + for (String partitionPath : FSUtils.getAllPartitionPaths(new HoodieFlinkEngineContext(hadoopConf), hoodieTable.getMetaClient(), metadataConfig(conf))) { if (pattern.matcher(partitionPath).matches()) { loadRecords(partitionPath); } diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/FileIndex.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/FileIndex.java index 351137c3d51b..273bbb581031 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/FileIndex.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/FileIndex.java @@ -21,16 +21,15 @@ import org.apache.hudi.client.common.HoodieFlinkEngineContext; import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.configuration.HadoopConfigurations; -import org.apache.hudi.hadoop.fs.HadoopFSUtils; import org.apache.hudi.index.bucket.BucketIdentifier; import org.apache.hudi.source.prune.ColumnStatsProbe; import org.apache.hudi.source.prune.PartitionPruners; import org.apache.hudi.source.stats.FileStatsIndex; import org.apache.hudi.storage.StoragePath; import org.apache.hudi.storage.StoragePathInfo; -import org.apache.hudi.storage.hadoop.HoodieHadoopStorage; import org.apache.hudi.util.StreamerUtil; import org.apache.flink.annotation.VisibleForTesting; @@ -71,11 +70,13 @@ public class FileIndex implements Serializable { private final Function partitionBucketIdFunc; // for bucket pruning private List partitionPaths; // cache of partition paths private final FileStatsIndex fileStatsIndex; // for data skipping + private final HoodieTableMetaClient metaClient; private FileIndex( StoragePath path, Configuration conf, RowType rowType, + HoodieTableMetaClient metaClient, ColumnStatsProbe colStatsProbe, PartitionPruners.PartitionPruner partitionPruner, Function partitionBucketIdFunc) { @@ -85,8 +86,9 @@ private FileIndex( this.metadataConfig = StreamerUtil.metadataConfig(conf); this.colStatsProbe = isDataSkippingFeasible(conf.get(FlinkOptions.READ_DATA_SKIPPING_ENABLED)) ? 
colStatsProbe : null; this.partitionPruner = partitionPruner; - this.fileStatsIndex = new FileStatsIndex(path.toString(), rowType, metadataConfig); + this.fileStatsIndex = new FileStatsIndex(path.toString(), rowType, conf, metaClient); this.partitionBucketIdFunc = partitionBucketIdFunc; + this.metaClient = metaClient; } /** @@ -158,8 +160,7 @@ public List getFilesInPartitions() { return Collections.emptyList(); } Map> filesInPartitions = FSUtils.getFilesInPartitions( - new HoodieFlinkEngineContext(hadoopConf), - new HoodieHadoopStorage(path, HadoopFSUtils.getStorageConf(hadoopConf)), metadataConfig, path.toString(), partitions); + new HoodieFlinkEngineContext(hadoopConf), metaClient, metadataConfig, partitions); int totalFilesNum = filesInPartitions.values().stream().mapToInt(List::size).sum(); if (totalFilesNum < 1) { // returns early for empty table. @@ -232,9 +233,7 @@ public List getOrBuildPartitionPaths() { if (this.partitionPaths != null) { return this.partitionPaths; } - List allPartitionPaths = this.tableExists ? FSUtils.getAllPartitionPaths( - new HoodieFlinkEngineContext(hadoopConf), - new HoodieHadoopStorage(path, HadoopFSUtils.getStorageConf(hadoopConf)), metadataConfig, path.toString()) + List allPartitionPaths = this.tableExists ? FSUtils.getAllPartitionPaths(new HoodieFlinkEngineContext(hadoopConf), metaClient, metadataConfig) : Collections.emptyList(); if (this.partitionPruner == null) { this.partitionPaths = allPartitionPaths; @@ -288,6 +287,7 @@ public static class Builder { private StoragePath path; private Configuration conf; private RowType rowType; + private HoodieTableMetaClient metaClient; private ColumnStatsProbe columnStatsProbe; private PartitionPruners.PartitionPruner partitionPruner; private Function partitionBucketIdFunc; @@ -310,6 +310,11 @@ public Builder rowType(RowType rowType) { return this; } + public Builder metaClient(HoodieTableMetaClient metaClient) { + this.metaClient = metaClient; + return this; + } + public Builder columnStatsProbe(ColumnStatsProbe columnStatsProbe) { this.columnStatsProbe = columnStatsProbe; return this; @@ -327,7 +332,7 @@ public Builder partitionBucketIdFunc(Function partitionBucketId public FileIndex build() { return new FileIndex(Objects.requireNonNull(path), Objects.requireNonNull(conf), Objects.requireNonNull(rowType), - columnStatsProbe, partitionPruner, partitionBucketIdFunc); + metaClient, columnStatsProbe, partitionPruner, partitionBucketIdFunc); } } } diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java index 60bef1f3a929..80256c6dcd81 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java @@ -177,7 +177,7 @@ public Result inputSplits( final List fileInfoList; if (fullTableScan) { // scans the partitions and files directly. - FileIndex fileIndex = getFileIndex(); + FileIndex fileIndex = getFileIndex(metaClient); readPartitions = new TreeSet<>(fileIndex.getOrBuildPartitionPaths()); if (readPartitions.size() == 0) { LOG.warn("No partitions found for reading in user provided path."); @@ -208,7 +208,7 @@ public Result inputSplits( LOG.warn("Found deleted files in metadata, fall back to full table scan."); // fallback to full table scan // reading from the earliest, scans the partitions and files directly. 
- FileIndex fileIndex = getFileIndex(); + FileIndex fileIndex = getFileIndex(metaClient); readPartitions = new TreeSet<>(fileIndex.getOrBuildPartitionPaths()); if (readPartitions.size() == 0) { LOG.warn("No partitions found for reading in user provided path."); @@ -275,7 +275,7 @@ public Result inputSplits( if (instantRange.isEmpty()) { // reading from the earliest, scans the partitions and files directly. - FileIndex fileIndex = getFileIndex(); + FileIndex fileIndex = getFileIndex(metaClient); Set readPartitions = new TreeSet<>(fileIndex.getOrBuildPartitionPaths()); if (readPartitions.size() == 0) { @@ -411,11 +411,12 @@ private static Stream getFileSlices( : fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, endInstant); } - private FileIndex getFileIndex() { + private FileIndex getFileIndex(HoodieTableMetaClient metaClient) { return FileIndex.builder() .path(new StoragePath(path.toUri())) .conf(conf) .rowType(rowType) + .metaClient(metaClient) .partitionPruner(partitionPruner) .build(); } diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/prune/PartitionPruners.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/prune/PartitionPruners.java index 955f471df31c..f80ba96bda88 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/prune/PartitionPruners.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/prune/PartitionPruners.java @@ -18,7 +18,7 @@ package org.apache.hudi.source.prune; -import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.source.ExpressionEvaluators; import org.apache.hudi.source.ExpressionEvaluators.Evaluator; @@ -26,12 +26,13 @@ import org.apache.hudi.source.stats.PartitionStatsIndex; import org.apache.hudi.table.format.FilePathUtils; import org.apache.hudi.util.DataTypeUtils; -import org.apache.hudi.util.StreamerUtil; import org.apache.flink.configuration.Configuration; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.RowType; +import javax.annotation.Nullable; + import java.io.Serializable; import java.util.ArrayList; import java.util.Collection; @@ -147,10 +148,11 @@ public static class ColumnStatsPartitionPruner implements PartitionPruner { public ColumnStatsPartitionPruner( RowType rowType, String basePath, - HoodieMetadataConfig metadataConfig, - ColumnStatsProbe probe) { + Configuration conf, + ColumnStatsProbe probe, + @Nullable HoodieTableMetaClient metaClient) { this.probe = probe; - this.partitionStatsIndex = new PartitionStatsIndex(basePath, rowType, metadataConfig); + this.partitionStatsIndex = new PartitionStatsIndex(basePath, rowType, conf, metaClient); } @Override @@ -190,6 +192,7 @@ public static Builder builder() { public static class Builder { private RowType rowType; private String basePath; + private HoodieTableMetaClient metaClient; private Configuration conf; private ColumnStatsProbe probe; private List partitionEvaluators; @@ -212,6 +215,11 @@ public Builder basePath(String basePath) { return this; } + public Builder metaClient(HoodieTableMetaClient metaClient) { + this.metaClient = metaClient; + return this; + } + public Builder conf(Configuration conf) { this.conf = conf; return this; @@ -267,8 +275,8 @@ public PartitionPruner build() { if (probe != null && conf.get(FlinkOptions.READ_DATA_SKIPPING_ENABLED) && 
conf.get(FlinkOptions.METADATA_ENABLED)) { - columnStatsPruner = new ColumnStatsPartitionPruner(Objects.requireNonNull(rowType), Objects.requireNonNull(basePath), - StreamerUtil.metadataConfig(Objects.requireNonNull(conf)), probe); + columnStatsPruner = new ColumnStatsPartitionPruner(Objects.requireNonNull(rowType), Objects.requireNonNull(basePath), Objects.requireNonNull(conf), + probe, metaClient); } List partitionPruners = Stream.of(staticPruner, dynamicPruner, columnStatsPruner) diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/stats/FileStatsIndex.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/stats/FileStatsIndex.java index 0b3ffeff7d5f..b01f4c11c542 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/stats/FileStatsIndex.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/stats/FileStatsIndex.java @@ -20,9 +20,9 @@ import org.apache.hudi.avro.model.HoodieMetadataRecord; import org.apache.hudi.client.common.HoodieFlinkEngineContext; -import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.VisibleForTesting; import org.apache.hudi.common.util.collection.Pair; @@ -33,13 +33,13 @@ import org.apache.hudi.metadata.HoodieTableMetadata; import org.apache.hudi.metadata.HoodieTableMetadataUtil; import org.apache.hudi.source.prune.ColumnStatsProbe; -import org.apache.hudi.storage.hadoop.HoodieHadoopStorage; import org.apache.hudi.util.AvroToRowDataConverters; import org.apache.hudi.util.DataTypeUtils; -import org.apache.hudi.util.FlinkClientUtil; import org.apache.hudi.util.RowDataProjection; +import org.apache.hudi.util.StreamerUtil; import org.apache.avro.generic.GenericRecord; +import org.apache.flink.configuration.Configuration; import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; import org.apache.flink.table.data.StringData; @@ -82,16 +82,19 @@ public class FileStatsIndex implements ColumnStatsIndex { private static final Logger LOG = LoggerFactory.getLogger(FileStatsIndex.class); private final RowType rowType; private final String basePath; - private final HoodieMetadataConfig metadataConfig; + private final Configuration conf; + private HoodieTableMetaClient metaClient; private HoodieTableMetadata metadataTable; public FileStatsIndex( String basePath, RowType rowType, - HoodieMetadataConfig metadataConfig) { + Configuration conf, + @Nullable HoodieTableMetaClient metaClient) { this.basePath = basePath; this.rowType = rowType; - this.metadataConfig = metadataConfig; + this.conf = conf; + this.metaClient = metaClient; } @Override @@ -102,15 +105,22 @@ public String getIndexPartitionName() { public HoodieTableMetadata getMetadataTable() { // initialize the metadata table lazily if (this.metadataTable == null) { - this.metadataTable = HoodieTableMetadata.create( + initMetaClient(); + this.metadataTable = metaClient.getTableFormat().getMetadataFactory().create( HoodieFlinkEngineContext.DEFAULT, - new HoodieHadoopStorage(basePath, FlinkClientUtil.getHadoopConf()), - metadataConfig, + metaClient.getStorage(), + StreamerUtil.metadataConfig(conf), basePath); } return this.metadataTable; } + private void initMetaClient() { + if (this.metaClient == null) { + this.metaClient = 
StreamerUtil.createMetaClient(conf); + } + } + @Override public Set computeCandidateFiles(ColumnStatsProbe probe, List allFiles) { if (probe == null) { diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/stats/PartitionStatsIndex.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/stats/PartitionStatsIndex.java index e2f6431e77db..1facae5671d9 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/stats/PartitionStatsIndex.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/stats/PartitionStatsIndex.java @@ -19,12 +19,15 @@ package org.apache.hudi.source.stats; import org.apache.hudi.avro.model.HoodieMetadataColumnStats; -import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.metadata.HoodieTableMetadataUtil; import org.apache.hudi.source.prune.ColumnStatsProbe; +import org.apache.flink.configuration.Configuration; import org.apache.flink.table.types.logical.RowType; +import javax.annotation.Nullable; + import java.util.List; import java.util.Set; @@ -37,8 +40,9 @@ public class PartitionStatsIndex extends FileStatsIndex { public PartitionStatsIndex( String basePath, RowType tableRowType, - HoodieMetadataConfig metadataConfig) { - super(basePath, tableRowType, metadataConfig); + Configuration conf, + @Nullable HoodieTableMetaClient metaClient) { + super(basePath, tableRowType, conf, metaClient); } @Override diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java index 5b944682f22a..a764d8f175cd 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java @@ -113,7 +113,6 @@ import java.util.Collection; import java.util.Collections; import java.util.List; -import java.util.Optional; import java.util.Set; import java.util.StringJoiner; import java.util.concurrent.atomic.AtomicInteger; @@ -191,16 +190,16 @@ public HoodieTableSource( this.partitionKeys = partitionKeys; this.defaultPartName = defaultPartName; this.conf = conf; - this.predicates = Optional.ofNullable(predicates).orElse(Collections.emptyList()); + this.predicates = Option.ofNullable(predicates).orElse(Collections.emptyList()); this.columnStatsProbe = columnStatsProbe; this.partitionPruner = partitionPruner; this.dataBucketFunc = dataBucketFunc; - this.requiredPos = Optional.ofNullable(requiredPos).orElseGet(() -> IntStream.range(0, this.tableRowType.getFieldCount()).toArray()); - this.limit = Optional.ofNullable(limit).orElse(NO_LIMIT_CONSTANT); + this.requiredPos = Option.ofNullable(requiredPos).orElseGet(() -> IntStream.range(0, this.tableRowType.getFieldCount()).toArray()); + this.limit = Option.ofNullable(limit).orElse(NO_LIMIT_CONSTANT); this.hadoopConf = new HadoopStorageConfiguration(HadoopConfigurations.getHadoopConf(conf)); - this.metaClient = Optional.ofNullable(metaClient).orElseGet(() -> StreamerUtil.metaClientForReader(conf, this.hadoopConf.unwrap())); + this.metaClient = Option.ofNullable(metaClient).orElseGet(() -> StreamerUtil.metaClientForReader(conf, this.hadoopConf.unwrap())); this.maxCompactionMemoryInBytes = StreamerUtil.getMaxCompactionMemoryInBytes(conf); - this.internalSchemaManager = 
Optional.ofNullable(internalSchemaManager).orElseGet(() -> InternalSchemaManager.get(this.conf, this.metaClient)); + this.internalSchemaManager = Option.ofNullable(internalSchemaManager).orElseGet(() -> InternalSchemaManager.get(this.conf, this.metaClient)); } @Override @@ -358,6 +357,7 @@ private PartitionPruners.PartitionPruner createPartitionPruner(List> fileIdCommitTimeSet = new HashSet<>(); HoodieTableMetaClient metaClient = StreamerUtil.createMetaClient(conf); HoodieStorage storage = metaClient.getStorage(); - FSUtils.getAllPartitionPaths(HoodieFlinkEngineContext.DEFAULT, metaClient.getStorage(), metaClient.getBasePath(), false).forEach( + FSUtils.getAllPartitionPaths(HoodieFlinkEngineContext.DEFAULT, metaClient, false).forEach( partition -> { try { storage.listDirectEntries(FSUtils.constructAbsolutePath(metaClient.getBasePath(), partition)) diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestFileIndex.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestFileIndex.java index 6d35785e1b30..fadffd134601 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestFileIndex.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestFileIndex.java @@ -27,6 +27,7 @@ import org.apache.hudi.source.prune.PartitionPruners; import org.apache.hudi.storage.StoragePath; import org.apache.hudi.storage.StoragePathInfo; +import org.apache.hudi.util.StreamerUtil; import org.apache.hudi.utils.TestConfigurations; import org.apache.hudi.utils.TestData; @@ -82,7 +83,7 @@ void testFileListingUsingMetadata(boolean hiveStylePartitioning) throws Exceptio conf.setBoolean(HIVE_STYLE_PARTITIONING, hiveStylePartitioning); TestData.writeData(TestData.DATA_SET_INSERT, conf); FileIndex fileIndex = FileIndex.builder().path(new StoragePath(tempFile.getAbsolutePath())).conf(conf) - .rowType(TestConfigurations.ROW_TYPE).build(); + .rowType(TestConfigurations.ROW_TYPE).metaClient(StreamerUtil.createMetaClient(conf)).build(); List partitionKeys = Collections.singletonList("partition"); List> partitions = fileIndex.getPartitions(partitionKeys, PARTITION_DEFAULT_NAME.defaultValue(), @@ -106,7 +107,7 @@ void testFileListingUsingMetadataNonPartitionedTable() throws Exception { conf.setBoolean(METADATA_ENABLED, true); TestData.writeData(TestData.DATA_SET_INSERT, conf); FileIndex fileIndex = FileIndex.builder().path(new StoragePath(tempFile.getAbsolutePath())).conf(conf) - .rowType(TestConfigurations.ROW_TYPE).build(); + .rowType(TestConfigurations.ROW_TYPE).metaClient(StreamerUtil.createMetaClient(conf)).build(); List partitionKeys = Collections.singletonList(""); List> partitions = fileIndex.getPartitions(partitionKeys, PARTITION_DEFAULT_NAME.defaultValue(), false); @@ -147,7 +148,9 @@ void testFileListingWithDataSkipping() throws Exception { FileIndex fileIndex = FileIndex.builder() .path(new StoragePath(tempFile.getAbsolutePath())) - .conf(conf).rowType(TestConfigurations.ROW_TYPE_BIGINT) + .conf(conf) + .rowType(TestConfigurations.ROW_TYPE_BIGINT) + .metaClient(StreamerUtil.createMetaClient(conf)) .columnStatsProbe(ColumnStatsProbe.newInstance(Collections.singletonList(new CallExpression( FunctionIdentifier.of("greaterThan"), BuiltInFunctionDefinitions.GREATER_THAN, @@ -205,6 +208,7 @@ void testFileListingWithPartitionStatsPruning(HoodieTableType tableType) throws .path(new StoragePath(tempFile.getAbsolutePath())) .conf(conf) .rowType(TestConfigurations.ROW_TYPE) + 
.metaClient(StreamerUtil.createMetaClient(conf)) .partitionPruner(PartitionPruners.builder().rowType(TestConfigurations.ROW_TYPE).basePath(tempFile.getAbsolutePath()).conf(conf).columnStatsProbe(columnStatsProbe).build()) .build(); diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestIncrementalInputSplits.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestIncrementalInputSplits.java index f6b0fca83476..46bb2a012f4e 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestIncrementalInputSplits.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestIncrementalInputSplits.java @@ -391,7 +391,7 @@ void testInputSplitsWithPartitionStatsPruner(HoodieTableType tableType) throws E DataTypes.BOOLEAN()))); PartitionPruners.PartitionPruner partitionPruner = - PartitionPruners.builder().rowType(TestConfigurations.ROW_TYPE).basePath(basePath).conf(conf).columnStatsProbe(columnStatsProbe).build(); + PartitionPruners.builder().rowType(TestConfigurations.ROW_TYPE).basePath(basePath).metaClient(metaClient).conf(conf).columnStatsProbe(columnStatsProbe).build(); IncrementalInputSplits iis = IncrementalInputSplits.builder() .conf(conf) .path(new Path(basePath)) diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/stats/TestColumnStatsIndex.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/stats/TestColumnStatsIndex.java index 631be02c02c3..e911f9fdcbca 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/stats/TestColumnStatsIndex.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/stats/TestColumnStatsIndex.java @@ -21,6 +21,7 @@ import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.util.StreamerUtil; import org.apache.hudi.utils.TestConfigurations; import org.apache.hudi.utils.TestData; @@ -56,14 +57,10 @@ void testReadPartitionStatsIndex() throws Exception { conf.set(FlinkOptions.METADATA_ENABLED, true); conf.setString("hoodie.metadata.index.partition.stats.enable", "true"); conf.setString("hoodie.metadata.index.column.stats.enable", "true"); - HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder() - .enable(true) - .withMetadataIndexColumnStats(true) - .build(); TestData.writeData(TestData.DATA_SET_INSERT, conf); String[] queryColumns = {"uuid", "age"}; - PartitionStatsIndex indexSupport = new PartitionStatsIndex(path, TestConfigurations.ROW_TYPE, metadataConfig); + PartitionStatsIndex indexSupport = new PartitionStatsIndex(path, TestConfigurations.ROW_TYPE, conf, StreamerUtil.createMetaClient(conf)); List indexRows = indexSupport.readColumnStatsIndexByColumns(queryColumns); List results = indexRows.stream().map(Object::toString).sorted(String::compareTo).collect(Collectors.toList()); List expected = Arrays.asList( @@ -93,19 +90,15 @@ void testReadPartitionStatsIndex() throws Exception { void testTransposeColumnStatsIndex() throws Exception { final String path = tempFile.getAbsolutePath(); Configuration conf = TestConfigurations.getDefaultConf(path); - conf.setBoolean(FlinkOptions.METADATA_ENABLED, true); + conf.setBoolean(HoodieMetadataConfig.ENABLE.key(), true); conf.setBoolean(FlinkOptions.READ_DATA_SKIPPING_ENABLED, true); - conf.setString("hoodie.metadata.index.column.stats.enable", "true"); + 
conf.setBoolean(HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key(), true); - HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder() - .enable(true) - .withMetadataIndexColumnStats(true) - .build(); TestData.writeData(TestData.DATA_SET_INSERT, conf); // explicit query columns String[] queryColumns1 = {"uuid", "age"}; - FileStatsIndex indexSupport = new FileStatsIndex(path, TestConfigurations.ROW_TYPE, metadataConfig); + FileStatsIndex indexSupport = new FileStatsIndex(path, TestConfigurations.ROW_TYPE, conf, StreamerUtil.createMetaClient(conf)); List indexRows1 = indexSupport.readColumnStatsIndexByColumns(queryColumns1); Pair, String[]> transposedIndexTable1 = indexSupport.transposeColumnStatsIndex(indexRows1, queryColumns1); assertThat("The schema columns should sort by natural order", diff --git a/hudi-hadoop-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableConfig.java b/hudi-hadoop-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableConfig.java index 191c0e7ac846..97070b6266a6 100644 --- a/hudi-hadoop-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableConfig.java +++ b/hudi-hadoop-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableConfig.java @@ -290,7 +290,7 @@ public void testDropInvalidConfigs() { @Test public void testDefinedTableConfigs() { List> configProperties = HoodieTableConfig.definedTableConfigs(); - assertEquals(38, configProperties.size()); + assertEquals(39, configProperties.size()); configProperties.forEach(c -> { assertNotNull(c); assertFalse(c.doc().isEmpty()); diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java index 819ae0c1c527..b1bfa61d65ca 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java @@ -97,8 +97,7 @@ protected List getPartitions(Option partitionsLimit) throws IOE // Using FSUtils.getFS here instead of metaClient.getFS() since we don't want to count these listStatus // calls in metrics as they are not part of normal HUDI operation. 
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); - List partitionPaths = FSUtils.getAllPartitionPaths(engineContext, metaClient.getStorage(), metaClient.getBasePath(), - HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS); + List partitionPaths = FSUtils.getAllPartitionPaths(engineContext, metaClient, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS); // Sort partition so we can pick last N partitions by default Collections.sort(partitionPaths); if (!partitionPaths.isEmpty()) { diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java index 30deffa77bcc..c7a4d6314c59 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java @@ -26,6 +26,7 @@ import org.apache.hudi.common.model.HoodieAvroPayload; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; @@ -100,6 +101,7 @@ public KafkaConnectTransactionServices(KafkaConnectConfigs connectConfigs) throw .setRecordKeyFields(recordKeyFields) .setPartitionFields(partitionColumns) .setTableVersion(writeConfig.getWriteVersion()) + .setTableFormat(connectConfigs.getStringOrDefault(HoodieTableConfig.TABLE_FORMAT)) .setKeyGeneratorClassProp(writeConfig.getKeyGeneratorClass()) .fromProperties(connectConfigs.getProps()) .initTable(storageConf.newInstance(), tableBasePath)); diff --git a/hudi-platform-service/hudi-metaserver/hudi-metaserver-client/src/main/java/org/apache/hudi/common/table/timeline/HoodieMetaserverBasedTimeline.java b/hudi-platform-service/hudi-metaserver/hudi-metaserver-client/src/main/java/org/apache/hudi/common/table/timeline/HoodieMetaserverBasedTimeline.java index 0b31c0f6f4f7..4672de9bbbdc 100644 --- a/hudi-platform-service/hudi-metaserver/hudi-metaserver-client/src/main/java/org/apache/hudi/common/table/timeline/HoodieMetaserverBasedTimeline.java +++ b/hudi-platform-service/hudi-metaserver/hudi-metaserver-client/src/main/java/org/apache/hudi/common/table/timeline/HoodieMetaserverBasedTimeline.java @@ -60,9 +60,9 @@ protected void deleteInstantFile(HoodieInstant instant) { } @Override - protected void transitionStateToComplete(boolean shouldLock, HoodieInstant fromInstant, HoodieInstant toInstant, Option metadata) { + protected HoodieInstant transitionStateToComplete(boolean shouldLock, HoodieInstant fromInstant, HoodieInstant toInstant, Option metadata) { ValidationUtils.checkArgument(fromInstant.requestedTime().equals(toInstant.requestedTime())); - metaserverClient.transitionInstantState(databaseName, tableName, fromInstant, toInstant, + return metaserverClient.transitionInstantState(databaseName, tableName, fromInstant, toInstant, metadata.map(m -> convertMetadataToByteArray(m, metadataSerDeV2))); } diff --git a/hudi-platform-service/hudi-metaserver/hudi-metaserver-client/src/main/java/org/apache/hudi/metaserver/client/HoodieMetaserverClient.java b/hudi-platform-service/hudi-metaserver/hudi-metaserver-client/src/main/java/org/apache/hudi/metaserver/client/HoodieMetaserverClient.java index 
83ee52eba20c..f27c423510df 100644 --- a/hudi-platform-service/hudi-metaserver/hudi-metaserver-client/src/main/java/org/apache/hudi/metaserver/client/HoodieMetaserverClient.java +++ b/hudi-platform-service/hudi-metaserver/hudi-metaserver-client/src/main/java/org/apache/hudi/metaserver/client/HoodieMetaserverClient.java @@ -45,7 +45,7 @@ public interface HoodieMetaserverClient extends Serializable, AutoCloseable { void createNewInstant(String db, String tb, HoodieInstant instant, Option content); - void transitionInstantState(String db, String tb, HoodieInstant fromInstant, HoodieInstant toInstant, Option content); + HoodieInstant transitionInstantState(String db, String tb, HoodieInstant fromInstant, HoodieInstant toInstant, Option content); void deleteInstant(String db, String tb, HoodieInstant instant); diff --git a/hudi-platform-service/hudi-metaserver/hudi-metaserver-client/src/main/java/org/apache/hudi/metaserver/client/HoodieMetaserverClientImp.java b/hudi-platform-service/hudi-metaserver/hudi-metaserver-client/src/main/java/org/apache/hudi/metaserver/client/HoodieMetaserverClientImp.java index 409c3aeece40..3858f0a30368 100644 --- a/hudi-platform-service/hudi-metaserver/hudi-metaserver-client/src/main/java/org/apache/hudi/metaserver/client/HoodieMetaserverClientImp.java +++ b/hudi-platform-service/hudi-metaserver/hudi-metaserver-client/src/main/java/org/apache/hudi/metaserver/client/HoodieMetaserverClientImp.java @@ -136,11 +136,12 @@ public void createNewInstant(String db, String tb, HoodieInstant instant, Option } @Override - public void transitionInstantState(String db, String tb, HoodieInstant fromInstant, HoodieInstant toInstant, Option content) { + public HoodieInstant transitionInstantState(String db, String tb, HoodieInstant fromInstant, HoodieInstant toInstant, Option content) { exceptionWrapper(() -> this.client.transitionInstantState(db, tb, EntityConversions.toTHoodieInstant(fromInstant), EntityConversions.toTHoodieInstant(toInstant), getByteBuffer(content))).get(); + return toInstant; } @Override diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/commit/DatasetBulkInsertOverwriteTableCommitActionExecutor.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/commit/DatasetBulkInsertOverwriteTableCommitActionExecutor.java index 772e388f9113..4dc82a34064d 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/commit/DatasetBulkInsertOverwriteTableCommitActionExecutor.java +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/commit/DatasetBulkInsertOverwriteTableCommitActionExecutor.java @@ -48,9 +48,8 @@ public WriteOperationType getWriteOperationType() { protected Map> getPartitionToReplacedFileIds(HoodieData writeStatuses) { HoodieEngineContext context = writeClient.getEngineContext(); List partitionPaths = FSUtils.getAllPartitionPaths(context, - table.getStorage(), - writeConfig.getMetadataConfig(), - table.getMetaClient().getBasePath()); + table.getMetaClient(), + writeConfig.getMetadataConfig()); if (partitionPaths == null || partitionPaths.isEmpty()) { return Collections.emptyMap(); diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala index 8c9b9d1ca803..383f825f712e 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala +++ 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala @@ -299,9 +299,11 @@ class HoodieSparkSqlWriterInternal { if (StringUtils.nonEmpty(hoodieConfig.getString(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME))) hoodieConfig.getString(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME) else KeyGeneratorType.getKeyGeneratorClassName(hoodieConfig) + val tableFormat = hoodieConfig.getStringOrDefault(HoodieTableConfig.TABLE_FORMAT) HoodieTableMetaClient.newTableBuilder() .setTableType(tableType) .setTableVersion(tableVersion) + .setTableFormat(tableFormat) .setDatabaseName(databaseName) .setTableName(tblName) .setBaseFileFormat(baseFileFormat) @@ -326,6 +328,7 @@ class HoodieSparkSqlWriterInternal { .setRecordMergeStrategyId(recordMergeStrategyId) .setRecordMergeMode(RecordMergeMode.getValue(hoodieConfig.getString(HoodieWriteConfig.RECORD_MERGE_MODE))) .setMultipleBaseFileFormatsEnabled(hoodieConfig.getBoolean(HoodieTableConfig.MULTIPLE_BASE_FILE_FORMATS_ENABLE)) + .setTableFormat(hoodieConfig.getStringOrDefault(HoodieTableConfig.TABLE_FORMAT)) .initTable(HadoopFSUtils.getStorageConfWithCopy(sparkContext.hadoopConfiguration), path) } @@ -428,12 +431,13 @@ class HoodieSparkSqlWriterInternal { val keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(TypedProperties.copy(hoodieConfig.getProps)) val tableMetaClient = HoodieTableMetaClient.builder .setConf(HadoopFSUtils.getStorageConfWithCopy(sparkContext.hadoopConfiguration)) - .setBasePath(basePath.toString).build() + .setBasePath(basePath.toString) + .build() // Get list of partitions to delete val partitionsToDelete = if (parameters.contains(DataSourceWriteOptions.PARTITIONS_TO_DELETE.key())) { val partitionColsToDelete = parameters(DataSourceWriteOptions.PARTITIONS_TO_DELETE.key()).split(",") java.util.Arrays.asList(resolvePartitionWildcards(java.util.Arrays.asList(partitionColsToDelete: _*).asScala.toList, jsc, - tableMetaClient.getStorage, hoodieConfig, basePath.toString): _*) + tableMetaClient.getStorage, hoodieConfig, basePath.toString, tableMetaClient): _*) } else { val genericRecords = HoodieSparkUtils.createRdd(df, avroRecordName, avroRecordNamespace) genericRecords.map(gr => keyGenerator.getKey(gr).getPartitionPath).toJavaRDD().distinct().collect() @@ -597,13 +601,13 @@ class HoodieSparkSqlWriterInternal { * @return Pair of(boolean, table schema), where first entry will be true only if schema conversion is required. 
*/ private def resolvePartitionWildcards(partitions: List[String], jsc: JavaSparkContext, - storage: HoodieStorage, cfg: HoodieConfig, basePath: String): List[String] = { + storage: HoodieStorage, cfg: HoodieConfig, basePath: String, metaClient: HoodieTableMetaClient): List[String] = { //find out if any of the input partitions have wildcards //note:spark-sql may url-encode special characters (* -> %2A) var (wildcardPartitions, fullPartitions) = partitions.partition(partition => partition.matches(".*(\\*|%2A).*")) val allPartitions = FSUtils.getAllPartitionPaths(new HoodieSparkEngineContext(jsc): HoodieEngineContext, - storage, HoodieMetadataConfig.newBuilder().fromProperties(cfg.getProps).build(), basePath) + metaClient, HoodieMetadataConfig.newBuilder().fromProperties(cfg.getProps).build()) if (fullPartitions.nonEmpty) { fullPartitions = fullPartitions.filter(partition => allPartitions.contains(partition)) @@ -756,12 +760,14 @@ class HoodieSparkSqlWriterInternal { HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.key(), String.valueOf(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.defaultValue()) )) + val tableFormat = hoodieConfig.getStringOrDefault(HoodieTableConfig.TABLE_FORMAT) HoodieTableMetaClient.newTableBuilder() .setTableType(HoodieTableType.valueOf(tableType)) .setTableName(tableName) .setRecordKeyFields(recordKeyFields) .setTableVersion(tableVersion) + .setTableFormat(tableFormat) .setArchiveLogFolder(archiveLogFolder) .setPayloadClassName(payloadClass) .setRecordMergeMode(RecordMergeMode.getValue(hoodieConfig.getString(HoodieWriteConfig.RECORD_MERGE_MODE))) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkBaseIndexSupport.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkBaseIndexSupport.scala index 7ff4e4936c13..d47c8785ab26 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkBaseIndexSupport.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkBaseIndexSupport.scala @@ -42,7 +42,7 @@ abstract class SparkBaseIndexSupport(spark: SparkSession, metaClient: HoodieTableMetaClient) { @transient protected lazy val engineCtx = new HoodieSparkEngineContext(new JavaSparkContext(spark.sparkContext)) @transient protected lazy val metadataTable: HoodieTableMetadata = - HoodieTableMetadata.create(engineCtx, metaClient.getStorage, metadataConfig, metaClient.getBasePath.toString) + metaClient.getTableFormat.getMetadataFactory.create(engineCtx, metaClient.getStorage, metadataConfig, metaClient.getBasePath.toString) def getIndexName: String diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala index 69473ec355bb..e4ffcac2c542 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala @@ -180,7 +180,7 @@ class HoodieCatalogTable(val spark: SparkSession, var table: CatalogTable) exten def getPartitionPaths: Seq[String] = { val droppedPartitions = TimelineUtils.getDroppedPartitions(metaClient, org.apache.hudi.common.util.Option.empty(), org.apache.hudi.common.util.Option.empty()) - getAllPartitionPaths(spark, table, metaClient.getStorage) + 
getAllPartitionPaths(spark, table, metaClient) .filter(!droppedPartitions.contains(_)) } @@ -231,6 +231,7 @@ class HoodieCatalogTable(val spark: SparkSession, var table: CatalogTable) exten HoodieTableMetaClient.newTableBuilder() .fromProperties(properties) .setTableVersion(Integer.valueOf(getStringWithAltKeys(tableConfigs, HoodieWriteConfig.WRITE_TABLE_VERSION))) + .setTableFormat(getStringWithAltKeys(tableConfigs, HoodieTableConfig.TABLE_FORMAT)) .setDatabaseName(catalogDatabaseName) .setTableName(table.identifier.table) .setTableCreateSchema(schema.toString()) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala index c365a2cfcece..942adda6ff9a 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala @@ -66,26 +66,25 @@ object HoodieSqlCommonUtils extends SparkAdapterSupport { avroSchema.map(AvroConversionUtils.convertAvroSchemaToStructType) } - def getAllPartitionPaths(spark: SparkSession, table: CatalogTable, storage: HoodieStorage): Seq[String] = { + def getAllPartitionPaths(spark: SparkSession, table: CatalogTable, metaClient: HoodieTableMetaClient): Seq[String] = { val sparkEngine = new HoodieSparkEngineContext(new JavaSparkContext(spark.sparkContext)) val metadataConfig = { val properties = TypedProperties.fromMap((spark.sessionState.conf.getAllConfs ++ table.storage.properties ++ table.properties).asJava) HoodieMetadataConfig.newBuilder.fromProperties(properties).build() } - FSUtils.getAllPartitionPaths(sparkEngine, storage, metadataConfig, getTableLocation(table, spark)).asScala.toSeq + FSUtils.getAllPartitionPaths(sparkEngine, metaClient, metadataConfig).asScala.toSeq } def getFilesInPartitions(spark: SparkSession, table: CatalogTable, - storage: HoodieStorage, + metaClient: HoodieTableMetaClient, partitionPaths: Seq[String]): Map[String, Seq[StoragePathInfo]] = { val sparkEngine = new HoodieSparkEngineContext(new JavaSparkContext(spark.sparkContext)) val metadataConfig = { val properties = TypedProperties.fromMap((spark.sessionState.conf.getAllConfs ++ table.storage.properties ++ table.properties).asJava) HoodieMetadataConfig.newBuilder.fromProperties(properties).build() } - FSUtils.getFilesInPartitions(sparkEngine, storage, metadataConfig, getTableLocation(table, spark), - partitionPaths.toArray).asScala + FSUtils.getFilesInPartitions(sparkEngine, metaClient, metadataConfig, partitionPaths.toArray).asScala .map(e => (e._1, e._2.asScala.toSeq)) .toMap } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/RepairHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/RepairHoodieTableCommand.scala index 777d88f1bf1a..a713bdbebc6d 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/RepairHoodieTableCommand.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/RepairHoodieTableCommand.scala @@ -17,9 +17,6 @@ package org.apache.spark.sql.hudi.command -import org.apache.hudi.hadoop.fs.HadoopFSUtils -import org.apache.hudi.storage.HoodieStorageUtils - import org.apache.hadoop.fs.Path import org.apache.spark.sql.{AnalysisException, 
Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier @@ -87,9 +84,7 @@ case class RepairHoodieTableCommand(tableName: TableIdentifier, val total = partitionSpecsAndLocs.length val partitionList = partitionSpecsAndLocs.map(_._2.toString) val partitionStats = if (spark.sqlContext.conf.gatherFastStats && total > 0) { - HoodieSqlCommonUtils.getFilesInPartitions(spark, table, - HoodieStorageUtils.getStorage(partitionList.head, HadoopFSUtils.getStorageConf(spark.sessionState.newHadoopConf())), - partitionList) + HoodieSqlCommonUtils.getFilesInPartitions(spark, table, hoodieCatalogTable.metaClient, partitionList) .mapValues(statuses => PartitionStatistics(statuses.length, statuses.map(_.getLength).sum)) } else { Map.empty[String, PartitionStatistics] diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/cli/BootstrapExecutorUtils.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/cli/BootstrapExecutorUtils.java index 3edeac283cbb..325e8c6a5019 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/cli/BootstrapExecutorUtils.java +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/cli/BootstrapExecutorUtils.java @@ -244,6 +244,8 @@ private void initializeTable() throws IOException { TIMELINE_HISTORY_PATH.key(), TIMELINE_HISTORY_PATH.defaultValue())) .setPayloadClassName(cfg.payloadClass) .setBaseFileFormat(cfg.baseFileFormat) + .setTableFormat(props.getString(HoodieTableConfig.TABLE_FORMAT.key(), + HoodieTableConfig.TABLE_FORMAT.defaultValue())) .setBootstrapIndexClass(cfg.bootstrapIndexClass) .setBootstrapBasePath(bootstrapBasePath) .setCDCEnabled(props.getBoolean(HoodieTableConfig.CDC_ENABLED.key(), diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/PartitionBucketIndexManager.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/PartitionBucketIndexManager.scala index 0420de211d42..4b4f0e3261bd 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/PartitionBucketIndexManager.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/PartitionBucketIndexManager.scala @@ -146,7 +146,7 @@ class PartitionBucketIndexManager extends BaseProcedure val mdtEnable = metaClient.getStorage().exists(new StoragePath(metaClient.getBasePath, HoodieTableMetaClient.METADATA_TABLE_FOLDER_PATH)) // get all partition paths - val allPartitions = FSUtils.getAllPartitionPaths(context, metaClient.getStorage, metaClient.getBasePath, mdtEnable) + val allPartitions = FSUtils.getAllPartitionPaths(context, metaClient, mdtEnable) val usePartitionBucketIndexBefore = PartitionBucketIndexUtils.isPartitionSimpleBucketIndex(context.getStorageConf, basePath.toString) var partition2BucketWithLatestHashingConfig: util.Map[String, Integer] = null @@ -195,8 +195,8 @@ class PartitionBucketIndexManager extends BaseProcedure logInfo("Perform OVERWRITE with dry-run disabled.") val partitionsToRescale = rescalePartitionsMap.keys // get all fileSlices need to read - val allFilesMap = FSUtils.getFilesInPartitions(context, metaClient.getStorage(), HoodieMetadataConfig.newBuilder.enable(mdtEnable).build, - metaClient.getBasePath.toString, partitionsToRescale.map(relative => { + val allFilesMap = FSUtils.getFilesInPartitions(context, metaClient, HoodieMetadataConfig.newBuilder.enable(mdtEnable).build, + partitionsToRescale.map(relative => { new 
StoragePath(basePath, relative) }).map(storagePath => storagePath.toString).toArray) val files = allFilesMap.values().asScala.flatMap(x => x.asScala).toList diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairMigratePartitionMetaProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairMigratePartitionMetaProcedure.scala index 3cedd1a9cba3..1f7ebbfde237 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairMigratePartitionMetaProcedure.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairMigratePartitionMetaProcedure.scala @@ -63,7 +63,7 @@ class RepairMigratePartitionMetaProcedure extends BaseProcedure with ProcedureBu val metaClient = createMetaClient(jsc, tablePath) val engineContext: HoodieLocalEngineContext = new HoodieLocalEngineContext(metaClient.getStorageConf) - val partitionPaths: util.List[String] = FSUtils.getAllPartitionPaths(engineContext, metaClient.getStorage, tablePath, false) + val partitionPaths: util.List[String] = FSUtils.getAllPartitionPaths(engineContext, metaClient, false) val basePath: StoragePath = new StoragePath(tablePath) val rows = new util.ArrayList[Row](partitionPaths.size) diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowColumnStatsOverlapProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowColumnStatsOverlapProcedure.scala index 1d3bfd19aaa5..fa259c5970b9 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowColumnStatsOverlapProcedure.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowColumnStatsOverlapProcedure.scala @@ -25,7 +25,6 @@ import org.apache.hudi.common.data.HoodieData import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.common.model.{FileSlice, HoodieRecord} import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} -import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieInstantTimeGenerator} import org.apache.hudi.common.table.view.HoodieTableFileSystemView import org.apache.hudi.metadata.{HoodieTableMetadata, HoodieTableMetadataUtil} import org.apache.hudi.storage.StoragePath @@ -35,7 +34,6 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.hudi.command.procedures.ShowColumnStatsOverlapProcedure.{MAX_VALUE_TYPE, MIN_VALUE_TYPE} import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} -import java.time.Instant import java.util import java.util.function.Supplier @@ -109,7 +107,7 @@ class ShowColumnStatsOverlapProcedure extends BaseProcedure with ProcedureBuilde val columnStatsIndex = new ColumnStatsIndexSupport(spark, schema, metadataConfig, metaClient) val fsView = buildFileSystemView(table) val engineCtx = new HoodieSparkEngineContext(jsc) - val metaTable = HoodieTableMetadata.create(engineCtx, metaClient.getStorage, metadataConfig, basePath) + val metaTable = metaClient.getTableFormat.getMetadataFactory.create(engineCtx, metaClient.getStorage, metadataConfig, basePath) val allFileSlices = getAllFileSlices(partitionsSeq, metaTable, fsView) val fileSlicesSizeByPartition = allFileSlices.groupBy(_.getPartitionPath).mapValues(_.size) diff --git 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowFileSystemViewProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowFileSystemViewProcedure.scala index ff87543854bc..d2f7ead67ea4 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowFileSystemViewProcedure.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowFileSystemViewProcedure.scala @@ -115,7 +115,7 @@ class ShowFileSystemViewProcedure(showLatest: Boolean) extends BaseProcedure wit instants = instants.filter(instant => predicate.test(maxInstant, instant.requestedTime)) } - val filteredTimeline = metaClient.getTimelineLayout.getTimelineFactory.createDefaultTimeline( + val filteredTimeline = metaClient.getTableFormat.getTimelineFactory.createDefaultTimeline( new JArrayList[HoodieInstant](instants.toList.asJava).stream(), metaClient.getActiveTimeline.getInstantReader) new HoodieTableFileSystemView(metaClient, filteredTimeline, statuses) } @@ -148,7 +148,7 @@ class ShowFileSystemViewProcedure(showLatest: Boolean) extends BaseProcedure wit maxInstant: String, merge: Boolean): JList[Row] = { var fileSliceStream: JStream[FileSlice] = JStream.empty() - val completionTimeQueryView =metaClient.getTimelineLayout().getTimelineFactory().createCompletionTimeQueryView(metaClient) + val completionTimeQueryView =metaClient.getTableFormat.getTimelineFactory.createCompletionTimeQueryView(metaClient) if (merge) { partitions.foreach(p => fileSliceStream = JStream.concat(fileSliceStream, fsView.getLatestMergedFileSlicesBeforeOrOn(p, maxInstant))) } else { diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowInvalidParquetProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowInvalidParquetProcedure.scala index e052f5b1fc5e..395dc740dff7 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowInvalidParquetProcedure.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowInvalidParquetProcedure.scala @@ -22,7 +22,7 @@ import org.apache.hudi.common.config.HoodieMetadataConfig import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.common.util.StringUtils import org.apache.hudi.hadoop.fs.HadoopFSUtils -import org.apache.hudi.metadata.HoodieTableMetadata +import org.apache.hudi.metadata.{HoodieTableMetadata, NativeTableMetadataFactory} import org.apache.hudi.storage.hadoop.HoodieHadoopStorage import collection.JavaConverters._ @@ -65,7 +65,7 @@ class ShowInvalidParquetProcedure extends BaseProcedure with ProcedureBuilder { val storageConf = HadoopFSUtils.getStorageConfWithCopy(jsc.hadoopConfiguration()) val storage = new HoodieHadoopStorage(srcPath, storageConf) val metadataConfig = HoodieMetadataConfig.newBuilder.enable(false).build - val metadata = HoodieTableMetadata.create(new HoodieSparkEngineContext(jsc), storage, metadataConfig, srcPath) + val metadata = NativeTableMetadataFactory.getInstance().create(new HoodieSparkEngineContext(jsc), storage, metadataConfig, srcPath) val partitionPaths: java.util.List[String] = metadata.getPartitionPathWithPathPrefixes(partitions.split(",").toList.asJava) val instantsList = if (StringUtils.isNullOrEmpty(instants)) Array.empty[String] else instants.split(",") val 
fileStatus = partitionPaths.asScala.flatMap(part => { diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowMetadataTableColumnStatsProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowMetadataTableColumnStatsProcedure.scala index 6abae9b598be..47da3695372d 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowMetadataTableColumnStatsProcedure.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowMetadataTableColumnStatsProcedure.scala @@ -82,7 +82,7 @@ class ShowMetadataTableColumnStatsProcedure extends BaseProcedure with Procedure val fsView = buildFileSystemView(table, engineCtx) val allFileSlices: Set[FileSlice] = { if (partitionsSeq.isEmpty) { - val metaTable = HoodieTableMetadata.create(engineCtx, metaClient.getStorage, metadataConfig, basePath) + val metaTable = metaClient.getTableFormat.getMetadataFactory.create(engineCtx, metaClient.getStorage, metadataConfig, basePath) metaTable.getAllPartitionPaths .asScala .flatMap(path => fsView.getLatestFileSlices(path).iterator().asScala) diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/client/functional/TestMetadataUtilRLIandSIRecordGeneration.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/client/functional/TestMetadataUtilRLIandSIRecordGeneration.java index 9e4b8f169f1f..72947e617e54 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/client/functional/TestMetadataUtilRLIandSIRecordGeneration.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/client/functional/TestMetadataUtilRLIandSIRecordGeneration.java @@ -334,7 +334,7 @@ public void testSecondaryIndexRecordGenerationForMOR() throws IOException { .withIndexOptions(Collections.emptyMap()) .build(); HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder().enable(true).withSecondaryIndexParallelism(2).build(); - HoodieTableMetadata metadata = HoodieTableMetadata.create(engineContext, storage, metadataConfig, metaClient.getBasePath().toString()); + HoodieTableMetadata metadata = metaClient.getTableFormat().getMetadataFactory().create(engineContext, storage, metadataConfig, metaClient.getBasePath().toString()); HoodieTableFileSystemView metadataView = new HoodieTableFileSystemView(metadata, metaClient, metaClient.getActiveTimeline()); metadataView.loadAllPartitions(); List> partitionFileSlicePairs = new ArrayList<>(); diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java index d8f4c957c958..e833919c30ac 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java @@ -418,7 +418,7 @@ private void checkBootstrapResults(int totalRecords, Schema schema, String insta reloadInputFormats(); List records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat( HadoopFSUtils.getStorageConf(jsc.hadoopConfiguration()), - FSUtils.getAllPartitionPaths(context, storage, basePath, false).stream() + FSUtils.getAllPartitionPaths(context, metaClient, false).stream() .map(f -> basePath + "/" + f).collect(Collectors.toList()), basePath, roJobConf, false, schema, TRIP_HIVE_COLUMN_TYPES, 
false, new ArrayList<>()); assertEquals(totalRecords, records.size()); @@ -436,7 +436,7 @@ private void checkBootstrapResults(int totalRecords, Schema schema, String insta seenKeys = new HashSet<>(); records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat( HadoopFSUtils.getStorageConf(jsc.hadoopConfiguration()), - FSUtils.getAllPartitionPaths(context, storage, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS).stream() + FSUtils.getAllPartitionPaths(context, metaClient, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS).stream() .map(f -> basePath + "/" + f).collect(Collectors.toList()), basePath, rtJobConf, true, schema, TRIP_HIVE_COLUMN_TYPES, false, new ArrayList<>()); assertEquals(totalRecords, records.size()); @@ -452,7 +452,7 @@ private void checkBootstrapResults(int totalRecords, Schema schema, String insta reloadInputFormats(); records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat( HadoopFSUtils.getStorageConf(jsc.hadoopConfiguration()), - FSUtils.getAllPartitionPaths(context, storage, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS).stream() + FSUtils.getAllPartitionPaths(context, metaClient, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS).stream() .map(f -> basePath + "/" + f).collect(Collectors.toList()), basePath, roJobConf, false, schema, TRIP_HIVE_COLUMN_TYPES, true, HoodieRecord.HOODIE_META_COLUMNS); @@ -469,7 +469,7 @@ private void checkBootstrapResults(int totalRecords, Schema schema, String insta seenKeys = new HashSet<>(); records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat( HadoopFSUtils.getStorageConf(jsc.hadoopConfiguration()), - FSUtils.getAllPartitionPaths(context, storage, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS).stream() + FSUtils.getAllPartitionPaths(context, metaClient, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS).stream() .map(f -> basePath + "/" + f).collect(Collectors.toList()), basePath, rtJobConf, true, schema, TRIP_HIVE_COLUMN_TYPES, true, HoodieRecord.HOODIE_META_COLUMNS); @@ -484,7 +484,7 @@ private void checkBootstrapResults(int totalRecords, Schema schema, String insta reloadInputFormats(); records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat( HadoopFSUtils.getStorageConf(jsc.hadoopConfiguration()), - FSUtils.getAllPartitionPaths(context, storage, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS).stream() + FSUtils.getAllPartitionPaths(context, metaClient, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS).stream() .map(f -> basePath + "/" + f).collect(Collectors.toList()), basePath, roJobConf, false, schema, TRIP_HIVE_COLUMN_TYPES, true, Arrays.asList("_row_key")); @@ -501,7 +501,7 @@ private void checkBootstrapResults(int totalRecords, Schema schema, String insta seenKeys = new HashSet<>(); records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat( HadoopFSUtils.getStorageConf(jsc.hadoopConfiguration()), - FSUtils.getAllPartitionPaths(context, storage, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS).stream() + FSUtils.getAllPartitionPaths(context, metaClient, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS).stream() .map(f -> basePath + "/" + f).collect(Collectors.toList()), basePath, rtJobConf, true, schema, TRIP_HIVE_COLUMN_TYPES, true, Arrays.asList("_row_key")); diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestHoodieBackedMetadata.java 
b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestHoodieBackedMetadata.java index 712c8ba70755..ad5ff84e6d4c 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestHoodieBackedMetadata.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestHoodieBackedMetadata.java @@ -809,7 +809,7 @@ public void testMetadataTableDeletePartition(HoodieTableType tableType) throws E HoodieTableMetaClient metadataMetaClient = createMetaClient(metadataTableBasePath); List metadataTablePartitions = FSUtils.getAllPartitionPaths( - engineContext, metadataMetaClient.getStorage(), metadataMetaClient.getBasePath(), false); + engineContext, metadataMetaClient, false); Option completedReplaceInstant = metadataMetaClient.reloadActiveTimeline().getCompletedReplaceTimeline().lastInstant(); assertTrue(completedReplaceInstant.isPresent()); @@ -1184,7 +1184,7 @@ public void testMetadataRollbackDuringInit() throws Exception { client.commit(newCommitTime2, writeStatuses); } - HoodieTableMetadata metadataReader = HoodieTableMetadata.create( + HoodieTableMetadata metadataReader = metaClient.getTableFormat().getMetadataFactory().create( context, storage, writeConfig.getMetadataConfig(), writeConfig.getBasePath()); Map result = metadataReader .readRecordIndex(records1.stream().map(HoodieRecord::getRecordKey).collect(Collectors.toList())); @@ -3634,7 +3634,7 @@ public void testDeleteWithRecordIndex() throws Exception { allRecords.addAll(secondBatchOfrecords); // RI should have created mappings for all the records inserted above - HoodieTableMetadata metadataReader = HoodieTableMetadata.create( + HoodieTableMetadata metadataReader = metaClient.getTableFormat().getMetadataFactory().create( context, storage, writeConfig.getMetadataConfig(), writeConfig.getBasePath()); Map result = metadataReader .readRecordIndex(allRecords.stream().map(HoodieRecord::getRecordKey).collect(Collectors.toList())); @@ -3651,7 +3651,7 @@ public void testDeleteWithRecordIndex() throws Exception { client.commit(deleteTime, jsc.parallelize(writeStatuses)); // RI should not return mappings for deleted records - metadataReader = HoodieTableMetadata.create( + metadataReader = metaClient.getTableFormat().getMetadataFactory().create( context, storage, writeConfig.getMetadataConfig(), writeConfig.getBasePath()); result = metadataReader.readRecordIndex(allRecords.stream().map(HoodieRecord::getRecordKey).collect(Collectors.toList())); assertEquals(allRecords.size() - recordsToDelete.size(), result.size(), "RI should not have mapping for deleted records"); @@ -3666,12 +3666,12 @@ public void testDeleteWithRecordIndex() throws Exception { List writeStatuses = client.delete(jsc.emptyRDD(), deleteTime).collect(); client.commit(deleteTime, jsc.parallelize(writeStatuses)); - HoodieTableMetadata metadataReader = HoodieTableMetadata.create( + HoodieTableMetadata metadataReader = metaClient.getTableFormat().getMetadataFactory().create( context, storage, writeConfig.getMetadataConfig(), writeConfig.getBasePath()); assertTrue(metadataReader.getLatestCompactionTime().isPresent(), "Compaction should have taken place on MDT"); // RI should not return mappings for deleted records - metadataReader = HoodieTableMetadata.create(context, storage, writeConfig.getMetadataConfig(), writeConfig.getBasePath()); + metadataReader = metaClient.getTableFormat().getMetadataFactory().create(context, storage, writeConfig.getMetadataConfig(), writeConfig.getBasePath()); Map result = 
metadataReader.readRecordIndex(allRecords.stream().map(HoodieRecord::getRecordKey).collect(Collectors.toList())); assertEquals(allRecords.size() - keysToDelete.size(), result.size(), "RI should not have mapping for deleted records"); result.keySet().forEach(mappingKey -> assertFalse(keysToDelete.contains(mappingKey), "RI should not have mapping for deleted records")); @@ -3682,7 +3682,7 @@ public void testDeleteWithRecordIndex() throws Exception { client.commit(reinsertTime, jsc.parallelize(writeStatuses)); // New mappings should have been created for re-inserted records and should map to the new commit time - metadataReader = HoodieTableMetadata.create(context, storage, writeConfig.getMetadataConfig(), writeConfig.getBasePath()); + metadataReader = metaClient.getTableFormat().getMetadataFactory().create(context, storage, writeConfig.getMetadataConfig(), writeConfig.getBasePath()); result = metadataReader.readRecordIndex(allRecords.stream().map(HoodieRecord::getRecordKey).collect(Collectors.toList())); assertEquals(allRecords.size(), result.size(), "RI should have mappings for re-inserted records"); for (String reInsertedKey : keysToDelete) { @@ -3738,8 +3738,7 @@ private void validateMetadata(SparkRDDWriteClient testClient, Option ign // Metadata table has a fixed number of partitions // Cannot use FSUtils.getAllFoldersWithPartitionMetaFile for this as that function filters all directory // in the .hoodie folder. - List metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, storage, getMetadataTableBasePath(basePath), - false); + List metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, metadataMetaClient, false); // Secondary index is enabled by default but no MDT partition corresponding to it is available final boolean isPartitionStatsEnabled; if (!metadataWriter.getEnabledPartitionTypes().contains(COLUMN_STATS)) { @@ -3916,7 +3915,7 @@ public static void validateMetadata(HoodieWriteConfig config, Option ign // Metadata table has a fixed number of partitions // Cannot use FSUtils.getAllFoldersWithPartitionMetaFile for this as that function filters all directory // in the .hoodie folder. 
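// Editor's note: the listing hunks here replace the (engineContext, storage, basePath) arguments
// with an (engineContext, metaClient) pair, so partition listing is routed through the table's
// pluggable format. A minimal sketch of the new call shape, assuming only names that already
// appear in this patch (storageConf stands in for whatever StorageConfiguration is at hand):
//
//   HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder()
//       .setConf(storageConf)
//       .setBasePath(getMetadataTableBasePath(basePath))
//       .build();
//   List<String> metadataTablePartitions =
//       FSUtils.getAllPartitionPaths(engineContext, metadataMetaClient, false);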
- List metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, storage, getMetadataTableBasePath(basePath), false); + List metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, metadataMetaClient, false); // check if the last instant is restore, then the metadata table should have only the partitions that are not deleted metaClient.reloadActiveTimeline().getReverseOrderedInstants().findFirst().ifPresent(instant -> { if (instant.getAction().equals(HoodieActiveTimeline.RESTORE_ACTION)) { @@ -4018,7 +4017,8 @@ private static HoodieBackedTableMetadataWriter, JavaRDD) v1 -> - HoodieTableMetadata.create(context, metaClient.getStorage(), config.getMetadataConfig(), config.getBasePath())) + metaClient.getTableFormat().getMetadataFactory().create(context, metaClient.getStorage(), config.getMetadataConfig(), config.getBasePath())) .getFileSystemView(basePath); assertFileSystemViews(config, enableMdt, storageType); @@ -224,7 +224,7 @@ private void assertFileSystemViews(HoodieWriteConfig writeConfig, boolean enable HoodieTableFileSystemView actualFileSystemView = (HoodieTableFileSystemView) FileSystemViewManager .createViewManager(context, writeConfig.getMetadataConfig(), viewStorageConfig, writeConfig.getCommonConfig(), (SerializableFunctionUnchecked) v1 -> - HoodieTableMetadata.create(context, metaClient.getStorage(), writeConfig.getMetadataConfig(), writeConfig.getBasePath())) + metaClient.getTableFormat().getMetadataFactory().create(context, metaClient.getStorage(), writeConfig.getMetadataConfig(), writeConfig.getBasePath())) .getFileSystemView(basePath); try { assertForFSVEquality(expectedFileSystemView, actualFileSystemView, enableMdt, Option.empty()); diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java index 6bb11c67ddce..80b3455dc407 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java @@ -395,21 +395,19 @@ private void validateFilesExistInCompactionPlan(String compactionInstant) { } private void validateFileListingInMetadataTable() { - List partitionPaths = FSUtils.getAllPartitionPaths(context(), hoodieStorage(), basePath(), false) + List partitionPaths = FSUtils.getAllPartitionPaths(context(), metaClient, false) .stream() .map(e -> new StoragePath(basePath(), e).toString()) .collect(Collectors.toList()); Map> filesFromStorage = FSUtils.getFilesInPartitions( context(), - hoodieStorage(), + metaClient, HoodieMetadataConfig.newBuilder().enable(false).build(), - basePath(), partitionPaths.toArray(new String[0])); Map> filesFromMetadataTable = FSUtils.getFilesInPartitions( context(), - hoodieStorage(), + metaClient, HoodieMetadataConfig.newBuilder().enable(true).build(), - basePath(), partitionPaths.toArray(new String[0])); assertEquals(filesFromStorage.size(), filesFromMetadataTable.size()); for (String partition : filesFromStorage.keySet()) { diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/META-INF/services/org.apache.hudi.common.HoodieTableFormat b/hudi-spark-datasource/hudi-spark/src/test/resources/META-INF/services/org.apache.hudi.common.HoodieTableFormat new file mode 100644 index 000000000000..eb7e6220e5e6 --- 
/dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/META-INF/services/org.apache.hudi.common.HoodieTableFormat @@ -0,0 +1,19 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +org.apache.hudi.tableformat.TestTableFormat \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala index b9150d3de009..9a2c39f718b9 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala @@ -634,7 +634,7 @@ class TestHoodieFileIndex extends HoodieSparkClientTestBase with ScalaAssertionS metaClient = HoodieTableMetaClient.reload(metaClient) // Test getting partition paths in a subset of directories - val metadata = HoodieTableMetadata.create(context, + val metadata = metaClient.getTableFormat.getMetadataFactory.create(context, metaClient.getStorage, HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build(), metaClient.getBasePath.toString) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriterWithTestFormat.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriterWithTestFormat.scala new file mode 100644 index 000000000000..ce770599ed3e --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriterWithTestFormat.scala @@ -0,0 +1,717 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi + + +import org.apache.hudi.client.SparkRDDWriteClient +import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.common.model._ +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.common.testutils.HoodieTestDataGenerator +import org.apache.hudi.config.{HoodieBootstrapConfig, HoodieIndexConfig, HoodieWriteConfig} +import org.apache.hudi.exception.{HoodieException, SchemaCompatibilityException} +import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode +import org.apache.hudi.functional.TestBootstrap +import org.apache.hudi.hadoop.fs.HadoopFSUtils +import org.apache.hudi.keygen.{ComplexKeyGenerator, NonpartitionedKeyGenerator, SimpleKeyGenerator} +import org.apache.hudi.testutils.DataSourceTestUtils +import org.apache.hudi.testutils.HoodieClientTestUtils.createMetaClient + +import org.apache.avro.Schema +import org.apache.commons.io.FileUtils +import org.apache.spark.api.java.JavaSparkContext +import org.apache.spark.sql.{DataFrame, Row, SaveMode} +import org.apache.spark.sql.functions.{expr, lit} +import org.apache.spark.sql.hudi.command.SqlKeyGenerator +import org.junit.jupiter.api.Assertions._ +import org.junit.jupiter.api.Test +import org.junit.jupiter.params.ParameterizedTest +import org.junit.jupiter.params.provider._ +import org.mockito.ArgumentMatchers.any +import org.mockito.Mockito.{spy, times, verify} +import org.scalatest.Assertions.assertThrows +import org.scalatest.Matchers.intercept + +import java.time.Instant +import java.util.{Collections, Date, UUID} + +import scala.collection.JavaConverters._ + +/** + * Test suite for the HoodieSparkSqlWriter class with the table format set to "test-format", which implements org.apache.hudi.common.table.HoodieTableFormat. + * All cases that use {@link HoodieTimelineTimeZone.UTC} should be kept in a separate test class {@link TestHoodieSparkSqlWriterUtc}. + * Otherwise the UTC tests will generate infinite loops if any test has already been initiated with a time zone greater than UTC+0. + * The reason is the value cached in the static {@link org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator.lastInstantTime}. + */ +class TestHoodieSparkSqlWriterWithTestFormat extends HoodieSparkWriterTestBase { + + /** + * Local utility method for performing bulk insert tests.
+ * + * @param sortMode Bulk insert sort mode + * @param populateMetaFields Flag for populating meta fields + */ + def testBulkInsertWithSortMode(sortMode: BulkInsertSortMode, populateMetaFields: Boolean = true, enableOCCConfigs: Boolean = false): Unit = { + //create a new table + var fooTableModifier = commonTableModifier.updated("hoodie.bulkinsert.shuffle.parallelism", "4") + .updated(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL) + .updated(DataSourceWriteOptions.ENABLE_ROW_WRITER.key, "true") + .updated(HoodieTableConfig.POPULATE_META_FIELDS.key(), String.valueOf(populateMetaFields)) + .updated(HoodieWriteConfig.BULK_INSERT_SORT_MODE.key(), sortMode.name()) + .updated(HoodieTableConfig.TABLE_FORMAT.key, "test-format") + + if (enableOCCConfigs) { + fooTableModifier = fooTableModifier + .updated("hoodie.write.concurrency.mode", "optimistic_concurrency_control") + .updated("hoodie.clean.failed.writes.policy", "LAZY") + .updated("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.InProcessLockProvider") + } + + // generate the inserts + val schema = DataSourceTestUtils.getStructTypeExampleSchema + val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema) + val inserts = DataSourceTestUtils.generateRandomRows(1000) + + // add some updates so that preCombine kicks in + val toUpdateDataset = sqlContext.createDataFrame(DataSourceTestUtils.getUniqueRows(inserts, 40), structType) + val updates = DataSourceTestUtils.updateRowsWithUpdatedTs(toUpdateDataset) + val records = inserts.asScala.union(updates.asScala) + val recordsSeq = convertRowListToSeq(records.asJava) + val df = spark.createDataFrame(sc.parallelize(recordsSeq), structType) + // write to Hudi + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, df) + + // collect all partition paths to issue read of parquet files + val partitions = Seq(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, + HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH) + // Check the entire dataset has all records still + val fullPartitionPaths = new Array[String](3) + for (i <- fullPartitionPaths.indices) { + fullPartitionPaths(i) = String.format("%s/%s/*", tempBasePath, partitions(i)) + } + // fetch all records from parquet files generated from write to hudi + val actualDf = sqlContext.read.parquet(fullPartitionPaths(0), fullPartitionPaths(1), fullPartitionPaths(2)) + if (!populateMetaFields) { + List(0, 1, 2, 3, 4).foreach(i => assertEquals(0, actualDf.select(HoodieRecord.HOODIE_META_COLUMNS.get(i)).filter(entry => !(entry.mkString(",").equals(""))).count())) + } + // remove metadata columns so that expected and actual DFs can be compared as is + val trimmedDf = dropMetaFields(actualDf) + assert(df.except(trimmedDf).count() == 0) + } + + /** + * Test case for throw hoodie exception when there already exist a table + * with different name with Append Save mode + */ + @Test + def testThrowExceptionAlreadyExistsWithAppendSaveMode(): Unit = { + //create a new table + val fooTableModifier = Map( + "path" -> tempBasePath, + HoodieWriteConfig.TBL_NAME.key -> hoodieFooTableName, + "hoodie.datasource.write.recordkey.field" -> "uuid", + "hoodie.insert.shuffle.parallelism" -> "4", + "hoodie.upsert.shuffle.parallelism" -> "4", + HoodieTableConfig.TABLE_FORMAT.key -> "test-format") + val dataFrame = spark.createDataFrame(Seq(StringLongTest(UUID.randomUUID().toString, new Date().getTime))) + 
HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, dataFrame) + + //on same path try append with different("hoodie_bar_tbl") table name which should throw an exception + val barTableModifier = Map( + "path" -> tempBasePath, + HoodieWriteConfig.TBL_NAME.key -> "hoodie_bar_tbl", + "hoodie.datasource.write.recordkey.field" -> "uuid", + "hoodie.insert.shuffle.parallelism" -> "4", + "hoodie.upsert.shuffle.parallelism" -> "4") + val dataFrame2 = spark.createDataFrame(Seq(StringLongTest(UUID.randomUUID().toString, new Date().getTime))) + val tableAlreadyExistException = intercept[HoodieException](HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, barTableModifier, dataFrame2)) + assert(tableAlreadyExistException.getMessage.contains("Config conflict")) + assert(tableAlreadyExistException.getMessage.contains(s"${HoodieWriteConfig.TBL_NAME.key}:\thoodie_bar_tbl\thoodie_foo_tbl")) + + //on same path try append with delete operation and different("hoodie_bar_tbl") table name which should throw an exception + val deleteTableModifier = barTableModifier ++ Map(DataSourceWriteOptions.OPERATION.key -> "delete") + val deleteCmdException = intercept[HoodieException](HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, deleteTableModifier, dataFrame2)) + assert(deleteCmdException.getMessage.contains("Config conflict")) + assert(deleteCmdException.getMessage.contains(s"${HoodieWriteConfig.TBL_NAME.key}:\thoodie_bar_tbl\thoodie_foo_tbl")) + } + + /** + * Test case verifying that table config is not validated when the save mode is set to Overwrite. + */ + @Test + def testValidateTableConfigWithOverwriteSaveMode(): Unit = { + //create a new table + val tableModifier1 = Map("path" -> tempBasePath, HoodieWriteConfig.TBL_NAME.key -> hoodieFooTableName, + "hoodie.datasource.write.recordkey.field" -> "uuid", HoodieTableConfig.TABLE_FORMAT.key -> "test-format") + val dataFrame = spark.createDataFrame(Seq(StringLongTest(UUID.randomUUID().toString, new Date().getTime))) + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Overwrite, tableModifier1, dataFrame) + + //on same path try write with different RECORDKEY_FIELD_NAME and Append SaveMode should throw an exception + val tableModifier2 = Map("path" -> tempBasePath, HoodieWriteConfig.TBL_NAME.key -> hoodieFooTableName, + "hoodie.datasource.write.recordkey.field" -> "ts", HoodieTableConfig.TABLE_FORMAT.key -> "test-format") + val dataFrame2 = spark.createDataFrame(Seq(StringLongTest(UUID.randomUUID().toString, new Date().getTime))) + val hoodieException = intercept[HoodieException](HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, tableModifier2, dataFrame2)) + assert(hoodieException.getMessage.contains("Config conflict")) + assert(hoodieException.getMessage.contains(s"RecordKey:\tts\tuuid")) + + //on same path try write with different RECORDKEY_FIELD_NAME and Overwrite SaveMode should be successful.
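// Editor's note: a hedged reading of the assertion that follows. Judging from the `._1` usage,
// HoodieSparkSqlWriter.write appears to return a tuple whose first element signals success; with
// SaveMode.Overwrite the table is re-initialized, so the record-key conflict that failed the
// Append case above is not validated:
//   val result = HoodieSparkSqlWriter.write(sqlContext, SaveMode.Overwrite, tableModifier2, dataFrame2)
//   assert(result._1) // first element: the write succeeded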
+ assert(HoodieSparkSqlWriter.write(sqlContext, SaveMode.Overwrite, tableModifier2, dataFrame2)._1) + } + + + /** + * Test case for each bulk insert sort mode + * + * @param sortMode Bulk insert sort mode + */ + @ParameterizedTest + @EnumSource(value = classOf[BulkInsertSortMode]) + def testBulkInsertForSortMode(sortMode: BulkInsertSortMode): Unit = { + testBulkInsertWithSortMode(sortMode, populateMetaFields = true) + } + + @Test + def testBulkInsertForSortModeWithOCC(): Unit = { + testBulkInsertWithSortMode(BulkInsertSortMode.GLOBAL_SORT, populateMetaFields = true, true) + } + + /** + * Test case for Bulk insert with populating meta fields or + * without populating meta fields. + * + * @param populateMetaFields Flag for populating meta fields + */ + @ParameterizedTest + @ValueSource(booleans = Array(true, false)) + def testBulkInsertForPopulateMetaFields(populateMetaFields: Boolean): Unit = { + testBulkInsertWithSortMode(BulkInsertSortMode.NONE, populateMetaFields) + } + + /** + * Test case for disable and enable meta fields. + */ + @Test + def testDisableAndEnableMetaFields(): Unit = { + testBulkInsertWithSortMode(BulkInsertSortMode.NONE, populateMetaFields = false) + //create a new table + val fooTableModifier = commonTableModifier.updated("hoodie.bulkinsert.shuffle.parallelism", "4") + .updated(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL) + .updated(DataSourceWriteOptions.ENABLE_ROW_WRITER.key, "true") + .updated(HoodieWriteConfig.BULK_INSERT_SORT_MODE.key(), BulkInsertSortMode.NONE.name()) + .updated(HoodieTableConfig.POPULATE_META_FIELDS.key(), "true") + .updated(HoodieTableConfig.TABLE_FORMAT.key, "test-format") + + // generate the inserts + val schema = DataSourceTestUtils.getStructTypeExampleSchema + val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema) + val inserts = DataSourceTestUtils.generateRandomRows(1000) + val df = spark.createDataFrame(sc.parallelize(inserts.asScala.toSeq), structType) + try { + // write to Hudi + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, df) + fail("Should have thrown exception") + } catch { + case e: HoodieException => assertTrue(e.getMessage.startsWith("Config conflict")) + case e: Exception => fail(e); + } + } + + /** + * Test case for drop duplicates row writing for bulk_insert. 
+ */ + @Test + def testDropDuplicatesRowForBulkInsert(): Unit = { + try { + //create a new table + val fooTableModifier = commonTableModifier.updated("hoodie.bulkinsert.shuffle.parallelism", "4") + .updated(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL) + .updated(DataSourceWriteOptions.ENABLE_ROW_WRITER.key, "true") + .updated(DataSourceWriteOptions.INSERT_DROP_DUPS.key, "true") + .updated(HoodieTableConfig.TABLE_FORMAT.key, "test-format") + + // generate the inserts + val schema = DataSourceTestUtils.getStructTypeExampleSchema + val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema) + val records = DataSourceTestUtils.generateRandomRows(100) + val recordsSeq = convertRowListToSeq(records) + val df = spark.createDataFrame(spark.sparkContext.parallelize(recordsSeq), structType) + // write to Hudi + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, df) + fail("Drop duplicates with bulk insert in row writing should have thrown exception") + } catch { + case e: HoodieException => assertTrue(e.getMessage.contains("Dropping duplicates with bulk_insert in row writer path is not supported yet")) + } + } + + /** + * Test case for insert dataset without precombine field. + */ + @Test + def testInsertDatasetWithoutPrecombineField(): Unit = { + + val fooTableModifier = commonTableModifier.updated(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) + .updated(DataSourceWriteOptions.INSERT_DROP_DUPS.key, "false") + .updated(HoodieTableConfig.TABLE_FORMAT.key, "test-format") + + // generate the inserts + val schema = DataSourceTestUtils.getStructTypeExampleSchema + val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema) + val records = DataSourceTestUtils.generateRandomRows(100) + val recordsSeq = convertRowListToSeq(records) + val df = spark.createDataFrame(sc.parallelize(recordsSeq), structType) + // write to Hudi + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier - DataSourceWriteOptions.PRECOMBINE_FIELD.key, df) + + // collect all partition paths to issue read of parquet files + val partitions = Seq(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, + HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH) + // Check the entire dataset has all records still + val fullPartitionPaths = new Array[String](3) + for (i <- fullPartitionPaths.indices) { + fullPartitionPaths(i) = String.format("%s/%s/*", tempBasePath, partitions(i)) + } + + // fetch all records from parquet files generated from write to hudi + val actualDf = spark.sqlContext.read.parquet(fullPartitionPaths(0), fullPartitionPaths(1), fullPartitionPaths(2)) + // remove metadata columns so that expected and actual DFs can be compared as is + val trimmedDf = dropMetaFields(actualDf) + assert(df.except(trimmedDf).count() == 0) + } + + /** + * Test case for insert dataset without partitioning field + */ + @Test + def testInsertDatasetWithoutPartitionField(): Unit = { + val tableOpts = + commonTableModifier + .updated(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) + .updated(HoodieTableConfig.TABLE_FORMAT.key, "test-format") + + // generate the inserts + val schema = DataSourceTestUtils.getStructTypeExampleSchema + val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema) + val records = DataSourceTestUtils.generateRandomRows(1) + val recordsSeq = 
convertRowListToSeq(records) + val df = spark.createDataFrame(sc.parallelize(recordsSeq), structType) + + // try write to Hudi + assertThrows[HoodieException] { + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, tableOpts - DataSourceWriteOptions.PARTITIONPATH_FIELD.key, df) + } + } + + /** + * Test case for bulk insert dataset with datasource impl multiple rounds. + */ + @Test + def testBulkInsertDatasetWithDatasourceImplMultipleRounds(): Unit = { + + val fooTableModifier = commonTableModifier.updated("hoodie.bulkinsert.shuffle.parallelism", "4") + .updated(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL) + .updated(DataSourceWriteOptions.ENABLE_ROW_WRITER.key, "true") + .updated(HoodieTableConfig.TABLE_FORMAT.key, "test-format") + val partitions = Seq(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, + HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH) + val fullPartitionPaths = new Array[String](3) + for (i <- 0 to 2) { + fullPartitionPaths(i) = String.format("%s/%s/*", tempBasePath, partitions(i)) + } + val schema = DataSourceTestUtils.getStructTypeExampleSchema + val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema) + var totalExpectedDf = spark.createDataFrame(sc.emptyRDD[Row], structType) + for (_ <- 0 to 2) { + // generate the inserts + val records = DataSourceTestUtils.generateRandomRows(200) + val recordsSeq = convertRowListToSeq(records) + val df = spark.createDataFrame(sc.parallelize(recordsSeq), structType) + // write to Hudi + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, df) + // Fetch records from entire dataset + val actualDf = sqlContext.read.parquet(fullPartitionPaths(0), fullPartitionPaths(1), fullPartitionPaths(2)) + // remove metadata columns so that expected and actual DFs can be compared as is + val trimmedDf = dropMetaFields(actualDf) + // find total df (union from multiple rounds) + totalExpectedDf = totalExpectedDf.union(df) + // find mismatch between actual and expected df + assert(totalExpectedDf.except(trimmedDf).count() == 0) + } + } + + /** + * Test cases for basic HoodieSparkSqlWriter functionality with datasource insert + * for different tableTypes, fileFormats and options for population meta fields. 
+ * + * @param tableType Type of table + * @param baseFileFormat File format + * @param populateMetaFields Flag for populating meta fields + */ + @ParameterizedTest + @MethodSource(Array("testDatasourceInsert")) + def testDatasourceInsertForTableTypeBaseFileMetaFields(tableType: String, populateMetaFields: Boolean, baseFileFormat: String): Unit = { + val hoodieFooTableName = "hoodie_foo_tbl" + val fooTableModifier = Map("path" -> tempBasePath, + HoodieWriteConfig.TBL_NAME.key -> hoodieFooTableName, + HoodieWriteConfig.BASE_FILE_FORMAT.key -> baseFileFormat, + DataSourceWriteOptions.TABLE_TYPE.key -> tableType, + HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key -> "4", + DataSourceWriteOptions.OPERATION.key -> DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, + DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key", + DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition", + HoodieTableConfig.POPULATE_META_FIELDS.key() -> String.valueOf(populateMetaFields), + DataSourceWriteOptions.PAYLOAD_CLASS_NAME.key() -> classOf[DefaultHoodieRecordPayload].getCanonicalName) + val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier) + // generate the inserts + val schema = DataSourceTestUtils.getStructTypeExampleSchema + val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema) + val modifiedSchema = AvroConversionUtils.convertStructTypeToAvroSchema(structType, "trip", "example.schema") + val records = DataSourceTestUtils.generateRandomRows(100) + val recordsSeq = convertRowListToSeq(records) + val df = spark.createDataFrame(sc.parallelize(recordsSeq), structType) + initializeMetaClientForBootstrap(fooTableParams, tableType, addBootstrapPath = false, initBasePath = true) + val client = spy[SparkRDDWriteClient[_]](DataSourceUtils.createHoodieClient( + new JavaSparkContext(sc), modifiedSchema.toString, tempBasePath, hoodieFooTableName, + fooTableParams.asJava).asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]]) + + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, df, Option.empty, Option(client)) + // Verify that asynchronous compaction is not scheduled + verify(client, times(0)).scheduleCompaction(any()) + // Verify that HoodieWriteClient is closed correctly + verify(client, times(1)).close() + + // collect all partition paths to issue read of parquet files + val partitions = Seq(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, + HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH) + // Check the entire dataset has all records still + val fullPartitionPaths = new Array[String](3) + for (i <- fullPartitionPaths.indices) { + fullPartitionPaths(i) = String.format("%s/%s/*", tempBasePath, partitions(i)) + } + // fetch all records from parquet files generated from write to hudi + var actualDf: DataFrame = null + if (baseFileFormat.equalsIgnoreCase(HoodieFileFormat.PARQUET.name())) { + actualDf = sqlContext.read.parquet(fullPartitionPaths(0), fullPartitionPaths(1), fullPartitionPaths(2)) + } else if (baseFileFormat.equalsIgnoreCase(HoodieFileFormat.ORC.name())) { + actualDf = sqlContext.read.orc(fullPartitionPaths(0), fullPartitionPaths(1), fullPartitionPaths(2)) + } + // remove metadata columns so that expected and actual DFs can be compared as is + val trimmedDf = dropMetaFields(actualDf) + assert(df.except(trimmedDf).count() == 0) + } + + /** + * Test cases for HoodieSparkSqlWriter functionality with datasource bootstrap + * for different type of tables and 
table versions. + * + * @param tableType Type of table + * @param tableVersion Version of table + */ + @ParameterizedTest + @MethodSource(Array("bootstrapTestParams")) + def testWithDatasourceBootstrapForTableType(tableType: String, tableVersion: Int): Unit = { + val srcPath = java.nio.file.Files.createTempDirectory("hoodie_bootstrap_source_path") + try { + val sourceDF = TestBootstrap.generateTestRawTripDataset(Instant.now.toEpochMilli, 0, 100, Collections.emptyList(), sc, + spark.sqlContext) + // Write source data non-partitioned + sourceDF.write.format("parquet").mode(SaveMode.Overwrite).save(srcPath.toAbsolutePath.toString) + + val fooTableModifier = Map("path" -> tempBasePath, + HoodieBootstrapConfig.BASE_PATH.key -> srcPath.toAbsolutePath.toString, + HoodieWriteConfig.TBL_NAME.key -> hoodieFooTableName, + DataSourceWriteOptions.TABLE_TYPE.key -> tableType, + HoodieBootstrapConfig.PARALLELISM_VALUE.key -> "4", + DataSourceWriteOptions.OPERATION.key -> DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL, + DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key", + DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "", + DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> classOf[NonpartitionedKeyGenerator].getCanonicalName, + DataSourceWriteOptions.PAYLOAD_CLASS_NAME.key() -> classOf[DefaultHoodieRecordPayload].getCanonicalName, + "hoodie.write.table.version" -> tableVersion.toString, + HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "false") + val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier) + initializeMetaClientForBootstrap(fooTableParams, tableType, addBootstrapPath = true, initBasePath = false) + + val client = spy[SparkRDDWriteClient[_]](DataSourceUtils.createHoodieClient( + new JavaSparkContext(sc), + null, + tempBasePath, + hoodieFooTableName, + fooTableParams.asJava).asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]]) + + HoodieSparkSqlWriter.bootstrap(sqlContext, SaveMode.Append, fooTableModifier, spark.emptyDataFrame, Option.empty, + Option.empty, Option(client)) + + // Verify that HoodieWriteClient is closed correctly + verify(client, times(1)).close() + + val ignoreResult = HoodieSparkSqlWriter.bootstrap(sqlContext, SaveMode.Ignore, fooTableModifier, spark.emptyDataFrame, Option.empty, + Option.empty, Option(client)) + assertFalse(ignoreResult) + verify(client, times(2)).close() + + // Assert that the table version is adopted. + val metaClient = createMetaClient(spark, tempBasePath) + assertEquals(metaClient.getTableConfig.getTableVersion.versionCode(), tableVersion) + // fetch all records from parquet files generated from the write to Hudi + val actualDf = sqlContext.read.parquet(tempBasePath) + assert(actualDf.count == 100) + } finally { + FileUtils.deleteDirectory(srcPath.toFile) + } + } + + def initializeMetaClientForBootstrap(fooTableParams: Map[String, String], tableType: String, addBootstrapPath: Boolean, initBasePath: Boolean): Unit = { + // When metadata is enabled, directly instantiating a write client using DataSourceUtils.createHoodieClient + // hits a code path that tries to instantiate a meta client for the data table; if the table does not exist, it fails. + // Hence we do an explicit instantiation here.
+ val tableMetaClientBuilder = HoodieTableMetaClient.newTableBuilder() + .setTableType(tableType) + .setTableName(hoodieFooTableName) + .setRecordKeyFields(fooTableParams(DataSourceWriteOptions.RECORDKEY_FIELD.key)) + .setBaseFileFormat(fooTableParams.getOrElse(HoodieWriteConfig.BASE_FILE_FORMAT.key, + HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().name)) + .setArchiveLogFolder(HoodieTableConfig.TIMELINE_HISTORY_PATH.defaultValue()) + .setPreCombineField(fooTableParams.getOrElse(DataSourceWriteOptions.PRECOMBINE_FIELD.key, null)) + .setPartitionFields(fooTableParams(DataSourceWriteOptions.PARTITIONPATH_FIELD.key)) + .setKeyGeneratorClassProp(fooTableParams.getOrElse(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key, + DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.defaultValue())) + if (addBootstrapPath) { + tableMetaClientBuilder + .setBootstrapBasePath(fooTableParams(HoodieBootstrapConfig.BASE_PATH.key)) + } + if (initBasePath) { + tableMetaClientBuilder.initTable(HadoopFSUtils.getStorageConfWithCopy(sc.hadoopConfiguration), tempBasePath) + } + } + + @Test + def testNonpartitonedWithReuseTableConfig(): Unit = { + val _spark = spark + import _spark.implicits._ + val df = Seq((1, "a1", 10, 1000, "2021-10-16")).toDF("id", "name", "value", "ts", "dt") + val options = Map( + DataSourceWriteOptions.RECORDKEY_FIELD.key -> "id", + DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "ts", + HoodieTableConfig.TABLE_FORMAT.key -> "test-format" + ) + + // case 1: When commit C1 specifies a key generator and commit C2 does not specify a key generator + val (tableName1, tablePath1) = ("hoodie_test_params_1", s"$tempBasePath" + "_1") + + // NonpartitionedKeyGenerator is automatically inferred and used + df.write.format("hudi") + .options(options) + .option(HoodieWriteConfig.TBL_NAME.key, tableName1) + .mode(SaveMode.Overwrite).save(tablePath1) + + val df2 = Seq((2, "a2", 20, 1000, "2021-10-16")).toDF("id", "name", "value", "ts", "dt") + // In the first commit, we explicitly override it to Nonpartitioned, whereas in the second batch, since table config reuse + // comes into play, no exception should be thrown even if we don't supply any key generator class.
+ df2.write.format("hudi") + .options(options) + .option(HoodieWriteConfig.TBL_NAME.key, tableName1) + .mode(SaveMode.Append).save(tablePath1) + } + + @Test + def testDefaultKeyGenToNonpartitoned(): Unit = { + val _spark = spark + import _spark.implicits._ + val df = Seq((1, "a1", 10, 1000, "2021-10-16")).toDF("id", "name", "value", "ts", "dt") + val options = Map( + DataSourceWriteOptions.RECORDKEY_FIELD.key -> "id", + DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "ts", + DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "dt", + HoodieTableConfig.TABLE_FORMAT.key -> "test-format" + ) + + // case 1: When commit C1 does not specify a key generator and commit C2 specifies a key generator + val (tableName1, tablePath1) = ("hoodie_test_params_1", s"$tempBasePath" + "_1") + + df.write.format("hudi") + .options(options) + .option(HoodieWriteConfig.TBL_NAME.key, tableName1) + .mode(SaveMode.Overwrite).save(tablePath1) + + val df2 = Seq((2, "a2", 20, 1000, "2021-10-16")).toDF("id", "name", "value", "ts", "dt") + // raise exception when NonpartitionedKeyGenerator is specified + val configConflictException = intercept[HoodieException] { + df2.write.format("hudi") + .options(options) + .option(HoodieWriteConfig.TBL_NAME.key, tableName1) + .option(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key, classOf[NonpartitionedKeyGenerator].getName) + .mode(SaveMode.Append).save(tablePath1) + } + assert(configConflictException.getMessage.contains("Config conflict")) + assert(configConflictException.getMessage.contains(s"KeyGenerator:\t${classOf[NonpartitionedKeyGenerator].getName}\t${classOf[SimpleKeyGenerator].getName}")) + } + + @Test + def testNoKeyGenToSimpleKeyGen(): Unit = { + val _spark = spark + import _spark.implicits._ + val df = Seq((1, "a1", 10, 1000, "2021-10-16")).toDF("id", "name", "value", "ts", "dt") + val options = Map( + DataSourceWriteOptions.RECORDKEY_FIELD.key -> "id", + DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "ts", + DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "dt", + HoodieTableConfig.TABLE_FORMAT.key -> "test-format" + ) + + // case 1: When commit C1 specifies a key generator and commit C2 does not specify a key generator + val (tableName1, tablePath1) = ("hoodie_test_params_1", s"$tempBasePath" + "_1") + + df.write.format("hudi") + .options(options) + .option(HoodieWriteConfig.TBL_NAME.key, tableName1) + .mode(SaveMode.Overwrite).save(tablePath1) + + val df2 = Seq((2, "a2", 20, 1000, "2021-10-16")).toDF("id", "name", "value", "ts", "dt") + // No exception should be raised + try { + df2.write.format("hudi") + .options(options) + .option(HoodieWriteConfig.TBL_NAME.key, tableName1) + .option(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key, classOf[SimpleKeyGenerator].getName) + .mode(SaveMode.Append).save(tablePath1) + } catch { + case _: Throwable => fail("Switching from no keygen to explicit SimpleKeyGenerator should not fail"); + } + } + + @Test + def testSimpleKeyGenToNoKeyGen(): Unit = { + val _spark = spark + import _spark.implicits._ + val df = Seq((1, "a1", 10, 1000, "2021-10-16")).toDF("id", "name", "value", "ts", "dt") + val options = Map( + DataSourceWriteOptions.RECORDKEY_FIELD.key -> "id", + DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "ts", + DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "dt", + HoodieTableConfig.TABLE_FORMAT.key -> "test-format" + ) + + // case 1: When commit C1 specifies a key generator and commit C2 does not specify a key generator + val (tableName1, tablePath1) = ("hoodie_test_params_1", s"$tempBasePath" + "_1") + + // the first write needs to
specify the KEYGENERATOR_CLASS_NAME param + df.write.format("hudi") + .options(options) + .option(HoodieWriteConfig.TBL_NAME.key, tableName1) + .option(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key, classOf[SimpleKeyGenerator].getName) + .mode(SaveMode.Overwrite).save(tablePath1) + + val df2 = Seq((2, "a2", 20, 1000, "2021-10-16")).toDF("id", "name", "value", "ts", "dt") + // No exception should be raised when the default keygen is used + try { + df2.write.format("hudi") + .options(options) + .option(HoodieWriteConfig.TBL_NAME.key, tableName1) + .mode(SaveMode.Append).save(tablePath1) + } catch { + case _: Throwable => fail("Switching from explicit SimpleKeyGenerator to default keygen should not fail"); + } + } + + @Test + def testGetOriginKeyGenerator(): Unit = { + // for dataframe write + val m1 = Map( + HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key -> classOf[ComplexKeyGenerator].getName, + HoodieTableConfig.TABLE_FORMAT.key -> "test-format" + ) + val kg1 = HoodieWriterUtils.getOriginKeyGenerator(m1) + assertTrue(kg1 == classOf[ComplexKeyGenerator].getName) + + // for sql write + val m2 = Map( + HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getName, + SqlKeyGenerator.ORIGINAL_KEYGEN_CLASS_NAME -> classOf[SimpleKeyGenerator].getName, + HoodieTableConfig.TABLE_FORMAT.key -> "test-format" + ) + val kg2 = HoodieWriterUtils.getOriginKeyGenerator(m2) + assertTrue(kg2 == classOf[SimpleKeyGenerator].getName) + } + + /** + * Test that you can't have a consistent hashing bucket index on a COW table. + */ + @Test + def testCOWConsistentHashing(): Unit = { + val _spark = spark + import _spark.implicits._ + val df = Seq((1, "a1", 10, 1000, "2021-10-16")).toDF("id", "name", "value", "ts", "dt") + val options = Map( + DataSourceWriteOptions.RECORDKEY_FIELD.key -> "id", + DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "ts", + HoodieIndexConfig.BUCKET_INDEX_ENGINE_TYPE.key -> "CONSISTENT_HASHING", + HoodieIndexConfig.INDEX_TYPE.key -> "BUCKET", + HoodieTableConfig.TABLE_FORMAT.key -> "test-format" + ) + + val (tableName1, tablePath1) = ("hoodie_test_params_1", s"$tempBasePath" + "_1") + val exc = intercept[HoodieException] { + df.write.format("hudi") + .options(options) + .option(HoodieWriteConfig.TBL_NAME.key, tableName1) + .mode(SaveMode.Overwrite).save(tablePath1) + } + assert(exc.getMessage.contains("Consistent hashing bucket index does not work with COW table. 
Use simple bucket index or an MOR table.")) + } + + private def fetchActualSchema(): Schema = { + val tableMetaClient = createMetaClient(spark, tempBasePath) + new TableSchemaResolver(tableMetaClient).getTableAvroSchema(false) + } +} + +object TestHoodieSparkSqlWriterWithTestFormat { + def testDatasourceInsert: java.util.stream.Stream[Arguments] = { + val scenarios = Array( + Seq("COPY_ON_WRITE", true), + Seq("COPY_ON_WRITE", false), + Seq("MERGE_ON_READ", true), + Seq("MERGE_ON_READ", false) + ) + + val parquetScenarios = scenarios.map { + _ :+ "parquet" + } + val orcScenarios = scenarios.map { + _ :+ "orc" + } + val targetScenarios = parquetScenarios ++ orcScenarios + + java.util.Arrays.stream(targetScenarios.map(as => Arguments.arguments(as.map(_.asInstanceOf[AnyRef]): _*))) + } + + def deletePartitionsWildcardTestParams(): java.util.stream.Stream[Arguments] = { + java.util.stream.Stream.of( + Arguments.arguments("*5/03/1*", Seq("2016/03/15")), + Arguments.arguments("2016/03/*", Seq("2015/03/16", "2015/03/17"))) + } + + def bootstrapTestParams(): java.util.stream.Stream[Arguments] = { + java.util.stream.Stream.of( + Arguments.arguments("MERGE_ON_READ", Integer.valueOf(8)), + Arguments.arguments("MERGE_ON_READ", Integer.valueOf(6)), + Arguments.arguments("COPY_ON_WRITE", Integer.valueOf(8)) + ) + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala index 4bfe7ed8afb8..08b989bb3b57 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala @@ -202,8 +202,9 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup snapshot1.cache() assertEquals(300, snapshot1.count()) + metaClient = createMetaClient(spark, basePath) var partitionPaths = FSUtils.getAllPartitionPaths( - new HoodieSparkEngineContext(jsc), storage, HoodieMetadataConfig.newBuilder().build(), basePath) + new HoodieSparkEngineContext(jsc), metaClient, HoodieMetadataConfig.newBuilder().build()) assertTrue(partitionPaths.contains("100/rider-123")) assertTrue(partitionPaths.contains("200/rider-456")) @@ -226,8 +227,7 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup .mode(SaveMode.Overwrite) .save(basePath) - partitionPaths = FSUtils.getAllPartitionPaths( - new HoodieSparkEngineContext(jsc), storage, HoodieMetadataConfig.newBuilder().build(), basePath) + partitionPaths = FSUtils.getAllPartitionPaths(new HoodieSparkEngineContext(jsc), metaClient, HoodieMetadataConfig.newBuilder().build()) assertEquals(partitionPaths.size(), 1) assertEquals(partitionPaths.get(0), "") } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestHoodieMultipleBaseFileFormat.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestHoodieMultipleBaseFileFormat.scala index e7b9fc751a1d..2e98d0b7d591 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestHoodieMultipleBaseFileFormat.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestHoodieMultipleBaseFileFormat.scala @@ -114,7 +114,7 @@ class TestHoodieMultipleBaseFileFormat extends HoodieSparkClientTestBase with Sp val viewManager: FileSystemViewManager = 
FileSystemViewManager.createViewManager( engineContext, metadataConfig, FileSystemViewStorageConfig.newBuilder.build, HoodieCommonConfig.newBuilder.build, - (mc: HoodieTableMetaClient) => HoodieTableMetadata.create(engineContext, mc.getStorage, metadataConfig, basePath)) + (mc: HoodieTableMetaClient) => metaClient.getTableFormat.getMetadataFactory.create(engineContext, mc.getStorage, metadataConfig, basePath)) val fsView: SyncableFileSystemView = viewManager.getFileSystemView(metaClient) val orcFiles = fsView.getAllBaseFiles(DEFAULT_SECOND_PARTITION_PATH).filter(bf => bf.getFileName.endsWith("orc")) assertTrue(orcFiles.count() > 0) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/HoodieSparkSqlTestBase.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/HoodieSparkSqlTestBase.scala index 5fdc2a5b0f34..15b95960097d 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/HoodieSparkSqlTestBase.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/HoodieSparkSqlTestBase.scala @@ -401,7 +401,7 @@ object HoodieSparkSqlTestBase { engineContext, metadataConfig, FileSystemViewStorageConfig.newBuilder.build, HoodieCommonConfig.newBuilder.build, (_: HoodieTableMetaClient) => { - HoodieTableMetadata.create( + metaClient.getTableFormat.getMetadataFactory.create( engineContext, metaClient.getStorage, metadataConfig, metaClient.getBasePath.toString) } ) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/TestRepairsProcedure.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/TestRepairsProcedure.scala index 0ed710c9d0df..53c9d49614fc 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/TestRepairsProcedure.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/TestRepairsProcedure.scala @@ -144,6 +144,7 @@ class TestRepairsProcedure extends HoodieSparkProcedureTestBase { |[hoodie.record.merge.strategy.id,eeb8d96f-b1e4-49fd-bbf8-28ac514178e5,null] |[hoodie.table.checksum,,] |[hoodie.table.create.schema,,] + |[hoodie.table.format,native,null] |[hoodie.table.initial.version,8,8] |[hoodie.table.keygenerator.type,NON_PARTITION,null] |[hoodie.table.name,,] diff --git a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java index f453bf856660..e6fee456585f 100644 --- a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java @@ -123,8 +123,7 @@ public MessageType getStorageSchema(boolean includeMetadataField) { public List getAllPartitionPathsOnStorage() { HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf()); return FSUtils.getAllPartitionPaths(engineContext, - metaClient.getStorage(), - config.getString(META_SYNC_BASE_PATH), + metaClient, config.getBoolean(META_SYNC_USE_FILE_LISTING_FROM_METADATA)); } diff --git a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/util/ManifestFileWriter.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/util/ManifestFileWriter.java index 773eae1db97b..07533ee90087 100644 --- 
a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/util/ManifestFileWriter.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/util/ManifestFileWriter.java @@ -104,7 +104,7 @@ public StoragePath getManifestFolder(boolean useAbsolutePath) { @VisibleForTesting static Stream getLatestBaseFiles(boolean canUseMetadataTable, HoodieEngineContext engContext, HoodieTableMetaClient metaClient, boolean useAbsolutePath) { - List partitions = FSUtils.getAllPartitionPaths(engContext, metaClient.getStorage(), metaClient.getBasePath(), canUseMetadataTable); + List partitions = FSUtils.getAllPartitionPaths(engContext, metaClient, canUseMetadataTable); LOG.info("Retrieve all partitions: {}", partitions.size()); HoodieTableFileSystemView fsView = null; try { diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java index 1308687079a1..7dca20c9666f 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java @@ -734,10 +734,9 @@ private boolean checkMetadataTableIsAvailable() { List validatePartitions(HoodieSparkEngineContext engineContext, StoragePath basePath, HoodieTableMetaClient metaClient) { // compare partitions HoodieTimeline completedTimeline = metaClient.getCommitsTimeline().filterCompletedInstants(); - List allPartitionPathsFromFS = getPartitionsFromFileSystem(engineContext, basePath, metaClient.getStorage(), - completedTimeline); + List allPartitionPathsFromFS = getPartitionsFromFileSystem(engineContext, metaClient, completedTimeline); - List allPartitionPathsMeta = getPartitionsFromMDT(engineContext, basePath, metaClient.getStorage()); + List allPartitionPathsMeta = getPartitionsFromMDT(engineContext, metaClient); Collections.sort(allPartitionPathsFromFS); Collections.sort(allPartitionPathsMeta); @@ -801,20 +800,18 @@ Option getPartitionCreationInstant(HoodieStorage storage, StoragePath ba } @VisibleForTesting - List getPartitionsFromMDT(HoodieEngineContext engineContext, StoragePath basePath, - HoodieStorage storage) { - return FSUtils.getAllPartitionPaths(engineContext, storage, basePath, true); + List getPartitionsFromMDT(HoodieEngineContext engineContext, HoodieTableMetaClient metaClient) { + return FSUtils.getAllPartitionPaths(engineContext, metaClient, true); } @VisibleForTesting - List getPartitionsFromFileSystem(HoodieEngineContext engineContext, StoragePath basePath, - HoodieStorage storage, HoodieTimeline completedTimeline) { - List allPartitionPathsFromFS = FSUtils.getAllPartitionPaths(engineContext, storage, basePath, false); + List getPartitionsFromFileSystem(HoodieEngineContext engineContext, HoodieTableMetaClient metaClient, HoodieTimeline completedTimeline) { + List allPartitionPathsFromFS = FSUtils.getAllPartitionPaths(engineContext, metaClient, false); // ignore partitions created by uncommitted ingestion. 
return allPartitionPathsFromFS.stream().parallel().filter(part -> { HoodiePartitionMetadata hoodiePartitionMetadata = - new HoodiePartitionMetadata(storage, FSUtils.constructAbsolutePath(basePath, part)); + new HoodiePartitionMetadata(metaClient.getStorage(), FSUtils.constructAbsolutePath(metaClient.getBasePath().toString(), part)); Option instantOption = hoodiePartitionMetadata.readPartitionCreatedCommitTime(); if (instantOption.isPresent()) { String instantTime = instantOption.get(); @@ -1758,7 +1755,7 @@ public HoodieMetadataValidationContext( FileSystemViewStorageConfig viewConf = FileSystemViewStorageConfig.newBuilder().fromProperties(props).build(); ValidationUtils.checkArgument(viewConf.getStorageType().name().equals(viewStorageType), "View storage type not reflected"); HoodieCommonConfig commonConfig = HoodieCommonConfig.newBuilder().fromProperties(props).build(); - this.tableMetadata = HoodieTableMetadata.create( + this.tableMetadata = metaClient.getTableFormat().getMetadataFactory().create( engineContext, metaClient.getStorage(), metadataConfig, metaClient.getBasePath().toString()); this.fileSystemView = getFileSystemView(engineContext, metaClient, metadataConfig, viewConf, commonConfig); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieSnapshotExporter.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieSnapshotExporter.java index 0a09e6783417..d42e0cccf866 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieSnapshotExporter.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieSnapshotExporter.java @@ -184,7 +184,8 @@ private Option getLatestCommitTimestamp(HoodieTableMetaClient tableMetad private List getPartitions(HoodieEngineContext engineContext, Config cfg, HoodieStorage storage) { - return FSUtils.getAllPartitionPaths(engineContext, storage, cfg.sourceBasePath, true); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setBasePath(cfg.sourceBasePath).setConf(storage.getConf()).build(); + return FSUtils.getAllPartitionPaths(engineContext, metaClient, true); } private void createSuccessTag(FileSystem fs, Config cfg) throws IOException { diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/TableSizeStats.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/TableSizeStats.java index 90267c8c0b8d..5ccc7ac7b128 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/TableSizeStats.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/TableSizeStats.java @@ -278,7 +278,10 @@ private void logTableStats(String basePath, LocalDate[] dateInterval) throws IOE .build(); HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); StorageConfiguration storageConf = HadoopFSUtils.getStorageConfWithCopy(jsc.hadoopConfiguration()); - HoodieTableMetadata tableMetadata = HoodieTableMetadata.create( + HoodieTableMetaClient metaClientLocal = HoodieTableMetaClient.builder() + .setBasePath(basePath) + .setConf(storageConf.newInstance()).build(); + HoodieTableMetadata tableMetadata = metaClientLocal.getTableFormat().getMetadataFactory().create( engineContext, new HoodieHadoopStorage(basePath, storageConf), metadataConfig, basePath); List allPartitions = tableMetadata.getAllPartitionPaths(); @@ -313,9 +316,6 @@ private void logTableStats(String basePath, LocalDate[] dateInterval) throws IOE || (endDate == null && (partitionDate.isEqual(startDate) || partitionDate.isAfter(startDate))) || (startDate == null && 
partitionDate.isBefore(endDate)) || (startDate != null && endDate != null && ((partitionDate.isEqual(startDate) || partitionDate.isAfter(startDate)) && partitionDate.isBefore(endDate)))) { - HoodieTableMetaClient metaClientLocal = HoodieTableMetaClient.builder() - .setBasePath(basePath) - .setConf(storageConf.newInstance()).build(); HoodieMetadataConfig metadataConfig1 = HoodieMetadataConfig.newBuilder() .enable(false) .build(); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/perf/TimelineServerPerf.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/perf/TimelineServerPerf.java index b1043efb363e..59c9d9a8b2e7 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/perf/TimelineServerPerf.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/perf/TimelineServerPerf.java @@ -32,7 +32,6 @@ import org.apache.hudi.hadoop.fs.HadoopFSUtils; import org.apache.hudi.storage.HoodieStorage; import org.apache.hudi.storage.StoragePath; -import org.apache.hudi.storage.hadoop.HoodieHadoopStorage; import org.apache.hudi.timeline.service.TimelineService; import org.apache.hudi.utilities.UtilHelpers; @@ -95,12 +94,6 @@ private void setHostAddrFromSparkConf(SparkConf sparkConf) { public void run() throws IOException { JavaSparkContext jsc = UtilHelpers.buildSparkContext("hudi-view-perf-" + cfg.basePath, cfg.sparkMaster); HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); - List allPartitionPaths = FSUtils.getAllPartitionPaths( - engineContext, new HoodieHadoopStorage(cfg.basePath, engineContext.getStorageConf()), - cfg.basePath, cfg.useFileListingFromMetadata); - Collections.shuffle(allPartitionPaths); - List selected = allPartitionPaths.stream().filter(p -> !p.contains("error")).limit(cfg.maxPartitions) - .collect(Collectors.toList()); if (!useExternalTimelineServer) { this.timelineServer.startService(); @@ -114,6 +107,11 @@ engineContext, new HoodieHadoopStorage(cfg.basePath, engineContext.getStorageCon .setConf(timelineServer.getStorageConf().newInstance()) .setBasePath(cfg.basePath) .setLoadActiveTimelineOnLoad(true).build(); + + List allPartitionPaths = FSUtils.getAllPartitionPaths(engineContext, metaClient, cfg.useFileListingFromMetadata); + Collections.shuffle(allPartitionPaths); + List selected = allPartitionPaths.stream().filter(p -> !p.contains("error")).limit(cfg.maxPartitions) + .collect(Collectors.toList()); SyncableFileSystemView fsView = new RemoteHoodieTableFileSystemView(this.hostAddr, cfg.serverPort, metaClient); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/BootstrapExecutor.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/BootstrapExecutor.java index ae697dff1730..749be2e83068 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/BootstrapExecutor.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/BootstrapExecutor.java @@ -209,6 +209,7 @@ private void initializeTable() throws IOException { .setRecordKeyFields(props.getString(RECORDKEY_FIELD_NAME.key())) .setPreCombineField(props.getString(PRECOMBINE_FIELD_NAME.key(), null)) .setTableVersion(ConfigUtils.getIntWithAltKeys(props, WRITE_TABLE_VERSION)) + .setTableFormat(props.getString(HoodieTableConfig.TABLE_FORMAT.key(), HoodieTableConfig.TABLE_FORMAT.defaultValue())) .setPopulateMetaFields(props.getBoolean( POPULATE_META_FIELDS.key(), POPULATE_META_FIELDS.defaultValue())) .setArchiveLogFolder(props.getString( diff --git 
a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamSync.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamSync.java index efd2bed0d1bb..32e6945d3116 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamSync.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamSync.java @@ -456,6 +456,7 @@ HoodieTableMetaClient initializeEmptyTable(HoodieTableMetaClient.TableBuilder ta Boolean.parseBoolean(HIVE_STYLE_PARTITIONING_ENABLE.defaultValue()))) .setUrlEncodePartitioning(props.getBoolean(URL_ENCODE_PARTITIONING.key(), Boolean.parseBoolean(URL_ENCODE_PARTITIONING.defaultValue()))) + .setTableFormat(props.getProperty(HoodieTableConfig.TABLE_FORMAT.key(), HoodieTableConfig.TABLE_FORMAT.defaultValue())) .initTable(storageConf, cfg.targetBasePath); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieMetadataTableValidator.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieMetadataTableValidator.java index 5d578a73dce8..c698070d4fc7 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieMetadataTableValidator.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieMetadataTableValidator.java @@ -1231,12 +1231,12 @@ void setPartitionCreationTime(Option partitionCreationTime) { } @Override - List getPartitionsFromFileSystem(HoodieEngineContext engineContext, StoragePath basePath, HoodieStorage storage, HoodieTimeline completedTimeline) { + List getPartitionsFromFileSystem(HoodieEngineContext engineContext, HoodieTableMetaClient metaClient, HoodieTimeline completedTimeline) { return fsPartitionsToReturn; } @Override - List getPartitionsFromMDT(HoodieEngineContext engineContext, StoragePath basePath, HoodieStorage storage) { + List getPartitionsFromMDT(HoodieEngineContext engineContext, HoodieTableMetaClient metaClient) { return metadataPartitionsToReturn; } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamer.java index cfbd4170e178..49252565d94a 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamer.java @@ -3197,8 +3197,7 @@ public void testBulkInsertWithUserDefinedPartitioner() throws Exception { syncAndAssertRecordCount(cfg, 1000, tableBasePath, "00000", 1); HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setBasePath(tableBasePath).setConf(HoodieTestUtils.getDefaultStorageConf()).build(); - List partitions = FSUtils.getAllPartitionPaths( - new HoodieLocalEngineContext(metaClient.getStorageConf()), metaClient.getStorage(), metaClient.getBasePath(), false); + List partitions = FSUtils.getAllPartitionPaths(new HoodieLocalEngineContext(metaClient.getStorageConf()), metaClient, false); StorageConfiguration hadoopConf = metaClient.getStorageConf(); HoodieLocalEngineContext engContext = new HoodieLocalEngineContext(hadoopConf); HoodieTableFileSystemView fsView = HoodieTableFileSystemView.fileListingBasedFileSystemView(engContext, metaClient, diff --git a/rfc/rfc-93/rfc-93.md b/rfc/rfc-93/rfc-93.md index cefd3198aec0..3a5f0aef540b 100644 --- a/rfc/rfc-93/rfc-93.md +++ b/rfc/rfc-93/rfc-93.md @@ -27,7 +27,7 @@ ## Status -JIRA: +JIRA: https://issues.apache.org/jira/browse/HUDI-9332 ## Abstract @@ 
-57,7 +57,7 @@ Some non-technical reasons: ## **Implementation** -The main implementation step here is to create abstraction called TableFormatPlugin which handles table format operations such as +The main implementation step here is to create an abstraction called `HoodieTableFormat` which handles table format operations such as 1. Committing writes @@ -68,7 +68,8 @@ The main implementation step here is to create abstraction called TableFormatPlu 6. Rollbacks -The Hudi platform is responsible for managing the data path and can be configured with the table format plugin by default using Hudi native table format. Other table formats will have their own implementation of this abstraction. +The Hudi platform is responsible for managing the data path and can be configured with a table format plugin; by default it continues to use Hudi's native table format. +Other table formats will have their own implementation of this abstraction. ### Commit Protocol: @@ -80,7 +81,7 @@ Hudi uses the creation of an action-complete file (e:g .commit, .deltacommit, .c The Hudi timeline is still very much used for all internal operations and the table format's commit metadata will be an overlay on top of this. With this, the action is only completed when both the above steps are completed. The plugin provided timeline needs to fence the timeline ensuring the definition of complete stays consistent. This ensures the snapshot isolation is maintained. -There will be a hudi table-property "hudi.table.format.plugin" to identify the table format with default being "native". This allows consistency in plugin behaviors across all hudi writers. +There will be a Hudi table property "hudi.table.format" to identify the table format, with the default being "native". This ensures consistent plugin behavior across all Hudi writers. ### Metadata:
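
---

Editor's note: to make the RFC's abstraction concrete, the sketch below is inferred purely from the call sites visible in this diff, where callers obtain the plugin via `metaClient.getTableFormat()` and build the metadata-table reader via `getMetadataFactory().create(engineContext, storage, metadataConfig, basePath)`, and from the RFC's responsibility list ("Committing writes", ..., "Rollbacks"). It is illustrative only, not the PR's authoritative interface definition: the nested `MetadataFactory` name and the exact `commit` signature are assumptions.

```java
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.view.FileSystemViewManager;
import org.apache.hudi.metadata.HoodieTableMetadata;
import org.apache.hudi.storage.HoodieStorage;

/**
 * Illustrative sketch of the table-format plugin surface implied by the call
 * sites in this diff; not the authoritative interface definition.
 */
public interface HoodieTableFormat {

  /** Placeholder name for whatever getMetadataFactory() actually returns. */
  interface MetadataFactory {
    // Mirrors the call sites in the diff:
    // getTableFormat().getMetadataFactory().create(engineContext, metaClient.getStorage(), metadataConfig, basePath)
    HoodieTableMetadata create(HoodieEngineContext engineContext,
                               HoodieStorage storage,
                               HoodieMetadataConfig metadataConfig,
                               String basePath);
  }

  // Supplies the metadata-table reader used for partition listings and stats.
  MetadataFactory getMetadataFactory();

  // Assumed hook for responsibility 1 ("Committing writes"): invoked as an
  // instant transitions to complete, so the format can publish its own commit
  // metadata as an overlay on top of the Hudi timeline.
  void commit(HoodieCommitMetadata commitMetadata,
              HoodieInstant completedInstant,
              HoodieEngineContext engineContext,
              HoodieTableMetaClient metaClient,
              FileSystemViewManager viewManager);
}
```

Selecting a format is then plain table configuration: the tests in this diff set `HoodieTableConfig.TABLE_FORMAT.key -> "test-format"` at write time, and the choice is persisted in `hoodie.properties` as `hoodie.table.format` with default `native`, matching the TestRepairsProcedure expectation above.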