diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java
index c3a4ec3bd4697..4785e308ee813 100644
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java
@@ -70,6 +70,7 @@
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SQLContext;
 import org.apache.spark.sql.sources.BaseRelation;
+import org.apache.spark.storage.StorageLevel;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -122,6 +123,8 @@ public HoodieWriteMetadata<HoodieData<WriteStatus>> performClustering(final Hood
         .stream();
     JavaRDD<WriteStatus>[] writeStatuses = convertStreamToArray(writeStatusesStream.map(HoodieJavaRDD::getJavaRDD));
     JavaRDD<WriteStatus> writeStatusRDD = engineContext.union(writeStatuses);
+    // Persist writeStatus, since it may be reused
+    writeStatusRDD.persist(StorageLevel.MEMORY_AND_DISK());
     HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
     writeMetadata.setWriteStatuses(HoodieJavaRDD.of(writeStatusRDD));
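
For context, a minimal standalone sketch of the Spark pattern this patch relies on: once an RDD is persisted, the first action materializes it and populates the cache, and later actions read the cached blocks instead of re-running the whole lineage. The class name, sample data, and the map stage below are illustrative stand-ins, not from the Hudi codebase; only `persist(StorageLevel.MEMORY_AND_DISK())` mirrors the patch.

```java
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;

import java.util.Arrays;
import java.util.Locale;

public class PersistSketch {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local[*]", "persist-sketch");

    // Stand-in for the unioned write-status RDD: some lineage that is
    // expensive to recompute (in the real strategy, the clustering jobs).
    JavaRDD<String> statuses = sc
        .parallelize(Arrays.asList("ok", "ok", "failed"))
        .map(s -> s.toUpperCase(Locale.ROOT));

    // Same call as the patch: keep blocks in memory, spill to disk if needed.
    statuses.persist(StorageLevel.MEMORY_AND_DISK());

    // The first action materializes the RDD and populates the cache ...
    long total = statuses.count();
    // ... and the second action is served from the cache instead of
    // re-executing the map stage above.
    long failed = statuses.filter(s -> s.equals("FAILED")).count();
    System.out.println(total + " statuses, " + failed + " failed");

    // Release the cached blocks once no further reuse is expected.
    statuses.unpersist();
    sc.stop();
  }
}
```

One design note: the hunk shown here persists the RDD but does not show a matching `unpersist()`, so the cached blocks presumably live until they are evicted or the downstream code (outside this hunk) releases them.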