Spark 3.3: Add Default Parallelism Level for All Spark Driver Based Deletes #6588
In SparkSQLProperties, the PR adds the new property key and its default:

@@ -47,4 +47,8 @@ private SparkSQLProperties() {}
   public static final String PRESERVE_DATA_GROUPING =
       "spark.sql.iceberg.planning.preserve-data-grouping";
   public static final boolean PRESERVE_DATA_GROUPING_DEFAULT = false;
+
+  // Controls how many physical file deletes to execute in parallel when not otherwise specified
+  public static final String DELETE_PARALLELISM = "driver-delete-default-parallelism";
+  public static final String DELETE_PARALLELISM_DEFAULT = "25";
 }
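The property is read through the Spark session conf, so it can be overridden per session. Note that, as defined above, the key is used verbatim (it carries no spark.sql.iceberg. prefix). A minimal sketch of overriding the default; the session setup is illustrative, while the key and default come from the diff above:

import org.apache.spark.sql.SparkSession;

public class DeleteParallelismExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[*]") // illustrative local session
        .appName("delete-parallelism-example")
        .getOrCreate();

    // Raise the driver-side delete pool from the default of 25 threads to 50.
    // The key matches SparkSQLProperties.DELETE_PARALLELISM defined above.
    spark.conf().set("driver-delete-default-parallelism", "50");

    spark.stop();
  }
}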
In the shared Spark action code, two imports are added to support the conf lookup and the worker pool used by the new helper:

@@ -56,9 +56,11 @@
 import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 import org.apache.iceberg.spark.JobGroupInfo;
 import org.apache.iceberg.spark.JobGroupUtils;
+import org.apache.iceberg.spark.SparkSQLProperties;
 import org.apache.iceberg.spark.SparkTableUtil;
 import org.apache.iceberg.spark.source.SerializableTableWithSize;
 import org.apache.iceberg.util.Tasks;
+import org.apache.iceberg.util.ThreadPools;
 import org.apache.spark.SparkContext;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.FlatMapFunction;
@@ -218,6 +220,30 @@ private Dataset<FileInfo> toFileInfoDS(List<String> paths, String type) {
     return spark.createDataset(fileInfoList, FileInfo.ENCODER);
   }
 
+  /**
+   * Uses a fixed thread pool with {@link SparkSQLProperties#DELETE_PARALLELISM} when the
+   * executorService passed to this function is null.
+   */
+  protected void withDefaultDeleteService(ExecutorService executorService, Consumer<ExecutorService> func) {
+    ExecutorService deleteService = executorService;
+
+    boolean createdDefaultDeleteService = false;
+    if (deleteService == null) {
+      int numThreads = Integer.parseInt(
+          spark.conf().get(
+              SparkSQLProperties.DELETE_PARALLELISM,
+              SparkSQLProperties.DELETE_PARALLELISM_DEFAULT));
+      deleteService = ThreadPools.newWorkerPool(this + "-default-delete-service", numThreads);
+      createdDefaultDeleteService = true;
+    }
+
+    func.accept(deleteService);
+
+    if (createdDefaultDeleteService) {
+      deleteService.shutdown();
+    }
+  }
+
   /**
    * Deletes files and keeps track of how many files were removed for each file type.
    *
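The helper follows a borrow-or-create pattern: it reuses the caller's executor when one is supplied and only creates (and later shuts down) a pool of its own when passed null. A standalone sketch of the same pattern with plain java.util.concurrent, using a hard-coded pool size in place of the Spark conf lookup:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.function.Consumer;

public class DefaultServiceSketch {
  private static final int DEFAULT_THREADS = 25; // stand-in for the conf lookup

  static void withDefaultService(ExecutorService executorService, Consumer<ExecutorService> func) {
    ExecutorService service = executorService;
    boolean createdDefault = false;
    if (service == null) {
      service = Executors.newFixedThreadPool(DEFAULT_THREADS);
      createdDefault = true;
    }

    func.accept(service);

    // Shut down only pools created here; a caller-supplied pool stays under the caller's control.
    if (createdDefault) {
      service.shutdown();
    }
  }

  public static void main(String[] args) {
    // Passing null exercises the default-pool branch.
    withDefaultService(null, service ->
        service.submit(() -> System.out.println("delete task running")));
  }
}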
|
@@ -231,24 +257,27 @@ protected DeleteSummary deleteFiles( | |
|
|
||
| DeleteSummary summary = new DeleteSummary(); | ||
|
|
||
| Tasks.foreach(files) | ||
| .retry(DELETE_NUM_RETRIES) | ||
| .stopRetryOn(NotFoundException.class) | ||
| .suppressFailureWhenFinished() | ||
| .executeWith(executorService) | ||
| .onFailure( | ||
| (fileInfo, exc) -> { | ||
| String path = fileInfo.getPath(); | ||
| String type = fileInfo.getType(); | ||
| LOG.warn("Delete failed for {}: {}", type, path, exc); | ||
| }) | ||
| .run( | ||
| fileInfo -> { | ||
| String path = fileInfo.getPath(); | ||
| String type = fileInfo.getType(); | ||
| deleteFunc.accept(path); | ||
| summary.deletedFile(path, type); | ||
| }); | ||
|
|
||
| withDefaultDeleteService(executorService, (deleteService) -> { | ||
|
||
| Tasks.foreach(files) | ||
| .retry(DELETE_NUM_RETRIES) | ||
| .stopRetryOn(NotFoundException.class) | ||
| .suppressFailureWhenFinished() | ||
| .executeWith(deleteService) | ||
| .onFailure( | ||
| (fileInfo, exc) -> { | ||
| String path = fileInfo.getPath(); | ||
| String type = fileInfo.getType(); | ||
| LOG.warn("Delete failed for {}: {}", type, path, exc); | ||
| }) | ||
| .run( | ||
| fileInfo -> { | ||
| String path = fileInfo.getPath(); | ||
| String type = fileInfo.getType(); | ||
| deleteFunc.accept(path); | ||
| summary.deletedFile(path, type); | ||
| }); | ||
| }); | ||
|
|
||
| return summary; | ||
| } | ||
|
|
||
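The deleteFiles body is otherwise unchanged: retries, the NotFoundException stop condition, and failure logging all still apply; only the source of the executor differs. For reference, a self-contained sketch of how Iceberg's Tasks utility drives parallel deletes; the paths and the println are stand-ins for real file IO:

import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.iceberg.util.Tasks;

public class TasksDeleteSketch {
  public static void main(String[] args) {
    List<String> paths = Arrays.asList("s3://bucket/data/a.parquet", "s3://bucket/data/b.parquet");
    ExecutorService pool = Executors.newFixedThreadPool(4);

    Tasks.foreach(paths)
        .retry(3) // retry each delete up to three times
        .suppressFailureWhenFinished() // log failures instead of failing the whole run
        .executeWith(pool) // fan deletes out across the pool
        .onFailure((path, exc) -> System.err.println("Delete failed for " + path))
        .run(path -> System.out.println("deleting " + path)); // stand-in for deleteFunc.accept(path)

    pool.shutdown();
  }
}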
The orphan-file cleanup action applies the same wrapper to its delete loop:

@@ -246,12 +246,13 @@ private DeleteOrphanFiles.Result doExecute() {
     List<String> orphanFiles =
         findOrphanFiles(spark(), actualFileIdentDS, validFileIdentDS, prefixMismatchMode);
 
-    Tasks.foreach(orphanFiles)
-        .noRetry()
-        .executeWith(deleteExecutorService)
-        .suppressFailureWhenFinished()
-        .onFailure((file, exc) -> LOG.warn("Failed to delete file: {}", file, exc))
-        .run(deleteFunc::accept);
+    withDefaultDeleteService(deleteExecutorService, deleteService ->
+        Tasks.foreach(orphanFiles)
+            .noRetry()
+            .executeWith(deleteService)
+            .suppressFailureWhenFinished()
+            .onFailure((file, exc) -> LOG.warn("Failed to delete file: {}", file, exc))
+            .run(deleteFunc::accept));
 
     return new BaseDeleteOrphanFilesActionResult(orphanFiles);
   }
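From the caller's side nothing changes; the new default only kicks in when no executor is supplied. A sketch of both invocation styles, assuming the standard SparkActions entry point and an already-loaded table:

import java.util.concurrent.ExecutorService;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.DeleteOrphanFiles;
import org.apache.iceberg.spark.actions.SparkActions;

public class OrphanCleanupSketch {
  // `table` is assumed to be an already-loaded Iceberg table.
  static DeleteOrphanFiles.Result withCallerPool(Table table, ExecutorService pool) {
    // Explicit executor: behaves exactly as before this PR.
    return SparkActions.get().deleteOrphanFiles(table).executeDeleteWith(pool).execute();
  }

  static DeleteOrphanFiles.Result withDefaultPool(Table table) {
    // No executeDeleteWith(...): with this PR the action falls back to a pool
    // sized by driver-delete-default-parallelism instead of deleting serially.
    return SparkActions.get().deleteOrphanFiles(table).execute();
  }
}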