-
Notifications
You must be signed in to change notification settings - Fork 3k
Spark-3.4: Harmonize RewriteDataFilesSparkAction #7630
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,7 +22,6 @@ | |
| import java.math.RoundingMode; | ||
| import java.util.Arrays; | ||
| import java.util.Collection; | ||
| import java.util.Comparator; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Set; | ||
|
|
@@ -84,6 +83,9 @@ public class RewriteDataFilesSparkAction | |
| USE_STARTING_SEQUENCE_NUMBER, | ||
| REWRITE_JOB_ORDER); | ||
|
|
||
| private static final RewriteDataFilesSparkAction.Result EMPTY_RESULT = | ||
| ImmutableRewriteDataFiles.Result.builder().rewriteResults(ImmutableList.of()).build(); | ||
|
|
||
| private final Table table; | ||
|
|
||
| private Expression filter = Expressions.alwaysTrue(); | ||
|
|
@@ -147,7 +149,7 @@ public RewriteDataFilesSparkAction filter(Expression expression) { | |
| @Override | ||
| public RewriteDataFiles.Result execute() { | ||
| if (table.currentSnapshot() == null) { | ||
| return ImmutableRewriteDataFiles.Result.builder().rewriteResults(ImmutableList.of()).build(); | ||
| return EMPTY_RESULT; | ||
| } | ||
|
|
||
| long startingSnapshotId = table.currentSnapshot().snapshotId(); | ||
|
|
@@ -159,26 +161,25 @@ public RewriteDataFiles.Result execute() { | |
|
|
||
| validateAndInitOptions(); | ||
|
|
||
| Map<StructLike, List<List<FileScanTask>>> fileGroupsByPartition = | ||
| StructLikeMap<List<List<FileScanTask>>> fileGroupsByPartition = | ||
| planFileGroups(startingSnapshotId); | ||
| RewriteExecutionContext ctx = new RewriteExecutionContext(fileGroupsByPartition); | ||
|
|
||
| if (ctx.totalGroupCount() == 0) { | ||
| LOG.info("Nothing found to rewrite in {}", table.name()); | ||
| return ImmutableRewriteDataFiles.Result.builder().rewriteResults(ImmutableList.of()).build(); | ||
| return EMPTY_RESULT; | ||
| } | ||
|
|
||
| Stream<RewriteFileGroup> groupStream = toGroupStream(ctx, fileGroupsByPartition); | ||
|
|
||
| RewriteDataFilesCommitManager commitManager = commitManager(startingSnapshotId); | ||
| if (partialProgressEnabled) { | ||
| return doExecuteWithPartialProgress(ctx, groupStream, commitManager); | ||
| return doExecuteWithPartialProgress(ctx, groupStream, commitManager(startingSnapshotId)); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess I'd personally put the variable back; in the RewriteDeleteFilesSparkAction case it was just a call without arguments, but here there are arguments, so it is probably clearer if we assign it separately — not a big deal.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I too thought about it during the change. I will leave it as it is (because the style looks similar to RewriteDeleteFilesSparkAction) |
||
| } else { | ||
| return doExecute(ctx, groupStream, commitManager); | ||
| return doExecute(ctx, groupStream, commitManager(startingSnapshotId)); | ||
| } | ||
| } | ||
|
|
||
| Map<StructLike, List<List<FileScanTask>>> planFileGroups(long startingSnapshotId) { | ||
| StructLikeMap<List<List<FileScanTask>>> planFileGroups(long startingSnapshotId) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can't comment on the exact line, but on lines 196-304 below, you had initially done some refactoring there. I see it was reverted because @aokolnychyi said we can't coerce the partition to change the behavior in this discussion: #7630 (comment). That is correct, but we can still do some refactoring without coerce. I took a brief look and came up with this to maintain the logic: first add the methods, and then we can make this method like:
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think I tried it before. Have you checked this comment? If I use
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Yeah, I saw that — hence my suggestion did not use coerce, but rather the table's latest partitionType. It should be exactly the same as the existing code now?
Yeah, in RewritePositionDeleteFilesSparkAction we've moved that check into toGroupStream, which I think you also copied here.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I can see this now. SGTM. |
||
| CloseableIterable<FileScanTask> fileScanTasks = | ||
| table | ||
| .newScan() | ||
|
|
@@ -189,43 +190,9 @@ Map<StructLike, List<List<FileScanTask>>> planFileGroups(long startingSnapshotId | |
|
|
||
| try { | ||
| StructType partitionType = table.spec().partitionType(); | ||
| StructLikeMap<List<FileScanTask>> filesByPartition = StructLikeMap.create(partitionType); | ||
| StructLike emptyStruct = GenericRecord.create(partitionType); | ||
|
|
||
| fileScanTasks.forEach( | ||
| task -> { | ||
| // If a task uses an incompatible partition spec the data inside could contain values | ||
| // which | ||
| // belong to multiple partitions in the current spec. Treating all such files as | ||
| // un-partitioned and | ||
| // grouping them together helps to minimize new files made. | ||
| StructLike taskPartition = | ||
| task.file().specId() == table.spec().specId() | ||
| ? task.file().partition() | ||
| : emptyStruct; | ||
|
|
||
| List<FileScanTask> files = filesByPartition.get(taskPartition); | ||
| if (files == null) { | ||
| files = Lists.newArrayList(); | ||
| } | ||
|
|
||
| files.add(task); | ||
| filesByPartition.put(taskPartition, files); | ||
| }); | ||
|
|
||
| StructLikeMap<List<List<FileScanTask>>> fileGroupsByPartition = | ||
| StructLikeMap.create(partitionType); | ||
|
|
||
| filesByPartition.forEach( | ||
| (partition, tasks) -> { | ||
| Iterable<List<FileScanTask>> plannedFileGroups = rewriter.planFileGroups(tasks); | ||
| List<List<FileScanTask>> fileGroups = ImmutableList.copyOf(plannedFileGroups); | ||
| if (fileGroups.size() > 0) { | ||
| fileGroupsByPartition.put(partition, fileGroups); | ||
| } | ||
| }); | ||
|
|
||
| return fileGroupsByPartition; | ||
| StructLikeMap<List<FileScanTask>> filesByPartition = | ||
| groupByPartition(partitionType, fileScanTasks); | ||
| return fileGroupsByPartition(filesByPartition); | ||
| } finally { | ||
| try { | ||
| fileScanTasks.close(); | ||
|
|
@@ -235,6 +202,38 @@ Map<StructLike, List<List<FileScanTask>>> planFileGroups(long startingSnapshotId | |
| } | ||
| } | ||
|
|
||
| private StructLikeMap<List<FileScanTask>> groupByPartition( | ||
| StructType partitionType, Iterable<FileScanTask> tasks) { | ||
| StructLikeMap<List<FileScanTask>> filesByPartition = StructLikeMap.create(partitionType); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should continue to use
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That's strange. So, we don't have enough test cases to catch this behaviour? I have reverted it back now. Also, I didn't use |
||
| StructLike emptyStruct = GenericRecord.create(partitionType); | ||
|
|
||
| for (FileScanTask task : tasks) { | ||
| // If a task uses an incompatible partition spec the data inside could contain values | ||
| // which belong to multiple partitions in the current spec. Treating all such files as | ||
| // un-partitioned and grouping them together helps to minimize new files made. | ||
| StructLike taskPartition = | ||
| task.file().specId() == table.spec().specId() ? task.file().partition() : emptyStruct; | ||
|
|
||
| List<FileScanTask> files = filesByPartition.get(taskPartition); | ||
| if (files == null) { | ||
| files = Lists.newArrayList(); | ||
| } | ||
|
|
||
| files.add(task); | ||
| filesByPartition.put(taskPartition, files); | ||
| } | ||
| return filesByPartition; | ||
| } | ||
|
|
||
| private StructLikeMap<List<List<FileScanTask>>> fileGroupsByPartition( | ||
| StructLikeMap<List<FileScanTask>> filesByPartition) { | ||
| return filesByPartition.transformValues(this::planFileGroups); | ||
| } | ||
|
|
||
| private List<List<FileScanTask>> planFileGroups(List<FileScanTask> tasks) { | ||
| return ImmutableList.copyOf(rewriter.planFileGroups(tasks)); | ||
| } | ||
|
|
||
| @VisibleForTesting | ||
| RewriteFileGroup rewriteFiles(RewriteExecutionContext ctx, RewriteFileGroup fileGroup) { | ||
| String desc = jobDesc(fileGroup, ctx); | ||
|
|
@@ -300,7 +299,7 @@ private Result doExecute( | |
|
|
||
| Tasks.foreach(rewrittenGroups) | ||
| .suppressFailureWhenFinished() | ||
| .run(group -> commitManager.abortFileGroup(group)); | ||
| .run(commitManager::abortFileGroup); | ||
| throw e; | ||
| } finally { | ||
| rewriteService.shutdown(); | ||
|
|
@@ -332,14 +331,14 @@ private Result doExecuteWithPartialProgress( | |
| RewriteDataFilesCommitManager commitManager) { | ||
| ExecutorService rewriteService = rewriteService(); | ||
|
|
||
| // Start Commit Service | ||
| // start commit service | ||
| int groupsPerCommit = IntMath.divide(ctx.totalGroupCount(), maxCommits, RoundingMode.CEILING); | ||
| RewriteDataFilesCommitManager.CommitService commitService = | ||
| commitManager.service(groupsPerCommit); | ||
| commitService.start(); | ||
|
|
||
| Collection<FileGroupFailureResult> rewriteFailures = new ConcurrentLinkedQueue<>(); | ||
| // Start rewrite tasks | ||
| // start rewrite tasks | ||
| Tasks.foreach(groupStream) | ||
| .suppressFailureWhenFinished() | ||
| .executeWith(rewriteService) | ||
|
|
@@ -356,7 +355,7 @@ private Result doExecuteWithPartialProgress( | |
| .run(fileGroup -> commitService.offer(rewriteFiles(ctx, fileGroup))); | ||
| rewriteService.shutdown(); | ||
|
|
||
| // Stop Commit service | ||
| // stop commit service | ||
| commitService.close(); | ||
| List<RewriteFileGroup> commitResults = commitService.results(); | ||
| if (commitResults.size() == 0) { | ||
|
|
@@ -377,45 +376,29 @@ private Result doExecuteWithPartialProgress( | |
| } | ||
|
|
||
| Stream<RewriteFileGroup> toGroupStream( | ||
| RewriteExecutionContext ctx, | ||
| Map<StructLike, List<List<FileScanTask>>> fileGroupsByPartition) { | ||
| Stream<RewriteFileGroup> rewriteFileGroupStream = | ||
| fileGroupsByPartition.entrySet().stream() | ||
| .flatMap( | ||
| e -> { | ||
| StructLike partition = e.getKey(); | ||
| List<List<FileScanTask>> fileGroups = e.getValue(); | ||
| return fileGroups.stream() | ||
| .map( | ||
| tasks -> { | ||
| int globalIndex = ctx.currentGlobalIndex(); | ||
| int partitionIndex = ctx.currentPartitionIndex(partition); | ||
| FileGroupInfo info = | ||
| ImmutableRewriteDataFiles.FileGroupInfo.builder() | ||
| .globalIndex(globalIndex) | ||
| .partitionIndex(partitionIndex) | ||
| .partition(partition) | ||
| .build(); | ||
| return new RewriteFileGroup(info, tasks); | ||
| }); | ||
| }); | ||
|
|
||
| return rewriteFileGroupStream.sorted(rewriteGroupComparator()); | ||
| RewriteExecutionContext ctx, Map<StructLike, List<List<FileScanTask>>> groupsByPartition) { | ||
| return groupsByPartition.entrySet().stream() | ||
| .filter(e -> e.getValue().size() != 0) | ||
| .flatMap( | ||
| e -> { | ||
| StructLike partition = e.getKey(); | ||
| List<List<FileScanTask>> scanGroups = e.getValue(); | ||
| return scanGroups.stream().map(tasks -> newRewriteGroup(ctx, partition, tasks)); | ||
| }) | ||
| .sorted(RewriteFileGroup.comparator(rewriteJobOrder)); | ||
| } | ||
|
|
||
| private Comparator<RewriteFileGroup> rewriteGroupComparator() { | ||
| switch (rewriteJobOrder) { | ||
| case BYTES_ASC: | ||
| return Comparator.comparing(RewriteFileGroup::sizeInBytes); | ||
| case BYTES_DESC: | ||
| return Comparator.comparing(RewriteFileGroup::sizeInBytes, Comparator.reverseOrder()); | ||
| case FILES_ASC: | ||
| return Comparator.comparing(RewriteFileGroup::numFiles); | ||
| case FILES_DESC: | ||
| return Comparator.comparing(RewriteFileGroup::numFiles, Comparator.reverseOrder()); | ||
| default: | ||
| return (fileGroupOne, fileGroupTwo) -> 0; | ||
| } | ||
| private RewriteFileGroup newRewriteGroup( | ||
| RewriteExecutionContext ctx, StructLike partition, List<FileScanTask> tasks) { | ||
| int globalIndex = ctx.currentGlobalIndex(); | ||
| int partitionIndex = ctx.currentPartitionIndex(partition); | ||
| FileGroupInfo info = | ||
| ImmutableRewriteDataFiles.FileGroupInfo.builder() | ||
| .globalIndex(globalIndex) | ||
| .partitionIndex(partitionIndex) | ||
| .partition(partition) | ||
| .build(); | ||
| return new RewriteFileGroup(info, tasks); | ||
| } | ||
|
|
||
| void validateAndInitOptions() { | ||
|
|
@@ -495,15 +478,13 @@ private String jobDesc(RewriteFileGroup group, RewriteExecutionContext ctx) { | |
|
|
||
| @VisibleForTesting | ||
| static class RewriteExecutionContext { | ||
| private final Map<StructLike, Integer> numGroupsByPartition; | ||
| private final StructLikeMap<Integer> numGroupsByPartition; | ||
| private final int totalGroupCount; | ||
| private final Map<StructLike, Integer> partitionIndexMap; | ||
| private final AtomicInteger groupIndex; | ||
|
|
||
| RewriteExecutionContext(Map<StructLike, List<List<FileScanTask>>> fileGroupsByPartition) { | ||
| this.numGroupsByPartition = | ||
| fileGroupsByPartition.entrySet().stream() | ||
| .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().size())); | ||
| RewriteExecutionContext(StructLikeMap<List<List<FileScanTask>>> fileGroupsByPartition) { | ||
| this.numGroupsByPartition = fileGroupsByPartition.transformValues(List::size); | ||
| this.totalGroupCount = numGroupsByPartition.values().stream().reduce(Integer::sum).orElse(0); | ||
| this.partitionIndexMap = Maps.newConcurrentMap(); | ||
| this.groupIndex = new AtomicInteger(1); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we also add an
EMPTY_RESULT static variable like in the action for position deletes? There are two return statements in this method (not this line). There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok. Added.