-
Notifications
You must be signed in to change notification settings - Fork 2.9k
Spark: support replace equality deletes to position deletes #2216
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
76f2908
75e24ce
16c8840
844a5ea
3eaa2e8
b57d8f9
74e08e6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -57,4 +57,22 @@ public RewriteFiles rewriteFiles(Set<DataFile> filesToDelete, Set<DataFile> file | |
|
|
||
| return this; | ||
| } | ||
|
|
||
| @Override | ||
| public RewriteFiles rewriteDeletes(Set<DeleteFile> deletesToDelete, Set<DeleteFile> deletesToAdd) { | ||
| Preconditions.checkArgument(deletesToDelete != null && !deletesToDelete.isEmpty(), | ||
| "Files to delete cannot be null or empty"); | ||
| Preconditions.checkArgument(deletesToAdd != null && !deletesToAdd.isEmpty(), | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This check is incorrect, because if all the equality deletes are not hit the data files, then there will be no position delete to produce..
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will suggest to add an unit test for this.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I understand your concern. The check is used to discard the invalid rewrite, we don't want to continue the rewrite if there is no position delete produced. Don't we?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This kind of rewrite is valid actually because it replace all the useless equality files to empty position delete files. After the rewrite action, the normal read path don't have to filter the useless equality deletes again, that will be a great performance improvement. So we have to submit the RewriteFiles transaction here.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could see the validation in the extended RewriteFiles API here ( https://github.com/apache/iceberg/pull/2294/files#diff-b92a78b7fb207d4979d503a442189d9d096e4d19519a4b83eed9e1e779843810R68)
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Make sense to me! I will update then. |
||
| "Files to add can not be null or empty"); | ||
|
|
||
| for (DeleteFile toDelete : deletesToDelete) { | ||
| delete(toDelete); | ||
openinx marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| for (DeleteFile toAdd : deletesToAdd) { | ||
| add(toAdd); | ||
| } | ||
|
|
||
| return this; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,42 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| package org.apache.iceberg.util; | ||
|
|
||
| import java.util.List; | ||
| import java.util.function.Predicate; | ||
|
|
||
| public class ChainOrFilter<T> extends Filter<T> { | ||
| private final List<Predicate<T>> filters; | ||
|
|
||
| public ChainOrFilter(List<Predicate<T>> filters) { | ||
| this.filters = filters; | ||
| } | ||
|
|
||
| @Override | ||
| protected boolean shouldKeep(T item) { | ||
| for (Predicate<T> filter : filters) { | ||
| if (filter.test(item)) { | ||
| return true; | ||
| } | ||
| } | ||
|
|
||
| return false; | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,6 +23,7 @@ | |
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Set; | ||
| import java.util.function.Predicate; | ||
| import org.apache.iceberg.Accessor; | ||
| import org.apache.iceberg.DataFile; | ||
| import org.apache.iceberg.DeleteFile; | ||
|
|
@@ -48,6 +49,8 @@ | |
| import org.apache.iceberg.relocated.com.google.common.collect.Sets; | ||
| import org.apache.iceberg.types.TypeUtil; | ||
| import org.apache.iceberg.types.Types; | ||
| import org.apache.iceberg.util.ChainOrFilter; | ||
| import org.apache.iceberg.util.Filter; | ||
| import org.apache.iceberg.util.StructLikeSet; | ||
| import org.apache.iceberg.util.StructProjection; | ||
| import org.apache.parquet.Preconditions; | ||
|
|
@@ -110,7 +113,42 @@ public CloseableIterable<T> filter(CloseableIterable<T> records) { | |
| return applyEqDeletes(applyPosDeletes(records)); | ||
| } | ||
|
|
||
| private CloseableIterable<T> applyEqDeletes(CloseableIterable<T> records) { | ||
| public CloseableIterable<T> matchEqDeletes(CloseableIterable<T> records) { | ||
openinx marked this conversation as resolved.
Show resolved
Hide resolved
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like this method is mostly the same as
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, in this separate PR, we've abstracted them into a single method. https://github.com/apache/iceberg/pull/2320/files#diff-a6641d31cdfd66835b3447bef04be87786849126b07761e47b852837f67a988aR151 |
||
| if (eqDeletes.isEmpty()) { | ||
| return records; | ||
| } | ||
|
|
||
| Multimap<Set<Integer>, DeleteFile> filesByDeleteIds = Multimaps.newMultimap(Maps.newHashMap(), Lists::newArrayList); | ||
| for (DeleteFile delete : eqDeletes) { | ||
| filesByDeleteIds.put(Sets.newHashSet(delete.equalityFieldIds()), delete); | ||
| } | ||
|
|
||
| List<Predicate<T>> deleteSetFilters = Lists.newArrayList(); | ||
| for (Map.Entry<Set<Integer>, Collection<DeleteFile>> entry : filesByDeleteIds.asMap().entrySet()) { | ||
| Set<Integer> ids = entry.getKey(); | ||
| Iterable<DeleteFile> deletes = entry.getValue(); | ||
|
|
||
| Schema deleteSchema = TypeUtil.select(requiredSchema, ids); | ||
|
|
||
| // a projection to select and reorder fields of the file schema to match the delete rows | ||
| StructProjection projectRow = StructProjection.create(requiredSchema, deleteSchema); | ||
|
|
||
| Iterable<CloseableIterable<Record>> deleteRecords = Iterables.transform(deletes, | ||
| delete -> openDeletes(delete, deleteSchema)); | ||
| StructLikeSet deleteSet = Deletes.toEqualitySet( | ||
| // copy the delete records because they will be held in a set | ||
| CloseableIterable.transform(CloseableIterable.concat(deleteRecords), Record::copy), | ||
| deleteSchema.asStruct()); | ||
|
|
||
| Predicate<T> predicate = record -> deleteSet.contains(projectRow.wrap(asStructLike(record))); | ||
| deleteSetFilters.add(predicate); | ||
| } | ||
|
|
||
| Filter<T> findDeleteRows = new ChainOrFilter<>(deleteSetFilters); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need an extra class for this? This seems to be achievable via something like
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We've removed the |
||
| return findDeleteRows.filter(records); | ||
| } | ||
|
|
||
| protected CloseableIterable<T> applyEqDeletes(CloseableIterable<T> records) { | ||
| if (eqDeletes.isEmpty()) { | ||
| return records; | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,41 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| package org.apache.iceberg.actions; | ||
|
|
||
| import java.util.List; | ||
| import org.apache.iceberg.DeleteFile; | ||
|
|
||
| public class DeleteRewriteActionResult { | ||
| private List<DeleteFile> posDeletes; | ||
| private List<DeleteFile> eqDeletes; | ||
|
|
||
| public DeleteRewriteActionResult(List<DeleteFile> eqDeletes, List<DeleteFile> posDeletes) { | ||
| this.eqDeletes = eqDeletes; | ||
| this.posDeletes = posDeletes; | ||
| } | ||
|
|
||
| public List<DeleteFile> deletedFiles() { | ||
| return eqDeletes; | ||
| } | ||
|
|
||
| public List<DeleteFile> addedFiles() { | ||
| return posDeletes; | ||
| } | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Before we start the replacing equality deletes with position deletes, I think we need to refactor the RewriteFiles API to adjust more cases:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That makes sense to me. I think we could parallelize the API refactoring and the implementation.