-
Notifications
You must be signed in to change notification settings - Fork 3k
Core: add delete option for bin-packing #3454
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,6 +33,7 @@ | |
| import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Iterables; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Lists; | ||
| import org.junit.Assert; | ||
| import org.junit.Test; | ||
| import org.junit.runner.RunWith; | ||
|
|
@@ -107,6 +108,22 @@ public void testFilteringCustomMinMaxFileSize() { | |
| Assert.assertEquals("Should remove files that exceed or are smaller than new bounds", expectedFiles, filtered); | ||
| } | ||
|
|
||
| @Test | ||
| public void testFilteringWithDeletes() { | ||
| RewriteStrategy strategy = defaultBinPack().options(ImmutableMap.of( | ||
| BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(550 * MB), | ||
| BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(490 * MB), | ||
| BinPackStrategy.MIN_DELETES_PER_FILE, Integer.toString(2) | ||
| )); | ||
|
|
||
| List<FileScanTask> testFiles = filesOfSize(500, 500, 480, 480, 560, 520); | ||
| testFiles.add(MockFileScanTask.mockTaskWithDeletes(500 * MB, 2)); | ||
| Iterable<FileScanTask> expectedFiles = filesOfSize(480, 480, 560, 500); | ||
| Iterable<FileScanTask> filtered = ImmutableList.copyOf(strategy.selectFilesToRewrite(testFiles)); | ||
|
|
||
| Assert.assertEquals("Should include file with deletes", expectedFiles, filtered); | ||
| } | ||
|
|
||
| @Test | ||
| public void testGroupingMinInputFilesInvalid() { | ||
| RewriteStrategy strategy = defaultBinPack().options(ImmutableMap.of( | ||
|
|
@@ -150,6 +167,23 @@ public void testGroupingMinInputFilesValid() { | |
| ImmutableList.of(testFiles), grouped); | ||
| } | ||
|
|
||
| @Test | ||
| public void testGroupingWithDeletes() { | ||
| RewriteStrategy strategy = defaultBinPack().options(ImmutableMap.of( | ||
| BinPackStrategy.MIN_INPUT_FILES, Integer.toString(5), | ||
| BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(550 * MB), | ||
| BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(490 * MB), | ||
| BinPackStrategy.MIN_DELETES_PER_FILE, Integer.toString(2) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @kbendick, can you check our checkstyle config? It looks like lines that have incorrect indentation are getting through. |
||
| )); | ||
|
|
||
| List<FileScanTask> testFiles = Lists.newArrayList(); | ||
| testFiles.add(MockFileScanTask.mockTaskWithDeletes(500 * MB, 2)); | ||
| Iterable<List<FileScanTask>> grouped = strategy.planFileGroups(testFiles); | ||
|
|
||
| Assert.assertEquals("Should plan 1 groups since there are enough input files", | ||
| ImmutableList.of(testFiles), grouped); | ||
| } | ||
|
|
||
| @Test | ||
| public void testMaxGroupSize() { | ||
| RewriteStrategy strategy = defaultBinPack().options(ImmutableMap.of( | ||
|
|
@@ -196,12 +230,18 @@ public void testInvalidOptions() { | |
| BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(1000 * MB))); | ||
| }); | ||
|
|
||
| AssertHelpers.assertThrows("Should not allow min input size smaller tha 1", | ||
| AssertHelpers.assertThrows("Should not allow min input size smaller than 1", | ||
| IllegalArgumentException.class, () -> { | ||
| defaultBinPack().options(ImmutableMap.of( | ||
| BinPackStrategy.MIN_INPUT_FILES, Long.toString(-5))); | ||
| }); | ||
|
|
||
| AssertHelpers.assertThrows("Should not allow min deletes per file smaller than 1", | ||
| IllegalArgumentException.class, () -> { | ||
| defaultBinPack().options(ImmutableMap.of( | ||
| BinPackStrategy.MIN_DELETES_PER_FILE, Long.toString(-5))); | ||
| }); | ||
|
|
||
| AssertHelpers.assertThrows("Should not allow negative target size", | ||
| IllegalArgumentException.class, () -> { | ||
| defaultBinPack().options(ImmutableMap.of( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,14 +26,18 @@ | |
| import java.util.Map; | ||
| import java.util.Random; | ||
| import java.util.Set; | ||
| import java.util.UUID; | ||
| import java.util.stream.Collectors; | ||
| import java.util.stream.IntStream; | ||
| import java.util.stream.Stream; | ||
| import org.apache.hadoop.conf.Configuration; | ||
| import org.apache.iceberg.AssertHelpers; | ||
| import org.apache.iceberg.ContentFile; | ||
| import org.apache.iceberg.DataFile; | ||
| import org.apache.iceberg.FileFormat; | ||
| import org.apache.iceberg.FileScanTask; | ||
| import org.apache.iceberg.PartitionSpec; | ||
| import org.apache.iceberg.RowDelta; | ||
| import org.apache.iceberg.Schema; | ||
| import org.apache.iceberg.SortOrder; | ||
| import org.apache.iceberg.StructLike; | ||
|
|
@@ -46,6 +50,12 @@ | |
| import org.apache.iceberg.actions.RewriteDataFilesCommitManager; | ||
| import org.apache.iceberg.actions.RewriteFileGroup; | ||
| import org.apache.iceberg.actions.SortStrategy; | ||
| import org.apache.iceberg.data.GenericAppenderFactory; | ||
| import org.apache.iceberg.data.Record; | ||
| import org.apache.iceberg.deletes.PositionDeleteWriter; | ||
| import org.apache.iceberg.encryption.EncryptedFiles; | ||
| import org.apache.iceberg.encryption.EncryptedOutputFile; | ||
| import org.apache.iceberg.encryption.EncryptionKeyMetadata; | ||
| import org.apache.iceberg.exceptions.CommitStateUnknownException; | ||
| import org.apache.iceberg.expressions.Expressions; | ||
| import org.apache.iceberg.hadoop.HadoopTables; | ||
|
|
@@ -177,6 +187,59 @@ public void testBinPackWithFilter() { | |
| assertEquals("Rows must match", expectedRecords, actualRecords); | ||
| } | ||
|
|
||
| @Test | ||
| public void testBinPackWithDeletes() throws Exception { | ||
| Table table = createTablePartitioned(4, 2); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: Would it be worth while to add inline comments to explain these parameters (like
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is not needed, because you can see the meaning of the parameters in Intellij |
||
| table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); | ||
| shouldHaveFiles(table, 8); | ||
| table.refresh(); | ||
|
|
||
| CloseableIterable<FileScanTask> tasks = table.newScan().planFiles(); | ||
| List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); | ||
| GenericAppenderFactory appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), | ||
| null, null, null); | ||
| int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum(); | ||
|
|
||
| RowDelta rowDelta = table.newRowDelta(); | ||
| // remove 2 rows for odd files, 1 row for even files | ||
| for (int i = 0; i < dataFiles.size(); i++) { | ||
| DataFile dataFile = dataFiles.get(i); | ||
| EncryptedOutputFile outputFile = EncryptedFiles.encryptedOutput( | ||
| table.io().newOutputFile(table.locationProvider().newDataLocation(UUID.randomUUID().toString())), | ||
| EncryptionKeyMetadata.EMPTY); | ||
| PositionDeleteWriter<Record> posDeleteWriter = appenderFactory.newPosDeleteWriter( | ||
| outputFile, FileFormat.PARQUET, dataFile.partition()); | ||
| posDeleteWriter.delete(dataFile.path(), 0); | ||
| posDeleteWriter.close(); | ||
| rowDelta.addDeletes(posDeleteWriter.toDeleteFile()); | ||
|
|
||
| if (i % 2 != 0) { | ||
| outputFile = EncryptedFiles.encryptedOutput( | ||
| table.io().newOutputFile(table.locationProvider().newDataLocation(UUID.randomUUID().toString())), | ||
| EncryptionKeyMetadata.EMPTY); | ||
| posDeleteWriter = appenderFactory.newPosDeleteWriter(outputFile, FileFormat.PARQUET, dataFile.partition()); | ||
| posDeleteWriter.delete(dataFile.path(), 1); | ||
| posDeleteWriter.close(); | ||
| rowDelta.addDeletes(posDeleteWriter.toDeleteFile()); | ||
| } | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know there are some repeated code here for generating deletes. So far I am still not sure what is the correct boundary to create util methods. I am planning to refactor after I add more tests for the |
||
| } | ||
|
|
||
| rowDelta.commit(); | ||
| table.refresh(); | ||
| List<Object[]> expectedRecords = currentData(); | ||
| Result result = actions().rewriteDataFiles(table) | ||
| .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, "0") | ||
| .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1)) | ||
| .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE)) | ||
| .option(BinPackStrategy.MIN_DELETES_PER_FILE, "4") | ||
| .execute(); | ||
| Assert.assertEquals("Action should rewrite 4 data files", 4, result.rewrittenDataFilesCount()); | ||
|
|
||
| List<Object[]> actualRecords = currentData(); | ||
| assertEquals("Rows must match", expectedRecords, actualRecords); | ||
| Assert.assertEquals("12 rows are removed", total - 12, actualRecords.size()); | ||
| } | ||
|
|
||
| @Test | ||
| public void testRewriteLargeTableHasResiduals() { | ||
| PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).build(); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What value is recommended to the user? Or how does the user compute the proper value? I'm thinking of maybe adding an option that counts a file group as valid if any file in it contains deletes, because you don't know how many records an equality delete matches — for example, deleting a set of records in an area/province.
Nit: This is disabled by default, but the comparison is still always performed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's fine to just base this on the number of delete files, since the read penalty is directly related to the number of files and less so to the number of rows actually deleted.
No strong feeling on the default since we already have the amount of delete files in the task information so I don't think the check is very expensive
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, do we need separate settings for equality deletes and position deletes, since their read penalties are different?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think that is equivalent to setting this value to 0?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the exact value to set depends on the user's tolerance for degraded read performance: more deletes mean worse read performance and potentially running out of memory, so users can tune this value based on their system requirements.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm, that's it.