Spark: Remove deletefiles when expiring snapshots. #2518
Changes from all commits
ManifestFileBean.java:

@@ -27,6 +27,7 @@ public class ManifestFileBean implements ManifestFile {
   private String path = null;
   private Long length = null;
   private Integer partitionSpecId = null;
+  private Integer content = null;
   private Long addedSnapshotId = null;
Contributor: why not just use …
@@ -61,6 +62,14 @@ public void setAddedSnapshotId(Long addedSnapshotId) {
     this.addedSnapshotId = addedSnapshotId;
   }

+  public Integer getContent() {
+    return content;
+  }
+
+  public void setContent(Integer content) {
+    this.content = content;
+  }
+
   @Override
   public String path() {
     return path;
@@ -78,7 +87,8 @@ public int partitionSpecId() {

   @Override
   public ManifestContent content() {
-    return ManifestContent.DATA;
+    return (content != null && content == ManifestContent.DELETES.id()) ?
+        ManifestContent.DELETES : ManifestContent.DATA;
   }

   @Override
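The ternary above keys off the numeric ManifestContent id and falls back to DATA when no content value was loaded, which keeps older metadata (and the pre-change behavior) working. A minimal standalone sketch of that resolution logic, using a hypothetical resolve helper rather than the bean itself:

import org.apache.iceberg.ManifestContent;

public class ContentIdDemo {
  // Hypothetical helper mirroring the ternary in ManifestFileBean.content():
  // null, or any id other than DELETES, resolves to DATA.
  static ManifestContent resolve(Integer content) {
    return (content != null && content == ManifestContent.DELETES.id())
        ? ManifestContent.DELETES : ManifestContent.DATA;
  }

  public static void main(String[] args) {
    System.out.println(resolve(null));                         // DATA (no content column)
    System.out.println(resolve(ManifestContent.DATA.id()));    // DATA
    System.out.println(resolve(ManifestContent.DELETES.id())); // DELETES
  }
}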
Second file (the Spark action base class that defines buildValidDataFileDF and ReadManifest):
@@ -25,6 +25,7 @@
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.function.Supplier;
 import org.apache.iceberg.BaseTable;
+import org.apache.iceberg.ManifestContent;
 import org.apache.iceberg.ManifestFiles;
 import org.apache.iceberg.MetadataTableType;
 import org.apache.iceberg.Snapshot;
@@ -45,6 +46,7 @@
 import org.apache.iceberg.spark.SparkUtil;
 import org.apache.spark.SparkContext;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.broadcast.Broadcast;
 import org.apache.spark.sql.DataFrameReader;
@@ -152,14 +154,17 @@ protected Table newStaticTable(TableMetadata metadata, FileIO io) {
   protected Dataset<Row> buildValidDataFileDF(Table table) {
     JavaSparkContext context = new JavaSparkContext(spark.sparkContext());
     Broadcast<FileIO> ioBroadcast = context.broadcast(SparkUtil.serializableFileIO(table));
+    return loadAllManifestFileBean(table).filter((FilterFunction<ManifestFileBean>) manifest ->
+        manifest.content() == ManifestContent.DATA)
Contributor: should compare using …

Contributor: I guess == is also correct for enum
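On the == question: reference comparison is indeed safe for Java enums, since each constant is a singleton, and it never throws on null. A quick standalone illustration (not part of the PR):

import org.apache.iceberg.ManifestContent;

public class EnumEqualityDemo {
  public static void main(String[] args) {
    ManifestContent c = ManifestContent.DATA;

    // Enum constants are singletons, so == and equals() agree.
    System.out.println(c == ManifestContent.DATA);      // true
    System.out.println(c.equals(ManifestContent.DATA)); // true

    // == is null-safe; calling equals() on a null reference would NPE.
    ManifestContent unset = null;
    System.out.println(unset == ManifestContent.DATA);  // false, no exception
  }
}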
+        .flatMap(new ReadManifest(ioBroadcast), Encoders.STRING()).toDF("file_path");
+  }
-
-    Dataset<ManifestFileBean> allManifests = loadMetadataTable(table, ALL_MANIFESTS)
-        .selectExpr("path", "length", "partition_spec_id as partitionSpecId", "added_snapshot_id as addedSnapshotId")
-        .dropDuplicates("path")
-        .repartition(spark.sessionState().conf().numShufflePartitions()) // avoid adaptive execution combining tasks
-        .as(Encoders.bean(ManifestFileBean.class));
-
-    return allManifests.flatMap(new ReadManifest(ioBroadcast), Encoders.STRING()).toDF("file_path");
+
+  protected Dataset<Row> buildValidDeleteFileDF(Table table) {
+    JavaSparkContext context = new JavaSparkContext(spark.sparkContext());
+    Broadcast<FileIO> ioBroadcast = context.broadcast(SparkUtil.serializableFileIO(table));
+    return loadAllManifestFileBean(table).filter((FilterFunction<ManifestFileBean>) manifest ->
Contributor: is it worthwhile to separate the loadAllManifestFileBean into 2 passes instead of one?
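If the double scan of the all_manifests table ever mattered, one alternative (a sketch only, not what this PR does) would be to cache the bean dataset and derive both DataFrames from a single load:

// Hypothetical single-load variant, assuming ioBroadcast is in scope as above.
// cache() lets the DATA and DELETES branches reuse one scan of all_manifests.
Dataset<ManifestFileBean> manifests = loadAllManifestFileBean(table).cache();

Dataset<Row> validDataFiles = manifests
    .filter((FilterFunction<ManifestFileBean>) m -> m.content() == ManifestContent.DATA)
    .flatMap(new ReadManifest(ioBroadcast), Encoders.STRING())
    .toDF("file_path");

Dataset<Row> validDeleteFiles = manifests
    .filter((FilterFunction<ManifestFileBean>) m -> m.content() == ManifestContent.DELETES)
    .flatMap(new ReadManifest(ioBroadcast), Encoders.STRING())
    .toDF("file_path");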
+        manifest.content() == ManifestContent.DELETES)
Contributor: should compare using …
+        .flatMap(new ReadManifest(ioBroadcast), Encoders.STRING()).toDF("file_path");
   }

   protected Dataset<Row> buildManifestFileDF(Table table) {
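A side note on the (FilterFunction<ManifestFileBean>) cast used in both helpers: Dataset.filter is overloaded (Column, SQL string, Scala function, and the Java FilterFunction), so from Java a bare lambda can be ambiguous, and the cast pins the intended overload. A minimal illustration, assuming a hypothetical Dataset<String> ds:

// The cast selects the FilterFunction overload of Dataset.filter; without it,
// javac cannot always decide between the Scala and Java functional variants.
Dataset<String> nonEmpty = ds.filter((FilterFunction<String>) s -> !s.isEmpty());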
@@ -190,6 +195,15 @@ protected Dataset<Row> buildValidMetadataFileDF(Table table, TableOperations ops) {
       .orNoop()
       .build();

+  private Dataset<ManifestFileBean> loadAllManifestFileBean(Table table) {
+    return loadMetadataTable(table, ALL_MANIFESTS)
+        .selectExpr("path", "length", "partition_spec_id as partitionSpecId", "content",
+            "added_snapshot_id as addedSnapshotId")
+        .dropDuplicates("path")
+        .repartition(spark.sessionState().conf().numShufflePartitions()) // avoid adaptive execution combining tasks
+        .as(Encoders.bean(ManifestFileBean.class));
+  }
+
   private Dataset<Row> loadCatalogMetadataTable(String tableName, MetadataTableType type) {
     Preconditions.checkArgument(!LOAD_CATALOG.isNoop(), "Cannot find Spark3Util class but Spark3 is in use");
     return LOAD_CATALOG.asStatic().invoke(spark, tableName, type);
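For context on why the selectExpr aliases matter: Encoders.bean binds columns to bean properties by name, so the new content column only reaches the bean because the matching getContent/setContent pair was added above. A small self-contained sketch with a hypothetical FileBean stand-in (not the real ManifestFileBean):

import java.io.Serializable;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class BeanBindingDemo {
  // Hypothetical stand-in: Encoders.bean discovers properties from getter/setter
  // pairs, so each selected column name must match a property name exactly.
  public static class FileBean implements Serializable {
    private String path;
    private Integer content;

    public String getPath() { return path; }
    public void setPath(String path) { this.path = path; }
    public Integer getContent() { return content; }
    public void setContent(Integer content) { this.content = content; }
  }

  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[1]").appName("bean-binding-demo").getOrCreate();

    // Aliased columns bind by name, mirroring the selectExpr call above.
    Dataset<FileBean> beans = spark
        .sql("SELECT 'm1.avro' AS path, 1 AS content")
        .as(Encoders.bean(FileBean.class));

    for (FileBean b : beans.collectAsList()) {
      System.out.println(b.getPath() + " -> " + b.getContent());
    }
    spark.stop();
  }
}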
@@ -235,7 +249,13 @@ private static class ReadManifest implements FlatMapFunction<ManifestFileBean, String> {

   @Override
   public Iterator<String> call(ManifestFileBean manifest) {
-    return new ClosingIterator<>(ManifestFiles.readPaths(manifest, io.getValue()).iterator());
+    switch (manifest.content()) {
+      case DATA:
+        return new ClosingIterator<>(ManifestFiles.readPaths(manifest, io.getValue()).iterator());
+      case DELETES:
+        return new ClosingIterator<>(ManifestFiles.readDeleteFiles(manifest, io.getValue()).iterator());
+    }
+    throw new UnsupportedOperationException("Cannot read unknown manifest type: " + manifest.content());
   }
 }
Comment: for a public class, we should keep the existing constructor and add new ones.
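On that last point, the usual compatibility pattern is to keep the old public constructor and have it delegate to a new, wider one; a generic sketch (hypothetical Widget class, not code from this PR):

// Hypothetical illustration: evolve a public class without breaking callers.
public class Widget {
  private final String name;
  private final int version;

  // Existing public constructor stays, so current callers keep compiling.
  public Widget(String name) {
    this(name, 1); // delegates with the old default behavior
  }

  // New constructor is added alongside rather than replacing the old one.
  public Widget(String name, int version) {
    this.name = name;
    this.version = version;
  }
}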