diff --git a/api/src/main/java/org/apache/iceberg/actions/DeleteOrphanFiles.java b/api/src/main/java/org/apache/iceberg/actions/DeleteOrphanFiles.java index fcdf5299629d..4ee75a6e7990 100644 --- a/api/src/main/java/org/apache/iceberg/actions/DeleteOrphanFiles.java +++ b/api/src/main/java/org/apache/iceberg/actions/DeleteOrphanFiles.java @@ -23,9 +23,9 @@ import java.util.function.Consumer; /** - * An action that deletes orphan files in a table. + * An action that deletes orphan metadata, data and delete files in a table. *

- * A metadata or data file is considered orphan if it is not reachable by any valid snapshot. + * A file is considered an orphan if it is not reachable by any valid snapshot. * The set of actual files is built by listing the underlying storage which makes this operation * expensive. */ diff --git a/api/src/main/java/org/apache/iceberg/actions/ExpireSnapshots.java b/api/src/main/java/org/apache/iceberg/actions/ExpireSnapshots.java index a08dcf8a2101..2c7d274cbf63 100644 --- a/api/src/main/java/org/apache/iceberg/actions/ExpireSnapshots.java +++ b/api/src/main/java/org/apache/iceberg/actions/ExpireSnapshots.java @@ -65,12 +65,12 @@ public interface ExpireSnapshots extends Action - * Manifest files that are no longer used by valid snapshots will be deleted. Data files that were - * deleted by snapshots that are expired will be deleted. + * Manifest files that are no longer used by valid snapshots will be deleted. Content files that were + * marked as logically deleted by snapshots that are expired will be deleted as well. *

- * If this method is not called, unnecessary manifests and data files will still be deleted. + * If this method is not called, unnecessary manifests and content files will still be deleted. *

* Identical to {@link org.apache.iceberg.ExpireSnapshots#deleteWith(Consumer)} * @@ -80,9 +80,9 @@ public interface ExpireSnapshots extends Action deleteFunc); /** - * Passes an alternative executor service that will be used for manifests and data files deletion. + * Passes an alternative executor service that will be used for the deletion of manifests, data files and delete files. *

- * If this method is not called, unnecessary manifests and data files will still be deleted in + * If this method is not called, unnecessary manifests and content files will still be deleted in * the current thread. *

* Identical to {@link org.apache.iceberg.ExpireSnapshots#executeDeleteWith(ExecutorService)} diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java index e316dfb81c11..691807a61ae8 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java @@ -61,8 +61,8 @@ import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; /** - * An action that removes orphan metadata and data files by listing a given location and comparing - * the actual files in that location with data and metadata files referenced by all valid snapshots. + * An action that removes orphan metadata, data and delete files by listing a given location and comparing + * the actual files in that location with content and metadata files referenced by all valid snapshots. * The location must be accessible for listing via the Hadoop {@link FileSystem}. *

* By default, this action cleans up the table location returned by {@link Table#location()} and @@ -162,9 +162,9 @@ private String jobDesc() { } private DeleteOrphanFiles.Result doExecute() { - Dataset validDataFileDF = buildValidDataFileDF(table); + Dataset validContentFileDF = buildValidContentFileDF(table); Dataset validMetadataFileDF = buildValidMetadataFileDF(table); - Dataset validFileDF = validDataFileDF.union(validMetadataFileDF); + Dataset validFileDF = validContentFileDF.union(validMetadataFileDF); Dataset actualFileDF = buildActualFileDF(); Column actualFileName = filenameUDF.apply(actualFileDF.col("file_path")); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java index 6534617d2dec..e251d7891c65 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java @@ -59,7 +59,7 @@ public class BaseDeleteReachableFilesSparkAction extends BaseSparkAction implements DeleteReachableFiles { private static final Logger LOG = LoggerFactory.getLogger(BaseDeleteReachableFilesSparkAction.class); - private static final String DATA_FILE = "Data File"; + private static final String CONTENT_FILE = "Content File"; private static final String MANIFEST = "Manifest"; private static final String MANIFEST_LIST = "Manifest List"; private static final String OTHERS = "Others"; @@ -138,7 +138,7 @@ private Dataset projectFilePathWithType(Dataset ds, String type) { private Dataset buildValidFileDF(TableMetadata metadata) { Table staticTable = newStaticTable(metadata, io); - return projectFilePathWithType(buildValidDataFileDF(staticTable), DATA_FILE) + return projectFilePathWithType(buildValidContentFileDF(staticTable), CONTENT_FILE) 
.union(projectFilePathWithType(buildManifestFileDF(staticTable), MANIFEST)) .union(projectFilePathWithType(buildManifestListDF(staticTable), MANIFEST_LIST)) .union(projectFilePathWithType(buildOtherMetadataFileDF(staticTable), OTHERS)); @@ -177,9 +177,9 @@ private BaseDeleteReachableFilesActionResult deleteFiles(Iterator deleted) String type = fileInfo.getString(1); removeFunc.accept(file); switch (type) { - case DATA_FILE: + case CONTENT_FILE: dataFileCount.incrementAndGet(); - LOG.trace("Deleted Data File: {}", file); + LOG.trace("Deleted Content File: {}", file); break; case MANIFEST: manifestCount.incrementAndGet(); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java index e0f38f7e1fe6..75416530b424 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java @@ -58,7 +58,7 @@ *

* This action first leverages {@link org.apache.iceberg.ExpireSnapshots} to expire snapshots and then * uses metadata tables to find files that can be safely deleted. This is done by anti-joining two Datasets - * that contain all manifest and data files before and after the expiration. The snapshot expiration + * that contain all manifest and content files before and after the expiration. The snapshot expiration * will be fully committed before any deletes are issued. *

* This operation performs a shuffle so the parallelism can be controlled through 'spark.sql.shuffle.partitions'. @@ -70,7 +70,7 @@ public class BaseExpireSnapshotsSparkAction extends BaseSparkAction implements ExpireSnapshots { private static final Logger LOG = LoggerFactory.getLogger(BaseExpireSnapshotsSparkAction.class); - private static final String DATA_FILE = "Data File"; + private static final String CONTENT_FILE = "Content File"; private static final String MANIFEST = "Manifest"; private static final String MANIFEST_LIST = "Manifest List"; @@ -226,7 +226,7 @@ private Dataset appendTypeString(Dataset ds, String type) { private Dataset buildValidFileDF(TableMetadata metadata) { Table staticTable = newStaticTable(metadata, this.table.io()); - return appendTypeString(buildValidDataFileDF(staticTable), DATA_FILE) + return appendTypeString(buildValidContentFileDF(staticTable), CONTENT_FILE) .union(appendTypeString(buildManifestFileDF(staticTable), MANIFEST)) .union(appendTypeString(buildManifestListDF(staticTable), MANIFEST_LIST)); } @@ -255,9 +255,9 @@ private BaseExpireSnapshotsActionResult deleteFiles(Iterator expired) { String type = fileInfo.getString(1); deleteFunc.accept(file); switch (type) { - case DATA_FILE: + case CONTENT_FILE: dataFileCount.incrementAndGet(); - LOG.trace("Deleted Data File: {}", file); + LOG.trace("Deleted Content File: {}", file); break; case MANIFEST: manifestCount.incrementAndGet(); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java index 42c54679b669..ff54b693a873 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java @@ -111,7 +111,8 @@ protected Table newStaticTable(TableMetadata metadata, FileIO io) { return new BaseTable(ops, metadataFileLocation); } - protected 
Dataset buildValidDataFileDF(Table table) { + // builds a DF of delete and data file locations by reading all manifests + protected Dataset buildValidContentFileDF(Table table) { JavaSparkContext context = JavaSparkContext.fromSparkContext(spark.sparkContext()); Broadcast ioBroadcast = context.broadcast(SparkUtil.serializableFileIO(table));