diff --git a/api/src/main/java/org/apache/iceberg/actions/DeleteOrphanFiles.java b/api/src/main/java/org/apache/iceberg/actions/DeleteOrphanFiles.java
index fcdf5299629d..4ee75a6e7990 100644
--- a/api/src/main/java/org/apache/iceberg/actions/DeleteOrphanFiles.java
+++ b/api/src/main/java/org/apache/iceberg/actions/DeleteOrphanFiles.java
@@ -23,9 +23,9 @@
import java.util.function.Consumer;
/**
- * An action that deletes orphan files in a table.
+ * An action that deletes orphan metadata, data and delete files in a table.
*
- * A metadata or data file is considered orphan if it is not reachable by any valid snapshot.
+ * A file is considered orphan if it is not reachable by any valid snapshot.
* The set of actual files is built by listing the underlying storage which makes this operation
* expensive.
*/
diff --git a/api/src/main/java/org/apache/iceberg/actions/ExpireSnapshots.java b/api/src/main/java/org/apache/iceberg/actions/ExpireSnapshots.java
index a08dcf8a2101..2c7d274cbf63 100644
--- a/api/src/main/java/org/apache/iceberg/actions/ExpireSnapshots.java
+++ b/api/src/main/java/org/apache/iceberg/actions/ExpireSnapshots.java
@@ -65,12 +65,12 @@ public interface ExpireSnapshots extends Action<ExpireSnapshots, ExpireSnapshots.Result> {
- * Manifest files that are no longer used by valid snapshots will be deleted. Data files that were
- * deleted by snapshots that are expired will be deleted.
+ * Manifest files that are no longer used by valid snapshots will be deleted. Content files that were
+ * marked as logically deleted by snapshots that are expired will be deleted as well.
*
- * If this method is not called, unnecessary manifests and data files will still be deleted.
+ * If this method is not called, unnecessary manifests and content files will still be deleted.
*
* Identical to {@link org.apache.iceberg.ExpireSnapshots#deleteWith(Consumer)}
*
@@ -80,9 +80,9 @@ public interface ExpireSnapshots extends Action<ExpireSnapshots, ExpireSnapshots.Result> {
   ExpireSnapshots deleteWith(Consumer<String> deleteFunc);
/**
- * Passes an alternative executor service that will be used for manifests and data files deletion.
+ * Passes an alternative executor service that will be used for deletion of manifests, data and delete files.
*
- * If this method is not called, unnecessary manifests and data files will still be deleted in
+ * If this method is not called, unnecessary manifests and content files will still be deleted in
* the current thread.
*
* Identical to {@link org.apache.iceberg.ExpireSnapshots#executeDeleteWith(ExecutorService)}
diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java
index e316dfb81c11..691807a61ae8 100644
--- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java
+++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java
@@ -61,8 +61,8 @@
import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT;
/**
- * An action that removes orphan metadata and data files by listing a given location and comparing
- * the actual files in that location with data and metadata files referenced by all valid snapshots.
+ * An action that removes orphan metadata, data and delete files by listing a given location and comparing
+ * the actual files in that location with content and metadata files referenced by all valid snapshots.
* The location must be accessible for listing via the Hadoop {@link FileSystem}.
*
* By default, this action cleans up the table location returned by {@link Table#location()} and
@@ -162,9 +162,9 @@ private String jobDesc() {
}
private DeleteOrphanFiles.Result doExecute() {
-    Dataset<Row> validDataFileDF = buildValidDataFileDF(table);
+    Dataset<Row> validContentFileDF = buildValidContentFileDF(table);
     Dataset<Row> validMetadataFileDF = buildValidMetadataFileDF(table);
-    Dataset<Row> validFileDF = validDataFileDF.union(validMetadataFileDF);
+    Dataset<Row> validFileDF = validContentFileDF.union(validMetadataFileDF);
     Dataset<Row> actualFileDF = buildActualFileDF();
Column actualFileName = filenameUDF.apply(actualFileDF.col("file_path"));
diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java
index 6534617d2dec..e251d7891c65 100644
--- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java
+++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java
@@ -59,7 +59,7 @@ public class BaseDeleteReachableFilesSparkAction
    extends BaseSparkAction<DeleteReachableFiles, DeleteReachableFiles.Result> implements DeleteReachableFiles {
private static final Logger LOG = LoggerFactory.getLogger(BaseDeleteReachableFilesSparkAction.class);
- private static final String DATA_FILE = "Data File";
+ private static final String CONTENT_FILE = "Content File";
private static final String MANIFEST = "Manifest";
private static final String MANIFEST_LIST = "Manifest List";
private static final String OTHERS = "Others";
@@ -138,7 +138,7 @@ private Dataset<Row> projectFilePathWithType(Dataset<Row> ds, String type) {
   private Dataset<Row> buildValidFileDF(TableMetadata metadata) {
Table staticTable = newStaticTable(metadata, io);
- return projectFilePathWithType(buildValidDataFileDF(staticTable), DATA_FILE)
+ return projectFilePathWithType(buildValidContentFileDF(staticTable), CONTENT_FILE)
.union(projectFilePathWithType(buildManifestFileDF(staticTable), MANIFEST))
.union(projectFilePathWithType(buildManifestListDF(staticTable), MANIFEST_LIST))
.union(projectFilePathWithType(buildOtherMetadataFileDF(staticTable), OTHERS));
@@ -177,9 +177,9 @@ private BaseDeleteReachableFilesActionResult deleteFiles(Iterator<Row> deleted) {
String type = fileInfo.getString(1);
removeFunc.accept(file);
switch (type) {
- case DATA_FILE:
+ case CONTENT_FILE:
dataFileCount.incrementAndGet();
- LOG.trace("Deleted Data File: {}", file);
+ LOG.trace("Deleted Content File: {}", file);
break;
case MANIFEST:
manifestCount.incrementAndGet();
diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java
index e0f38f7e1fe6..75416530b424 100644
--- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java
+++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java
@@ -58,7 +58,7 @@
*
* This action first leverages {@link org.apache.iceberg.ExpireSnapshots} to expire snapshots and then
* uses metadata tables to find files that can be safely deleted. This is done by anti-joining two Datasets
- * that contain all manifest and data files before and after the expiration. The snapshot expiration
+ * that contain all manifest and content files before and after the expiration. The snapshot expiration
* will be fully committed before any deletes are issued.
*
* This operation performs a shuffle so the parallelism can be controlled through 'spark.sql.shuffle.partitions'.
@@ -70,7 +70,7 @@ public class BaseExpireSnapshotsSparkAction
    extends BaseSparkAction<ExpireSnapshots, ExpireSnapshots.Result> implements ExpireSnapshots {
private static final Logger LOG = LoggerFactory.getLogger(BaseExpireSnapshotsSparkAction.class);
- private static final String DATA_FILE = "Data File";
+ private static final String CONTENT_FILE = "Content File";
private static final String MANIFEST = "Manifest";
private static final String MANIFEST_LIST = "Manifest List";
@@ -226,7 +226,7 @@ private Dataset<Row> appendTypeString(Dataset<Row> ds, String type) {
   private Dataset<Row> buildValidFileDF(TableMetadata metadata) {
Table staticTable = newStaticTable(metadata, this.table.io());
- return appendTypeString(buildValidDataFileDF(staticTable), DATA_FILE)
+ return appendTypeString(buildValidContentFileDF(staticTable), CONTENT_FILE)
.union(appendTypeString(buildManifestFileDF(staticTable), MANIFEST))
.union(appendTypeString(buildManifestListDF(staticTable), MANIFEST_LIST));
}
@@ -255,9 +255,9 @@ private BaseExpireSnapshotsActionResult deleteFiles(Iterator<Row> expired) {
String type = fileInfo.getString(1);
deleteFunc.accept(file);
switch (type) {
- case DATA_FILE:
+ case CONTENT_FILE:
dataFileCount.incrementAndGet();
- LOG.trace("Deleted Data File: {}", file);
+ LOG.trace("Deleted Content File: {}", file);
break;
case MANIFEST:
manifestCount.incrementAndGet();
diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java
index 42c54679b669..ff54b693a873 100644
--- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java
+++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java
@@ -111,7 +111,8 @@ protected Table newStaticTable(TableMetadata metadata, FileIO io) {
return new BaseTable(ops, metadataFileLocation);
}
-  protected Dataset<Row> buildValidDataFileDF(Table table) {
+  // builds a DF of delete and data file locations by reading all manifests
+  protected Dataset<Row> buildValidContentFileDF(Table table) {
     JavaSparkContext context = JavaSparkContext.fromSparkContext(spark.sparkContext());
     Broadcast<FileIO> ioBroadcast = context.broadcast(SparkUtil.serializableFileIO(table));