Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 187 additions & 1 deletion core/src/main/java/org/apache/iceberg/RewriteTablePathUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,24 @@ public Set<Pair<String, String>> copyPlan() {
}
}

public static class RewrittenFileInfo implements Serializable {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this is in iceberg core and this class is only used in static class for record and in SparkAction, I am wondering if we want to define it here.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could return it from rewrite*ManifestWithResult and maybe use it in fixes for delete files later

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed it altogether.

private final String newPath;
private final long newSize;

public RewrittenFileInfo(String newPath, long newSize) {
this.newPath = newPath;
this.newSize = newSize;
}

public String getNewPath() {
return newPath;
}

public long getNewSize() {
return newSize;
}
}

/**
* Create a new table metadata object, replacing path references
*
Expand Down Expand Up @@ -226,7 +244,9 @@ private static List<Snapshot> updatePathInSnapshots(
* @param outputPath location to write the manifest list
* @return a copy plan for manifest files whose metadata were contained in the rewritten manifest
* list
* @deprecated since 1.10.0, will be removed in 1.11.0
*/
@Deprecated
public static RewriteResult<ManifestFile> rewriteManifestList(
Snapshot snapshot,
FileIO io,
Expand Down Expand Up @@ -274,6 +294,65 @@ public static RewriteResult<ManifestFile> rewriteManifestList(
}
}

/**
* Rewrite a manifest list representing a snapshot, replacing path references.
* @param snapshot snapshot represented by the manifest list
* @param io file io
* @param tableMetadata metadata of table
* @param rewrittenManifests information about rewritten manifest files
* @param sourcePrefix source prefix that will be replaced
* @param targetPrefix target prefix that will replace it
* @param stagingDir staging directory
* @param outputPath location to write the manifest list
*/
public static void rewriteManifestList(
Snapshot snapshot,
FileIO io,
TableMetadata tableMetadata,
Map<String, RewrittenFileInfo> rewrittenManifests,
String sourcePrefix,
String targetPrefix,
String stagingDir,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this stagingDir is no longer used as now path come from rewrittenManifests.get(file.path()).getNewPath()

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed.

String outputPath) {
OutputFile outputFile = io.newOutputFile(outputPath);

List<ManifestFile> manifestFiles = manifestFilesInSnapshot(io, snapshot);
manifestFiles.forEach(
mf ->
Preconditions.checkArgument(
mf.path().startsWith(sourcePrefix),
"Encountered manifest file %s not under the source prefix %s",
mf.path(),
sourcePrefix));

try (FileAppender<ManifestFile> writer =
ManifestLists.write(
tableMetadata.formatVersion(),
outputFile,
snapshot.snapshotId(),
snapshot.parentId(),
snapshot.sequenceNumber(),
snapshot.firstRowId())) {

for (ManifestFile file : manifestFiles) {
ManifestFile newFile = file.copy();

if (rewrittenManifests.containsKey(file.path())) {
String rewrittenPath = rewrittenManifests.get(file.path()).getNewPath();
long rewrittenSize = rewrittenManifests.get(file.path()).getNewSize();
((StructLike) newFile).set(0, rewrittenPath);
((StructLike) newFile).set(1, rewrittenSize);
} else {
((StructLike) newFile).set(0, newPath(newFile.path(), sourcePrefix, targetPrefix));
}
writer.add(newFile);
}
} catch (IOException e) {
throw new UncheckedIOException(
"Failed to rewrite the manifest list file " + snapshot.manifestListLocation(), e);
}
}

private static List<ManifestFile> manifestFilesInSnapshot(FileIO io, Snapshot snapshot) {
String path = snapshot.manifestListLocation();
List<ManifestFile> manifestFiles = Lists.newLinkedList();
Expand Down Expand Up @@ -333,7 +412,9 @@ public static RewriteResult<DataFile> rewriteDataManifest(
* @param sourcePrefix source prefix that will be replaced
* @param targetPrefix target prefix that will replace it
* @return a copy plan of content files in the manifest that was rewritten
* @deprecated since 1.10.0, will be removed in 1.11.0
*/
@Deprecated
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this method is added in the upcoming 1.10. should we just remove this method directly? what do you think @dramaticlly ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@vaultah normally we will need to go through the deprecation cycle for public method, but as Steven suggested, both of these methods (LINE 398 and LINE 521) are part of 62d9ff5 which never gets released. So we can actually remove these directly.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That method is being used by RewriteTablePathSparkAction in Spark 3.4 and Spark 3.5, so we can't remove it unless we backport these changes to Spark 3.4 and Spark 3.5 in the same (this) PR. Same for rewriteDataManifest and rewriteDeleteManifest methods below. Let me know how you want to proceed

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am fine with doing the backport change in this same PR. it is not a super large PR anyway.

Copy link
Contributor

@dramaticlly dramaticlly Aug 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it's my fault as I initially asked to separate the 3.4 and 3.5 changes to facilitate reviews, but I think as we get closer with more in-depth reviews. Doing Spark 3.4 & 3.5 backport together in this PR helps

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

initially, it is our recommended practice to separate out 3.4 and 3.5 changes to a separate backport PR.

Right now, it is more important to avoid carrying over debts of deprecated APIs when we can for unreleased APIs. Hence, it is good to bring in the backport changes in the same PR, although it will make the PR bigger.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if this PR is going to take longer due to larger refactoring for the suggested bottom up approach and result classes and can't make the 1.10 release in time, we can go with the deprecation approach like here. Let's defer this deprecation decision /change as the last step.

public static RewriteResult<DataFile> rewriteDataManifest(
ManifestFile manifestFile,
Set<Long> snapshotIds,
Expand All @@ -357,6 +438,55 @@ public static RewriteResult<DataFile> rewriteDataManifest(
}
}


/**
* Rewrite a data manifest, replacing path references.
*
* @param manifestFile source manifest file to rewrite
* @param snapshotIds snapshot ids for filtering returned data manifest entries
* @param outputFile output file to rewrite manifest file to
* @param io file io
* @param format format of the manifest file
* @param specsById map of partition specs by id
* @param sourcePrefix source prefix that will be replaced
* @param targetPrefix target prefix that will replace it
* @return rewritten manifest file and a copy plan for the referenced content files
*/
public static Pair<ManifestFile, RewriteResult<DataFile>> rewriteDataManifestWithResult(
ManifestFile manifestFile,
Set<Long> snapshotIds,
OutputFile outputFile,
FileIO io,
int format,
Map<Integer, PartitionSpec> specsById,
String sourcePrefix,
String targetPrefix)
throws IOException {
PartitionSpec spec = specsById.get(manifestFile.partitionSpecId());
ManifestWriter<DataFile> writer =
ManifestFiles.write(format, spec, outputFile, manifestFile.snapshotId());
RewriteResult<DataFile> rewriteResult = null;

try (ManifestWriter<DataFile> dataManifestWriter = writer;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dataManifestWriter seems unused, I assume we want to ensure writer is closed with try resource

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used it as a dummy variable so we could close writer and have a reference to it later, after it's closed. Java 9 improves this syntax (JEP 213), and since Iceberg doesn't need to support earlier versions, I now changed it to just

try (writer;
    ManifestReader<DeleteFile> reader =
        ManifestFiles.readDeleteManifest(manifestFile, io, specsById)
        ...

ManifestReader<DataFile> reader =
ManifestFiles.read(manifestFile, io, specsById)
.select(Arrays.asList("*"))) {
rewriteResult =
StreamSupport.stream(reader.entries().spliterator(), false)
.map(
entry ->
writeDataFileEntry(
entry,
snapshotIds,
spec,
sourcePrefix,
targetPrefix,
writer))
.reduce(new RewriteResult<>(), RewriteResult::append);
}
return Pair.of(writer.toManifestFile(), rewriteResult);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like we are returning entire new manifestFile on top of list of data files in copy plan , but later in SparkAction we only use its new location and length, can we do better memory wise?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I only made it return the entire ManifestFile in an attempt to make this function more general-purpose, since it's part of public API (however specialized and niche it might be in reality). We can return RewrittenFileInfo from it though

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It now returns the size of the rewritten manifest instead of the full instance.

}

/**
* Rewrite a delete manifest, replacing path references.
*
Expand Down Expand Up @@ -411,8 +541,11 @@ public static RewriteResult<DeleteFile> rewriteDeleteManifest(
* @param targetPrefix target prefix that will replace it
* @param stagingLocation staging location for rewritten files (referred delete file will be
* rewritten here)
* @return a copy plan of content files in the manifest that was rewritten
* @return rewritten manifest file and a copy plan of content files
* in the manifest that was rewritten
* @deprecated since 1.10.0, will be removed in 1.11.0
*/
@Deprecated
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same question for direct removal?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agreed!

public static RewriteResult<DeleteFile> rewriteDeleteManifest(
ManifestFile manifestFile,
Set<Long> snapshotIds,
Expand Down Expand Up @@ -445,6 +578,59 @@ public static RewriteResult<DeleteFile> rewriteDeleteManifest(
}
}

/**
* Rewrite a delete manifest, replacing path references.
*
* @param manifestFile source delete manifest to rewrite
* @param snapshotIds snapshot ids for filtering returned delete manifest entries
* @param outputFile output file to rewrite manifest file to
* @param io file io
* @param format format of the manifest file
* @param specsById map of partition specs by id
* @param sourcePrefix source prefix that will be replaced
* @param targetPrefix target prefix that will replace it
* @param stagingLocation staging location for rewritten files (referred delete file will be
* rewritten here)
* @return rewritten manifest file and a copy plan for the referenced content files
*/
public static Pair<ManifestFile, RewriteResult<DeleteFile>> rewriteDeleteManifestWithResult(
ManifestFile manifestFile,
Set<Long> snapshotIds,
OutputFile outputFile,
FileIO io,
int format,
Map<Integer, PartitionSpec> specsById,
String sourcePrefix,
String targetPrefix,
String stagingLocation)
throws IOException {
PartitionSpec spec = specsById.get(manifestFile.partitionSpecId());
ManifestWriter<DeleteFile> writer =
ManifestFiles.writeDeleteManifest(format, spec, outputFile, manifestFile.snapshotId());
RewriteResult<DeleteFile> rewriteResult = null;

try (ManifestWriter<DeleteFile> deleteManifestWriter = writer;
ManifestReader<DeleteFile> reader =
ManifestFiles.readDeleteManifest(manifestFile, io, specsById)
.select(Arrays.asList("*"))) {
rewriteResult =
StreamSupport.stream(reader.entries().spliterator(), false)
.map(
entry ->
writeDeleteFileEntry(
entry,
snapshotIds,
spec,
sourcePrefix,
targetPrefix,
stagingLocation,
writer))
.reduce(new RewriteResult<>(), RewriteResult::append);
}

return Pair.of(writer.toManifestFile(), rewriteResult);
}

private static RewriteResult<DataFile> writeDataFileEntry(
ManifestEntry<DataFile> entry,
Set<Long> snapshotIds,
Expand Down
Loading