trinodb · raunaqmorarka · Aug 1, 2025 · May 21, 2025 · grantatspothero · Aug 1, 2025
diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMetadata.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMetadata.java
@@ -145,6 +145,7 @@
 import org.apache.iceberg.FileFormat;
 import org.apache.iceberg.FileMetadata;
 import org.apache.iceberg.FileScanTask;
+import org.apache.iceberg.IcebergManifestUtils;
 import org.apache.iceberg.IsolationLevel;
 import org.apache.iceberg.ManifestFile;
 import org.apache.iceberg.ManifestFiles;
@@ -2260,11 +2261,18 @@ private void removeOrphanFiles(Table table, ConnectorSession session, SchemaTabl
         ImmutableSet.Builder<String> validDataFileNames = ImmutableSet.builder();
 
         for (Snapshot snapshot : table.snapshots()) {
-            if (snapshot.manifestListLocation() != null) {
-                validMetadataFileNames.add(fileName(snapshot.manifestListLocation()));
+            String manifestListLocation = snapshot.manifestListLocation();
+            List<ManifestFile> allManifests;
+            if (manifestListLocation != null) {
+                validMetadataFileNames.add(fileName(manifestListLocation));
+                allManifests = loadAllManifestsFromManifestList(table, manifestListLocation);
+            }
+            else {
+                // This is to maintain support for V1 tables which have embedded manifest lists
+                allManifests = loadAllManifestsFromSnapshot(table, snapshot);
             }
 
-            for (ManifestFile manifest : loadAllManifestsFromSnapshot(table, snapshot)) {
+            for (ManifestFile manifest : allManifests) {
                 if (!processedManifestFilePaths.add(manifest.path())) {
                     // Already read this manifest
                     continue;
@@ -3469,6 +3477,21 @@ private static List<ManifestFile> loadAllManifestsFromSnapshot(Table icebergTabl
         }
     }
 
+    /**
+     * Use instead of loadAllManifestsFromSnapshot when loading manifests from multiple distinct snapshots
+     * Each BaseSnapshot object caches manifest files separately, so loading manifests from multiple distinct snapshots
+     * results in O(num_snapshots^2) copies of the same manifest file metadata in memory
+     */
+    private static List<ManifestFile> loadAllManifestsFromManifestList(Table icebergTable, String manifestListLocation)
+    {
+        try {
+            return IcebergManifestUtils.read(icebergTable.io(), manifestListLocation);
+        }
+        catch (NotFoundException | UncheckedIOException e) {
+            throw new TrinoException(ICEBERG_INVALID_METADATA, "Error accessing manifest file for table %s".formatted(icebergTable.name()), e);
+        }
+    }
+
     private static Set<Integer> identityPartitionColumnsInAllSpecs(Table table)
     {
         // Extract identity partition column source ids common to ALL specs

diff --git a/plugin/trino-iceberg/src/main/java/org/apache/iceberg/IcebergManifestUtils.java b/plugin/trino-iceberg/src/main/java/org/apache/iceberg/IcebergManifestUtils.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.iceberg;
+
+import org.apache.iceberg.io.FileIO;
+
+import java.util.List;
+
+public class IcebergManifestUtils
+{
+    private IcebergManifestUtils() {}
+
+    public static List<ManifestFile> read(FileIO fileIO, String manifestListLocation)
+    {
+        // Avoid using snapshot.allManifests() when processing multiple snapshots,
+        // as each Snapshot instance internally caches `org.apache.iceberg.BaseSnapshot.allManifests`
+        // and leads to high memory usage
+        return ManifestLists.read(fileIO.newInputFile(manifestListLocation));
+    }
+}