From 41fe94c47b900f09b27f753e73fcd8845dcf0419 Mon Sep 17 00:00:00 2001 From: chenjian2664 Date: Mon, 13 Apr 2026 11:26:19 +0800 Subject: [PATCH 01/10] Move MetadataAndProtocolEntries to out package --- .../java/io/trino/plugin/deltalake/DeltaLakeMetadata.java | 2 +- .../{checkpoint => }/MetadataAndProtocolEntries.java | 5 +---- .../deltalake/transactionlog/TransactionLogAccess.java | 1 - .../deltalake/transactionlog/TransactionLogEntries.java | 1 - .../io/trino/plugin/deltalake/TestTransactionLogAccess.java | 2 +- 5 files changed, 3 insertions(+), 8 deletions(-) rename plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/{checkpoint => }/MetadataAndProtocolEntries.java (87%) diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java index 9e7b1ab30b66..9482156b87af 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java @@ -69,6 +69,7 @@ import io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.ColumnMappingMode; import io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.UnsupportedTypeException; import io.trino.plugin.deltalake.transactionlog.DeltaLakeTransactionLogEntry; +import io.trino.plugin.deltalake.transactionlog.MetadataAndProtocolEntries; import io.trino.plugin.deltalake.transactionlog.MetadataEntry; import io.trino.plugin.deltalake.transactionlog.ProtocolEntry; import io.trino.plugin.deltalake.transactionlog.RemoveFileEntry; @@ -78,7 +79,6 @@ import io.trino.plugin.deltalake.transactionlog.TransactionLogEntries; import io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointWriterManager; import io.trino.plugin.deltalake.transactionlog.checkpoint.LastCheckpoint; -import 
io.trino.plugin.deltalake.transactionlog.checkpoint.MetadataAndProtocolEntries; import io.trino.plugin.deltalake.transactionlog.checkpoint.TransactionLogTail; import io.trino.plugin.deltalake.transactionlog.reader.FileSystemTransactionLogReader; import io.trino.plugin.deltalake.transactionlog.reader.TransactionLogReader; diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/checkpoint/MetadataAndProtocolEntries.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/MetadataAndProtocolEntries.java similarity index 87% rename from plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/checkpoint/MetadataAndProtocolEntries.java rename to plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/MetadataAndProtocolEntries.java index 857778630f26..26ac00de5f5d 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/checkpoint/MetadataAndProtocolEntries.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/MetadataAndProtocolEntries.java @@ -11,10 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.plugin.deltalake.transactionlog.checkpoint; - -import io.trino.plugin.deltalake.transactionlog.MetadataEntry; -import io.trino.plugin.deltalake.transactionlog.ProtocolEntry; +package io.trino.plugin.deltalake.transactionlog; import java.util.Optional; diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java index f9f40dab4cc7..f5cd50dd77af 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java @@ -44,7 +44,6 @@ import io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator; import io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager; import io.trino.plugin.deltalake.transactionlog.checkpoint.LastCheckpoint; -import io.trino.plugin.deltalake.transactionlog.checkpoint.MetadataAndProtocolEntries; import io.trino.plugin.deltalake.transactionlog.reader.TransactionLogReader; import io.trino.plugin.deltalake.transactionlog.reader.TransactionLogReaderFactory; import io.trino.plugin.hive.parquet.ParquetReaderConfig; diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogEntries.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogEntries.java index 5559aaa43651..d1c3530160f3 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogEntries.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogEntries.java @@ -20,7 +20,6 @@ import io.trino.filesystem.TrinoFileSystem; import io.trino.filesystem.TrinoInputFile; import io.trino.filesystem.TrinoInputStream; -import 
io.trino.plugin.deltalake.transactionlog.checkpoint.MetadataAndProtocolEntries; import io.trino.spi.TrinoException; import java.io.BufferedReader; diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java index 0505ea7b7c96..b43d09cc254f 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java @@ -26,12 +26,12 @@ import io.trino.plugin.base.metrics.FileFormatDataSourceStats; import io.trino.plugin.deltalake.metastore.NoOpVendedCredentialsProvider; import io.trino.plugin.deltalake.transactionlog.AddFileEntry; +import io.trino.plugin.deltalake.transactionlog.MetadataAndProtocolEntries; import io.trino.plugin.deltalake.transactionlog.MetadataEntry; import io.trino.plugin.deltalake.transactionlog.ProtocolEntry; import io.trino.plugin.deltalake.transactionlog.TableSnapshot; import io.trino.plugin.deltalake.transactionlog.TransactionLogAccess; import io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager; -import io.trino.plugin.deltalake.transactionlog.checkpoint.MetadataAndProtocolEntries; import io.trino.plugin.deltalake.transactionlog.reader.FileSystemTransactionLogReaderFactory; import io.trino.plugin.deltalake.transactionlog.statistics.DeltaLakeFileStatistics; import io.trino.plugin.hive.parquet.ParquetReaderConfig; From 60b81a994fb7b915c0e4cd58e29ce6f4f295de88 Mon Sep 17 00:00:00 2001 From: chenjian2664 Date: Mon, 13 Apr 2026 11:39:29 +0800 Subject: [PATCH 02/10] Introduce builder for `MetadataAndProtocolEntries` Simplify the code by adding a Builder to MetadataAndProtocolEntries to incrementally collect `MetadataEntry` and `ProtocolEntry`. 
--- .../MetadataAndProtocolEntries.java | 70 +++++++++++++++++-- .../transactionlog/TableSnapshot.java | 36 +++++++--- .../transactionlog/TransactionLogAccess.java | 40 +++++------ .../transactionlog/TransactionLogEntries.java | 16 ++--- 4 files changed, 113 insertions(+), 49 deletions(-) diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/MetadataAndProtocolEntries.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/MetadataAndProtocolEntries.java index 26ac00de5f5d..707686f4bcba 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/MetadataAndProtocolEntries.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/MetadataAndProtocolEntries.java @@ -22,15 +22,75 @@ public record MetadataAndProtocolEntries(Optional metadata, Optio { private static final int INSTANCE_SIZE = instanceSize(MetadataAndProtocolEntries.class); - public MetadataAndProtocolEntries(MetadataEntry metadata, ProtocolEntry protocol) - { - this(Optional.ofNullable(metadata), Optional.ofNullable(protocol)); - } - public long getRetainedSizeInBytes() { return INSTANCE_SIZE + sizeOf(metadata, MetadataEntry::getRetainedSizeInBytes) + sizeOf(protocol, ProtocolEntry::getRetainedSizeInBytes); } + + public static Builder builder() + { + return new Builder(); + } + + public static final class Builder + { + private Optional metadataEntry = Optional.empty(); + private Optional protocolEntry = Optional.empty(); + + public boolean hasMetadata() + { + return metadataEntry.isPresent(); + } + + public boolean hasProtocol() + { + return protocolEntry.isPresent(); + } + + public boolean isFull() + { + return hasMetadata() && hasProtocol(); + } + + public Builder withMetadataEntry(MetadataEntry metadataEntry) + { + this.metadataEntry = Optional.of(metadataEntry); + return this; + } + + public Builder withProtocolEntry(ProtocolEntry protocolEntry) + { + this.protocolEntry 
= Optional.of(protocolEntry); + return this; + } + + public Builder withEntries(MetadataAndProtocolEntries entries) + { + if (!hasMetadata() && entries.metadata().isPresent()) { + withMetadataEntry(entries.metadata().get()); + } + if (!hasProtocol() && entries.protocol().isPresent()) { + withProtocolEntry(entries.protocol().get()); + } + return this; + } + + public Builder withTransactionLogEntry(DeltaLakeTransactionLogEntry transactionLogEntry) + { + if (metadataEntry.isEmpty() && transactionLogEntry.getMetaData() != null) { + withMetadataEntry(transactionLogEntry.getMetaData()); + } + if (protocolEntry.isEmpty() && transactionLogEntry.getProtocol() != null) { + withProtocolEntry(transactionLogEntry.getProtocol()); + } + return this; + } + + public MetadataAndProtocolEntries build() + { + return new MetadataAndProtocolEntries(metadataEntry, protocolEntry); + } + } } diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TableSnapshot.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TableSnapshot.java index b26e0e263486..a5979a4fa449 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TableSnapshot.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TableSnapshot.java @@ -81,8 +81,7 @@ public class TableSnapshot private final boolean checkpointRowStatisticsWritingEnabled; private final int domainCompactionThreshold; - private Optional cachedMetadata = Optional.empty(); - private Optional cachedProtocol = Optional.empty(); + private MetadataAndProtocolEntries cachedMetadataAndProtocolEntries = MetadataAndProtocolEntries.builder().build(); private TableSnapshot( SchemaTableName table, @@ -181,12 +180,12 @@ public SchemaTableName getTable() public Optional getCachedMetadata() { - return cachedMetadata; + return cachedMetadataAndProtocolEntries.metadata(); } public Optional getCachedProtocol() { - return cachedProtocol; 
+ return cachedMetadataAndProtocolEntries.protocol(); } public String getTableLocation() @@ -194,14 +193,36 @@ public String getTableLocation() return tableLocation; } + public void updateCachedEntries(MetadataAndProtocolEntries updateEntries) + { + this.cachedMetadataAndProtocolEntries = MetadataAndProtocolEntries.builder() + .withEntries(updateEntries) + .withEntries(this.cachedMetadataAndProtocolEntries) + .build(); + } + public void setCachedMetadata(Optional cachedMetadata) { - this.cachedMetadata = cachedMetadata; + if (cachedMetadata.isEmpty()) { + return; + } + + cachedMetadataAndProtocolEntries = MetadataAndProtocolEntries.builder() + .withMetadataEntry(cachedMetadata.get()) + .withEntries(cachedMetadataAndProtocolEntries) + .build(); } public void setCachedProtocol(Optional cachedProtocol) { - this.cachedProtocol = cachedProtocol; + if (cachedProtocol.isEmpty()) { + return; + } + + cachedMetadataAndProtocolEntries = MetadataAndProtocolEntries.builder() + .withProtocolEntry(cachedProtocol.get()) + .withEntries(cachedMetadataAndProtocolEntries) + .build(); } public List getJsonTransactionLogEntries(TrinoFileSystem fileSystem) @@ -221,8 +242,7 @@ public long getRetainedSizeInBytes() + table.getRetainedSizeInBytes() + logTail.getRetainedSizeInBytes() + estimatedSizeOf(tableLocation) - + sizeOf(cachedMetadata, MetadataEntry::getRetainedSizeInBytes) - + sizeOf(cachedProtocol, ProtocolEntry::getRetainedSizeInBytes); + + cachedMetadataAndProtocolEntries.getRetainedSizeInBytes(); } public Stream getCheckpointTransactionLogEntries( diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java index f5cd50dd77af..82fc653c607d 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java +++ 
b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java @@ -504,49 +504,41 @@ else if (tableSnapshot.getCachedProtocol().isEmpty()) { private MetadataAndProtocolEntries getLatestMetadataAndProtocolEntry(ConnectorSession session, TrinoFileSystem fileSystem, TableSnapshot tableSnapshot) { - Optional latestMetadataEntry = Optional.empty(); - Optional latestProtocolEntry = Optional.empty(); + MetadataAndProtocolEntries.Builder builder = MetadataAndProtocolEntries.builder(); for (Transaction transaction : tableSnapshot.getTransactions().reversed()) { MetadataAndProtocolEntries metadataAndProtocol = transaction.transactionEntries().getMetadataAndProtocol(fileSystem); - if (latestMetadataEntry.isEmpty() && metadataAndProtocol.metadata().isPresent()) { - latestMetadataEntry = metadataAndProtocol.metadata(); - } - if (latestProtocolEntry.isEmpty() && metadataAndProtocol.protocol().isPresent()) { - latestProtocolEntry = metadataAndProtocol.protocol(); - } - if (latestMetadataEntry.isPresent() && latestProtocolEntry.isPresent()) { - tableSnapshot.setCachedMetadata(latestMetadataEntry); - tableSnapshot.setCachedProtocol(latestProtocolEntry); - return new MetadataAndProtocolEntries(latestMetadataEntry, latestProtocolEntry); + builder.withEntries(metadataAndProtocol); + if (builder.isFull()) { + break; } } + if (builder.isFull()) { + MetadataAndProtocolEntries entries = builder.build(); + tableSnapshot.updateCachedEntries(entries); + return entries; + } + MetadataAndProtocolEntries checkpointEntries = getCheckpointEntry( session, tableSnapshot, ImmutableSet.of(METADATA, PROTOCOL), checkpointStream -> { - MetadataEntry metadataEntry = null; - ProtocolEntry protocolEntry = null; for (Iterator it = checkpointStream.iterator(); it.hasNext(); ) { DeltaLakeTransactionLogEntry transactionLogEntry = it.next(); - if (transactionLogEntry.getMetaData() != null) { - metadataEntry = transactionLogEntry.getMetaData(); - } - if 
(transactionLogEntry.getProtocol() != null) { - protocolEntry = transactionLogEntry.getProtocol(); + builder.withTransactionLogEntry(transactionLogEntry); + if (builder.isFull()) { + return Optional.of(builder.build()); } } - return Optional.of(new MetadataAndProtocolEntries(metadataEntry, protocolEntry)); + return Optional.of(builder.build()); }, fileSystem, fileFormatDataSourceStats) .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata and protocol entry not found in transaction log for table " + tableSnapshot.getTable())); - tableSnapshot.setCachedMetadata(latestMetadataEntry.or(checkpointEntries::metadata)); - tableSnapshot.setCachedProtocol(latestProtocolEntry.or(checkpointEntries::protocol)); - - return new MetadataAndProtocolEntries(tableSnapshot.getCachedMetadata(), tableSnapshot.getCachedProtocol()); + tableSnapshot.updateCachedEntries(checkpointEntries); + return checkpointEntries; } public ProtocolEntry getProtocolEntry(ConnectorSession session, TrinoFileSystem fileSystem, TableSnapshot tableSnapshot) diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogEntries.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogEntries.java index d1c3530160f3..6ebeffe267dd 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogEntries.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogEntries.java @@ -94,23 +94,15 @@ public MetadataAndProtocolEntries getMetadataAndProtocol(TrinoFileSystem fileSys { // There can be at most one metadata and protocol entry per transaction log // We use that stop reading from file when a metadata and protocol entry are found + MetadataAndProtocolEntries.Builder builder = MetadataAndProtocolEntries.builder(); try (Stream logEntryStream = getEntries(fileSystem)) { - Optional metadataEntry = Optional.empty(); - Optional 
protocolEntry = Optional.empty(); for (Iterator it = logEntryStream.iterator(); it.hasNext(); ) { - DeltaLakeTransactionLogEntry transactionLogEntry = it.next(); - if (transactionLogEntry.getMetaData() != null) { - metadataEntry = Optional.of(transactionLogEntry.getMetaData()); - } - else if (transactionLogEntry.getProtocol() != null) { - protocolEntry = Optional.of(transactionLogEntry.getProtocol()); - } - - if (protocolEntry.isPresent() && metadataEntry.isPresent()) { + builder.withTransactionLogEntry(it.next()); + if (builder.isFull()) { break; } } - return new MetadataAndProtocolEntries(metadataEntry, protocolEntry); + return builder.build(); } } From 04846c8742ab8cfb8918bf6c52420fd9a6794242 Mon Sep 17 00:00:00 2001 From: chenjian2664 Date: Mon, 13 Apr 2026 11:54:43 +0800 Subject: [PATCH 03/10] Read only missing entries from checkpoint --- .../transactionlog/TransactionLogAccess.java | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java index 82fc653c607d..71c6a51ac121 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java @@ -81,6 +81,7 @@ import java.util.regex.Pattern; import java.util.stream.Stream; +import static com.google.common.base.Preconditions.checkState; import static com.google.common.base.Predicates.alwaysTrue; import static com.google.common.base.Throwables.throwIfUnchecked; import static com.google.common.collect.ImmutableList.toImmutableList; @@ -522,7 +523,7 @@ private MetadataAndProtocolEntries getLatestMetadataAndProtocolEntry(ConnectorSe MetadataAndProtocolEntries checkpointEntries = getCheckpointEntry( session, tableSnapshot, - 
                    ImmutableSet.of(METADATA, PROTOCOL), +                    resolveCheckpointReadTypes(builder),                     checkpointStream -> {                         for (Iterator it = checkpointStream.iterator(); it.hasNext(); ) {                             DeltaLakeTransactionLogEntry transactionLogEntry = it.next(); @@ -541,6 +542,22 @@ private MetadataAndProtocolEntries getLatestMetadataAndProtocolEntry(ConnectorSe         return checkpointEntries;     }  +    // Resolve checkpoint entry types to read based on which entries are missing for current builder, +    // i.e. if the builder already hasMetadata, then we could skip reading it. +    private static Set resolveCheckpointReadTypes(MetadataAndProtocolEntries.Builder builder) +    { +        checkState(!builder.isFull(), "Builder is already full"); +        ImmutableSet.Builder typesForRead = ImmutableSet.builder(); +        if (!builder.hasMetadata()) { +            typesForRead.add(METADATA); +        } +        if (!builder.hasProtocol()) { +            typesForRead.add(PROTOCOL); +        } + +        return typesForRead.build(); +    } +     public ProtocolEntry getProtocolEntry(ConnectorSession session, TrinoFileSystem fileSystem, TableSnapshot tableSnapshot)     {         if (tableSnapshot.getCachedProtocol().isEmpty()) { From 42c827500a80250d4da1d4b3c69b5df5af3565f0 Mon Sep 17 00:00:00 2001 From: chenjian2664 Date: Mon, 13 Apr 2026 12:32:48 +0800 Subject: [PATCH 04/10] Simplify `getMetadataAndProtocolEntry` --- .../transactionlog/TableSnapshot.java        |  5 +++++ .../transactionlog/TransactionLogAccess.java | 19 ++++--------------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TableSnapshot.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TableSnapshot.java index a5979a4fa449..e2b0f4a9b4f5 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TableSnapshot.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TableSnapshot.java @@ -193,6 +193,11 @@ public String getTableLocation()         return tableLocation;     }  + 
public MetadataAndProtocolEntries getCachedEntries() + { + return cachedMetadataAndProtocolEntries; + } + public void updateCachedEntries(MetadataAndProtocolEntries updateEntries) { this.cachedMetadataAndProtocolEntries = MetadataAndProtocolEntries.builder() diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java index 71c6a51ac121..523bcb37e02c 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java @@ -489,23 +489,12 @@ private record FileEntryKey(String path, Optional deletionVectorId) {} public MetadataAndProtocolEntries getMetadataAndProtocolEntry(ConnectorSession session, TrinoFileSystem fileSystem, TableSnapshot tableSnapshot) { - if (tableSnapshot.getCachedMetadata().isEmpty() && tableSnapshot.getCachedProtocol().isEmpty()) { - return getLatestMetadataAndProtocolEntry(session, fileSystem, tableSnapshot); - } - - if (tableSnapshot.getCachedMetadata().isEmpty()) { - getMetadataEntry(session, fileSystem, tableSnapshot); - } - else if (tableSnapshot.getCachedProtocol().isEmpty()) { - getProtocolEntry(session, fileSystem, tableSnapshot); + MetadataAndProtocolEntries.Builder builder = MetadataAndProtocolEntries.builder() + .withEntries(tableSnapshot.getCachedEntries()); + if (builder.isFull()) { + return builder.build(); } - return new MetadataAndProtocolEntries(tableSnapshot.getCachedMetadata(), tableSnapshot.getCachedProtocol()); - } - - private MetadataAndProtocolEntries getLatestMetadataAndProtocolEntry(ConnectorSession session, TrinoFileSystem fileSystem, TableSnapshot tableSnapshot) - { - MetadataAndProtocolEntries.Builder builder = MetadataAndProtocolEntries.builder(); for (Transaction transaction : 
tableSnapshot.getTransactions().reversed()) { MetadataAndProtocolEntries metadataAndProtocol = transaction.transactionEntries().getMetadataAndProtocol(fileSystem); builder.withEntries(metadataAndProtocol); From 11c91995381099963754f3dcd04b3f39e79cd29a Mon Sep 17 00:00:00 2001 From: chenjian2664 Date: Mon, 13 Apr 2026 13:06:21 +0800 Subject: [PATCH 05/10] Remove stale code and comment --- .../checkpoint/CheckpointWriterManager.java | 10 ---------- .../plugin/deltalake/TestDeltaLakeSplitManager.java | 1 - 2 files changed, 11 deletions(-) diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/checkpoint/CheckpointWriterManager.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/checkpoint/CheckpointWriterManager.java index 6b13a17c0793..98022ababd4f 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/checkpoint/CheckpointWriterManager.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/checkpoint/CheckpointWriterManager.java @@ -28,7 +28,6 @@ import io.trino.plugin.deltalake.transactionlog.DeltaLakeTransactionLogEntry; import io.trino.plugin.deltalake.transactionlog.TableSnapshot; import io.trino.plugin.deltalake.transactionlog.TableSnapshot.MetadataAndProtocolEntry; -import io.trino.plugin.deltalake.transactionlog.TransactionLogAccess; import io.trino.spi.NodeVersion; import io.trino.spi.TrinoException; import io.trino.spi.connector.ConnectorSession; @@ -63,7 +62,6 @@ public class CheckpointWriterManager private final CheckpointSchemaManager checkpointSchemaManager; private final DeltaLakeFileSystemFactory fileSystemFactory; private final String trinoVersion; - private final TransactionLogAccess transactionLogAccess; private final FileFormatDataSourceStats fileFormatDataSourceStats; private final JsonCodec lastCheckpointCodec; private final Executor executorService; @@ -75,7 +73,6 @@ public CheckpointWriterManager( 
CheckpointSchemaManager checkpointSchemaManager, DeltaLakeFileSystemFactory fileSystemFactory, NodeVersion nodeVersion, - TransactionLogAccess transactionLogAccess, FileFormatDataSourceStats fileFormatDataSourceStats, JsonCodec lastCheckpointCodec, DeltaLakeConfig deltaLakeConfig, @@ -85,7 +82,6 @@ public CheckpointWriterManager( this.checkpointSchemaManager = requireNonNull(checkpointSchemaManager, "checkpointSchemaManager is null"); this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); this.trinoVersion = nodeVersion.toString(); - this.transactionLogAccess = requireNonNull(transactionLogAccess, "transactionLogAccess is null"); this.fileFormatDataSourceStats = requireNonNull(fileFormatDataSourceStats, "fileFormatDataSourceStats is null"); this.lastCheckpointCodec = requireNonNull(lastCheckpointCodec, "lastCheckpointCodec is null"); this.executorService = requireNonNull(executorService, "ExecutorService is null"); @@ -125,12 +121,6 @@ public void writeCheckpoint(ConnectorSession session, TableSnapshot snapshot, Ve } if (!checkpointLogEntries.isEmpty()) { - // TODO HACK: this call is required only to ensure that cachedMetadataEntry is set in snapshot (https://github.com/trinodb/trino/issues/12032), - // so we can read add entries below this should be reworked so we pass metadata entry explicitly to getCheckpointTransactionLogEntries, - // and we should get rid of `setCachedMetadata` in TableSnapshot to make it immutable. - // Also more proper would be to use metadata entry obtained above in snapshot.getCheckpointTransactionLogEntries to read other checkpoint entries, but using newer one should not do harm. 
- transactionLogAccess.getMetadataEntry(session, fileSystem, snapshot); - // register metadata entry in writer DeltaLakeTransactionLogEntry metadataLogEntry = checkpointLogEntries.stream() .filter(logEntry -> logEntry.getMetaData() != null) diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeSplitManager.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeSplitManager.java index 12cb207d0168..5d99949ee86a 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeSplitManager.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeSplitManager.java @@ -208,7 +208,6 @@ public Stream getActiveFiles( new CheckpointSchemaManager(typeManager), new DefaultDeltaLakeFileSystemFactory(HDFS_FILE_SYSTEM_FACTORY, new NoOpVendedCredentialsProvider()), new NodeVersion("test_version"), - transactionLogAccess, new FileFormatDataSourceStats(), jsonCodec(LastCheckpoint.class), new DeltaLakeConfig(), From 11024e3a478a944f590315dd73117bf3ada169ca Mon Sep 17 00:00:00 2001 From: chenjian2664 Date: Mon, 13 Apr 2026 14:45:56 +0800 Subject: [PATCH 06/10] Avoid multiple traversals of transaction log entries Avoid multiple traversals of transaction log entries by using `getMetadataAndProtocolEntry` to retrieve both metadata and protocol entries in a single pass, instead of fetching them individually and scanning the log multiple times. 
--- .../plugin/deltalake/DeltaLakeMetadata.java | 7 +- .../deltalake/DeltaLakePartitionsTable.java | 6 +- .../deltalake/DeltaLakePropertiesTable.java | 11 ++- .../transactionlog/TableSnapshot.java | 12 --- .../transactionlog/TransactionLogAccess.java | 29 ------ .../deltalake/TestTransactionLogAccess.java | 92 +++++++++++++------ .../deltalake/TestingDeltaLakeUtils.java | 10 +- .../transactionlog/TestTableSnapshot.java | 9 +- 8 files changed, 98 insertions(+), 78 deletions(-) diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java index 9482156b87af..27080fa2448d 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java @@ -1122,8 +1122,11 @@ public Iterator streamRelationColumns( TransactionLogReader transactionLogReader = transactionLogReaderFactory.createReader(deltaMetastoreTable); VendedCredentialsHandle credentialsHandle = VendedCredentialsHandle.of(deltaMetastoreTable); TableSnapshot snapshot = transactionLogAccess.loadSnapshot(session, transactionLogReader, tableName, tableLocation, Optional.empty(), credentialsHandle); - MetadataEntry metadata = transactionLogAccess.getMetadataEntry(session, fileSystem, snapshot); - ProtocolEntry protocol = transactionLogAccess.getProtocolEntry(session, fileSystem, snapshot); + MetadataAndProtocolEntries metadataAndProtocolEntries = transactionLogAccess.getMetadataAndProtocolEntry(session, fileSystem, snapshot); + MetadataEntry metadata = metadataAndProtocolEntries.metadata() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + snapshot.getTable())); + ProtocolEntry protocol = metadataAndProtocolEntries.protocol() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found 
in transaction log for " + snapshot.getTable())); List columnMetadata = getTableColumnMetadata(metadata, protocol); enqueueUpdateInfo(session, table.getDatabaseName(), table.getTableName(), snapshot.getVersion(), metadata.getSchemaString(), Optional.ofNullable(metadata.getDescription())); relationColumns.put(tableName, RelationColumnsMetadata.forTable(tableName, columnMetadata)); diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakePartitionsTable.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakePartitionsTable.java index ee313bc74d95..63de846b6887 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakePartitionsTable.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakePartitionsTable.java @@ -19,6 +19,7 @@ import io.trino.plugin.deltalake.metastore.DeltaMetastoreTable; import io.trino.plugin.deltalake.metastore.VendedCredentialsHandle; import io.trino.plugin.deltalake.transactionlog.AddFileEntry; +import io.trino.plugin.deltalake.transactionlog.MetadataAndProtocolEntries; import io.trino.plugin.deltalake.transactionlog.MetadataEntry; import io.trino.plugin.deltalake.transactionlog.ProtocolEntry; import io.trino.plugin.deltalake.transactionlog.TableSnapshot; @@ -104,8 +105,9 @@ public DeltaLakePartitionsTable( } TrinoFileSystem fileSystem = fileSystemFactory.create(session, table); - this.metadataEntry = transactionLogAccess.getMetadataEntry(session, fileSystem, tableSnapshot); - this.protocolEntry = transactionLogAccess.getProtocolEntry(session, fileSystem, tableSnapshot); + MetadataAndProtocolEntries metadataAndProtocolEntries = transactionLogAccess.getMetadataAndProtocolEntry(session, fileSystem, tableSnapshot); + this.metadataEntry = metadataAndProtocolEntries.metadata().orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + table.schemaTableName())); + this.protocolEntry = 
metadataAndProtocolEntries.protocol().orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + table.schemaTableName())); this.schema = extractSchema(metadataEntry, protocolEntry, typeManager); this.partitionColumns = getPartitionColumns(); diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakePropertiesTable.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakePropertiesTable.java index e55f0cfe9be1..3640562382b3 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakePropertiesTable.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakePropertiesTable.java @@ -17,6 +17,7 @@ import com.google.common.collect.ImmutableSet; import io.trino.filesystem.TrinoFileSystem; import io.trino.plugin.deltalake.metastore.DeltaMetastoreTable; +import io.trino.plugin.deltalake.transactionlog.MetadataAndProtocolEntries; import io.trino.plugin.deltalake.transactionlog.MetadataEntry; import io.trino.plugin.deltalake.transactionlog.ProtocolEntry; import io.trino.plugin.deltalake.transactionlog.TableSnapshot; @@ -37,6 +38,7 @@ import java.util.List; import java.util.Optional; +import static io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_INVALID_SCHEMA; import static io.trino.spi.type.VarcharType.VARCHAR; import static java.util.Objects.requireNonNull; @@ -86,11 +88,14 @@ public ConnectorPageSource pageSource(ConnectorTransactionHandle transactionHand TrinoFileSystem fileSystem = fileSystemFactory.create(session, table); try { TableSnapshot tableSnapshot = transactionLogAccess.loadSnapshot(session, table, Optional.empty()); - metadataEntry = transactionLogAccess.getMetadataEntry(session, fileSystem, tableSnapshot); - protocolEntry = transactionLogAccess.getProtocolEntry(session, fileSystem, tableSnapshot); + MetadataAndProtocolEntries metadataAndProtocolEntries = 
transactionLogAccess.getMetadataAndProtocolEntry(session, fileSystem, tableSnapshot); + metadataEntry = metadataAndProtocolEntries.metadata() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + table.schemaTableName())); + protocolEntry = metadataAndProtocolEntries.protocol() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + table.schemaTableName())); } catch (IOException e) { - throw new TrinoException(DeltaLakeErrorCode.DELTA_LAKE_INVALID_SCHEMA, "Unable to load table metadata from location: " + table.location(), e); + throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Unable to load table metadata from location: " + table.location(), e); } return new FixedPageSource(buildPages(metadataEntry, protocolEntry)); diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TableSnapshot.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TableSnapshot.java index e2b0f4a9b4f5..cab0e8489857 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TableSnapshot.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TableSnapshot.java @@ -218,18 +218,6 @@ public void setCachedMetadata(Optional cachedMetadata) .build(); } - public void setCachedProtocol(Optional cachedProtocol) - { - if (cachedProtocol.isEmpty()) { - return; - } - - cachedMetadataAndProtocolEntries = MetadataAndProtocolEntries.builder() - .withProtocolEntry(cachedProtocol.get()) - .withEntries(cachedMetadataAndProtocolEntries) - .build(); - } - public List getJsonTransactionLogEntries(TrinoFileSystem fileSystem) { return logTail.getFileEntries(fileSystem); diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java 
b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java index 523bcb37e02c..42dd21452c3f 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java @@ -547,35 +547,6 @@ private static Set resolveCheckpointReadTypes return typesForRead.build(); } - public ProtocolEntry getProtocolEntry(ConnectorSession session, TrinoFileSystem fileSystem, TableSnapshot tableSnapshot) - { - if (tableSnapshot.getCachedProtocol().isEmpty()) { - Optional latestProtocolEntry = tableSnapshot.getTransactions().reversed().stream() - .map(transaction -> transaction.transactionEntries().getMetadataAndProtocol(fileSystem)) - .map(MetadataAndProtocolEntries::protocol) - .filter(Optional::isPresent) - .map(Optional::get) - .findFirst(); - - if (latestProtocolEntry.isEmpty()) { - latestProtocolEntry = getCheckpointEntry( - session, - tableSnapshot, - ImmutableSet.of(PROTOCOL), - checkpointStream -> checkpointStream - .map(DeltaLakeTransactionLogEntry::getProtocol) - .filter(Objects::nonNull) - .reduce((_, second) -> second), - fileSystem, - fileFormatDataSourceStats); - } - - tableSnapshot.setCachedProtocol(latestProtocolEntry); - } - return tableSnapshot.getCachedProtocol() - .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol entry not found in transaction log for table " + tableSnapshot.getTable())); - } - private Optional getCheckpointEntry( ConnectorSession session, TableSnapshot tableSnapshot, diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java index b43d09cc254f..10e1224bafd2 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java +++ 
b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java @@ -36,6 +36,7 @@ import io.trino.plugin.deltalake.transactionlog.statistics.DeltaLakeFileStatistics; import io.trino.plugin.hive.parquet.ParquetReaderConfig; import io.trino.plugin.hive.parquet.ParquetWriterConfig; +import io.trino.spi.TrinoException; import io.trino.spi.connector.ColumnMetadata; import io.trino.spi.connector.ConnectorSession; import io.trino.spi.connector.SchemaTableName; @@ -75,6 +76,7 @@ import static io.trino.filesystem.tracing.FileSystemAttributes.FILE_LOCATION; import static io.trino.hdfs.HdfsTestUtils.HDFS_FILE_SYSTEM_FACTORY; import static io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR; +import static io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_INVALID_SCHEMA; import static io.trino.plugin.deltalake.DeltaTestingConnectorSession.SESSION; import static io.trino.plugin.deltalake.TestingDeltaLakeUtils.createTable; import static io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.extractColumnMetadata; @@ -185,8 +187,11 @@ public void testGetActiveAddEntries() { setupTransactionLogAccessFromResources("person", "databricks73/person"); - MetadataEntry metadataEntry = transactionLogAccess.getMetadataEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); - ProtocolEntry protocolEntry = transactionLogAccess.getProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataAndProtocolEntries metadataAndProtocolEntries = transactionLogAccess.getMetadataAndProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataEntry metadataEntry = metadataAndProtocolEntries.metadata() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable())); + ProtocolEntry protocolEntry = metadataAndProtocolEntries.protocol() + .orElseThrow(() -> new 
TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable())); List addFileEntries; try (Stream addFileEntriesStream = transactionLogAccess.getActiveFiles(SESSION, createTable(metadataEntry, protocolEntry), tableSnapshot, TupleDomain.all(), alwaysTrue())) { addFileEntries = addFileEntriesStream.collect(toImmutableList()); @@ -224,8 +229,11 @@ public void testAddFileEntryUppercase() { setupTransactionLogAccessFromResources("uppercase_columns", "databricks73/uppercase_columns"); - MetadataEntry metadataEntry = transactionLogAccess.getMetadataEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); - ProtocolEntry protocolEntry = transactionLogAccess.getProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataAndProtocolEntries metadataAndProtocolEntries = transactionLogAccess.getMetadataAndProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataEntry metadataEntry = metadataAndProtocolEntries.metadata() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable())); + ProtocolEntry protocolEntry = metadataAndProtocolEntries.protocol() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable())); try (Stream addFileEntries = transactionLogAccess.getActiveFiles(SESSION, createTable(metadataEntry, protocolEntry), tableSnapshot, TupleDomain.all(), alwaysTrue())) { AddFileEntry addFileEntry = addFileEntries @@ -249,8 +257,11 @@ public void testAddEntryPruning() // - Added in the parquet checkpoint but removed in a JSON commit // - Added in a JSON commit and removed in a later JSON commit setupTransactionLogAccessFromResources("person_test_pruning", "databricks73/person_test_pruning"); - MetadataEntry metadataEntry = 
transactionLogAccess.getMetadataEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); - ProtocolEntry protocolEntry = transactionLogAccess.getProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataAndProtocolEntries metadataAndProtocolEntries = transactionLogAccess.getMetadataAndProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataEntry metadataEntry = metadataAndProtocolEntries.metadata() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable())); + ProtocolEntry protocolEntry = metadataAndProtocolEntries.protocol() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable())); try (Stream addFileEntries = transactionLogAccess.getActiveFiles(SESSION, createTable(metadataEntry, protocolEntry), tableSnapshot, TupleDomain.all(), alwaysTrue())) { Set paths = addFileEntries .map(AddFileEntry::getPath) @@ -266,8 +277,11 @@ public void testAddEntryOverrides() throws Exception { setupTransactionLogAccessFromResources("person_test_pruning", "databricks73/person_test_pruning"); - MetadataEntry metadataEntry = transactionLogAccess.getMetadataEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); - ProtocolEntry protocolEntry = transactionLogAccess.getProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataAndProtocolEntries metadataAndProtocolEntries = transactionLogAccess.getMetadataAndProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataEntry metadataEntry = metadataAndProtocolEntries.metadata() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable())); + 
ProtocolEntry protocolEntry = metadataAndProtocolEntries.protocol() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable())); List addFileEntries; try (Stream addFileEntryStream = transactionLogAccess.getActiveFiles(SESSION, createTable(metadataEntry, protocolEntry), tableSnapshot, TupleDomain.all(), alwaysTrue())) { addFileEntries = addFileEntryStream.collect(toImmutableList()); @@ -292,8 +306,11 @@ public void testAddRemoveAdd() throws Exception { setupTransactionLogAccessFromResources("person_test_pruning", "databricks73/person_test_pruning"); - MetadataEntry metadataEntry = transactionLogAccess.getMetadataEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); - ProtocolEntry protocolEntry = transactionLogAccess.getProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataAndProtocolEntries metadataAndProtocolEntries = transactionLogAccess.getMetadataAndProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataEntry metadataEntry = metadataAndProtocolEntries.metadata() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable())); + ProtocolEntry protocolEntry = metadataAndProtocolEntries.protocol() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable())); try (Stream addFileEntries = transactionLogAccess.getActiveFiles(SESSION, createTable(metadataEntry, protocolEntry), tableSnapshot, TupleDomain.all(), alwaysTrue())) { // Test data contains an entry added by the parquet checkpoint, removed by a JSON action, and then added back by a later JSON action List activeEntries = addFileEntries @@ -374,8 +391,11 @@ private void testAllGetActiveAddEntries(String tableName, String resourcePath) throws 
Exception { setupTransactionLogAccessFromResources(tableName, resourcePath); - MetadataEntry metadataEntry = transactionLogAccess.getMetadataEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); - ProtocolEntry protocolEntry = transactionLogAccess.getProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataAndProtocolEntries metadataAndProtocolEntries = transactionLogAccess.getMetadataAndProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataEntry metadataEntry = metadataAndProtocolEntries.metadata() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable())); + ProtocolEntry protocolEntry = metadataAndProtocolEntries.protocol() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable())); try (Stream addFileEntries = transactionLogAccess.getActiveFiles(SESSION, createTable(metadataEntry, protocolEntry), tableSnapshot, TupleDomain.all(), alwaysTrue())) { Set paths = addFileEntries .map(AddFileEntry::getPath) @@ -400,7 +420,9 @@ private void testGetProtocolEntry(String tableName, String resourcePath) { setupTransactionLogAccessFromResources(tableName, resourcePath); - ProtocolEntry protocolEntry = transactionLogAccess.getProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + ProtocolEntry protocolEntry = transactionLogAccess.getMetadataAndProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot) + .protocol() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable())); assertThat(protocolEntry.minReaderVersion()).isEqualTo(1); assertThat(protocolEntry.minWriterVersion()).isEqualTo(2); } @@ -445,8 +467,11 @@ public void 
testUpdatingTailEntriesNoCheckpoint() File resourceDir = new File(getClass().getClassLoader().getResource("databricks73/person/_delta_log").toURI()); copyTransactionLogEntry(0, 7, resourceDir, transactionLogDir); setupTransactionLogAccess(tableName, tableDir.toURI().toString()); - MetadataEntry metadataEntry = transactionLogAccess.getMetadataEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); - ProtocolEntry protocolEntry = transactionLogAccess.getProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataAndProtocolEntries metadataAndProtocolEntries = transactionLogAccess.getMetadataAndProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataEntry metadataEntry = metadataAndProtocolEntries.metadata() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable())); + ProtocolEntry protocolEntry = metadataAndProtocolEntries.protocol() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable())); try (Stream activeDataFiles = transactionLogAccess.getActiveFiles(SESSION, createTable(metadataEntry, protocolEntry), tableSnapshot, TupleDomain.all(), alwaysTrue())) { Set dataFiles = ImmutableSet.of( "age=42/part-00000-b82d8859-84a0-4f05-872c-206b07dd54f0.c000.snappy.parquet", @@ -489,8 +514,11 @@ public void testLoadingTailEntriesPastCheckpoint() copyTransactionLogEntry(0, 8, resourceDir, transactionLogDir); setupTransactionLogAccess(tableName, tableDir.toURI().toString()); - MetadataEntry metadataEntry = transactionLogAccess.getMetadataEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); - ProtocolEntry protocolEntry = transactionLogAccess.getProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + 
MetadataAndProtocolEntries metadataAndProtocolEntries = transactionLogAccess.getMetadataAndProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataEntry metadataEntry = metadataAndProtocolEntries.metadata() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable())); + ProtocolEntry protocolEntry = metadataAndProtocolEntries.protocol() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable())); try (Stream activeDataFiles = transactionLogAccess.getActiveFiles(SESSION, createTable(metadataEntry, protocolEntry), tableSnapshot, TupleDomain.all(), alwaysTrue())) { Set dataFiles = ImmutableSet.of( "age=21/part-00000-3d546786-bedc-407f-b9f7-e97aa12cce0f.c000.snappy.parquet", @@ -545,8 +573,11 @@ public void testIncrementalCacheUpdates() new ParquetWriterConfig()) .getSessionProperties()) .build(); - MetadataEntry metadataEntry = transactionLogAccess.getMetadataEntry(activeDataFileCacheSession, tracingFileSystemFactory.create(activeDataFileCacheSession, tableLocation), tableSnapshot); - ProtocolEntry protocolEntry = transactionLogAccess.getProtocolEntry(activeDataFileCacheSession, tracingFileSystemFactory.create(activeDataFileCacheSession, tableLocation), tableSnapshot); + MetadataAndProtocolEntries metadataAndProtocolEntries = transactionLogAccess.getMetadataAndProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataEntry metadataEntry = metadataAndProtocolEntries.metadata() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable())); + ProtocolEntry protocolEntry = metadataAndProtocolEntries.protocol() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + 
tableSnapshot.getTable())); Set originalDataFiles = ImmutableSet.of( "age=42/part-00000-b26c891a-7288-4d96-9d3b-bef648f12a34.c000.snappy.parquet", @@ -632,8 +663,11 @@ public void testSnapshotsAreConsistent() Files.copy(new File(resourceDir, LAST_CHECKPOINT_FILENAME).toPath(), new File(transactionLogDir, LAST_CHECKPOINT_FILENAME).toPath()); setupTransactionLogAccess(tableName, tableDir.toURI().toString()); - MetadataEntry metadataEntry = transactionLogAccess.getMetadataEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); - ProtocolEntry protocolEntry = transactionLogAccess.getProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataAndProtocolEntries metadataAndProtocolEntries = transactionLogAccess.getMetadataAndProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataEntry metadataEntry = metadataAndProtocolEntries.metadata() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable())); + ProtocolEntry protocolEntry = metadataAndProtocolEntries.protocol() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable())); List expectedDataFiles; try (Stream addFileEntryStream = transactionLogAccess.getActiveFiles(SESSION, createTable(metadataEntry, protocolEntry), tableSnapshot, TupleDomain.all(), alwaysTrue())) { expectedDataFiles = addFileEntryStream.collect(toImmutableList()); @@ -658,7 +692,7 @@ public void testSnapshotsAreConsistent() } assertThat(expectedDataFiles).hasSize(dataFilesWithFixedVersion.size()); - List columns = extractColumnMetadata(transactionLogAccess.getMetadataEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot), transactionLogAccess.getProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), 
tableSnapshot), TESTING_TYPE_MANAGER); + List columns = extractColumnMetadata(metadataEntry, protocolEntry, TESTING_TYPE_MANAGER); for (int i = 0; i < expectedDataFiles.size(); i++) { AddFileEntry expected = expectedDataFiles.get(i); AddFileEntry actual = dataFilesWithFixedVersion.get(i); @@ -719,8 +753,11 @@ public void testParquetStructStatistics() String tableName = "parquet_struct_statistics"; setupTransactionLogAccess(tableName, getClass().getClassLoader().getResource("databricks73/pruning/" + tableName).toURI().toString()); - MetadataEntry metadataEntry = transactionLogAccess.getMetadataEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); - ProtocolEntry protocolEntry = transactionLogAccess.getProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataAndProtocolEntries metadataAndProtocolEntries = transactionLogAccess.getMetadataAndProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataEntry metadataEntry = metadataAndProtocolEntries.metadata() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable())); + ProtocolEntry protocolEntry = metadataAndProtocolEntries.protocol() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable())); List addFileEntries; try (Stream addFileEntryStream = transactionLogAccess.getActiveFiles(SESSION, createTable(metadataEntry, protocolEntry), tableSnapshot, TupleDomain.all(), alwaysTrue())) { addFileEntries = addFileEntryStream.collect(toImmutableList()); @@ -805,8 +842,11 @@ public void testLoadSnapshotWithEndVersion() String tableName = "person"; String tableDir = getClass().getClassLoader().getResource("databricks73/" + tableName).toURI().toString(); setupTransactionLogAccess("person", 
getClass().getClassLoader().getResource("databricks73/person").toString(), new DeltaLakeConfig(), Optional.of(9L)); - MetadataEntry metadataEntry = transactionLogAccess.getMetadataEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); - ProtocolEntry protocolEntry = transactionLogAccess.getProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataAndProtocolEntries metadataAndProtocolEntries = transactionLogAccess.getMetadataAndProtocolEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); + MetadataEntry metadataEntry = metadataAndProtocolEntries.metadata() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable())); + ProtocolEntry protocolEntry = metadataAndProtocolEntries.protocol() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable())); // Version 10 has a checkpoint file transactionLogAccess.flushCache(); diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestingDeltaLakeUtils.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestingDeltaLakeUtils.java index fe535ca957c0..e02b5bbced30 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestingDeltaLakeUtils.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestingDeltaLakeUtils.java @@ -17,10 +17,12 @@ import com.google.common.collect.ImmutableMap; import io.trino.filesystem.TrinoFileSystemFactory; import io.trino.plugin.deltalake.transactionlog.AddFileEntry; +import io.trino.plugin.deltalake.transactionlog.MetadataAndProtocolEntries; import io.trino.plugin.deltalake.transactionlog.MetadataEntry; import io.trino.plugin.deltalake.transactionlog.ProtocolEntry; import io.trino.plugin.deltalake.transactionlog.TableSnapshot; import 
io.trino.plugin.deltalake.transactionlog.TransactionLogAccess; +import io.trino.spi.TrinoException; import io.trino.spi.connector.SchemaTableName; import io.trino.spi.predicate.TupleDomain; import io.trino.testing.PlanTester; @@ -35,6 +37,7 @@ import static com.google.common.base.Predicates.alwaysTrue; import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_INVALID_SCHEMA; import static io.trino.plugin.deltalake.DeltaLakeQueryRunner.DELTA_CATALOG; import static io.trino.plugin.deltalake.DeltaTestingConnectorSession.SESSION; import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; @@ -98,8 +101,11 @@ public static List getTableActiveFiles(TransactionLogAccess transa transactionLogAccess.flushCache(); TableSnapshot snapshot = transactionLogAccess.loadSnapshot(SESSION, createTable(dummyTable, tableLocation), Optional.empty()); - MetadataEntry metadataEntry = transactionLogAccess.getMetadataEntry(SESSION, fileSystemFactory.create(SESSION), snapshot); - ProtocolEntry protocolEntry = transactionLogAccess.getProtocolEntry(SESSION, fileSystemFactory.create(SESSION), snapshot); + MetadataAndProtocolEntries metadataAndProtocolEntries = transactionLogAccess.getMetadataAndProtocolEntry(SESSION, fileSystemFactory.create(SESSION), snapshot); + MetadataEntry metadataEntry = metadataAndProtocolEntries.metadata() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + snapshot.getTable())); + ProtocolEntry protocolEntry = metadataAndProtocolEntries.protocol() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + snapshot.getTable())); try (Stream addFileEntries = transactionLogAccess.getActiveFiles(SESSION, createTable(metadataEntry, protocolEntry), snapshot, TupleDomain.all(), alwaysTrue())) { return addFileEntries.collect(toImmutableList()); } diff --git 
a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/transactionlog/TestTableSnapshot.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/transactionlog/TestTableSnapshot.java index bd3be665cbcf..19ef74766a21 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/transactionlog/TestTableSnapshot.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/transactionlog/TestTableSnapshot.java @@ -31,6 +31,7 @@ import io.trino.plugin.deltalake.transactionlog.reader.FileSystemTransactionLogReader; import io.trino.plugin.deltalake.transactionlog.reader.FileSystemTransactionLogReaderFactory; import io.trino.plugin.hive.parquet.ParquetReaderConfig; +import io.trino.spi.TrinoException; import io.trino.spi.connector.SchemaTableName; import io.trino.spi.predicate.TupleDomain; import io.trino.spi.type.TypeManager; @@ -54,6 +55,7 @@ import static io.trino.filesystem.tracing.FileSystemAttributes.FILE_LOCATION; import static io.trino.hdfs.HdfsTestUtils.HDFS_FILE_SYSTEM_FACTORY; import static io.trino.plugin.deltalake.DeltaLakeConfig.DEFAULT_TRANSACTION_LOG_MAX_CACHED_SIZE; +import static io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_INVALID_SCHEMA; import static io.trino.plugin.deltalake.transactionlog.TableSnapshot.MetadataAndProtocolEntry; import static io.trino.plugin.deltalake.transactionlog.TableSnapshot.load; import static io.trino.plugin.deltalake.transactionlog.TransactionLogParser.readLastCheckpoint; @@ -158,8 +160,11 @@ public void readsCheckpointFile() executorService, new FileSystemTransactionLogReaderFactory(tracingFileSystemFactory)); TrinoFileSystem fileSystem = tracingFileSystemFactory.create(SESSION, tableLocation); - MetadataEntry metadataEntry = transactionLogAccess.getMetadataEntry(SESSION, fileSystem, tableSnapshot); - ProtocolEntry protocolEntry = transactionLogAccess.getProtocolEntry(SESSION, fileSystem, tableSnapshot); + MetadataAndProtocolEntries metadataAndProtocolEntries = 
transactionLogAccess.getMetadataAndProtocolEntry(SESSION, fileSystem, tableSnapshot); + MetadataEntry metadataEntry = metadataAndProtocolEntries.metadata() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable())); + ProtocolEntry protocolEntry = metadataAndProtocolEntries.protocol() + .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable())); tableSnapshot.setCachedMetadata(Optional.of(metadataEntry)); try (Stream stream = tableSnapshot.getCheckpointTransactionLogEntries( SESSION, From 94ddf53a482c92c2b07433f83d3181d7096fa9e7 Mon Sep 17 00:00:00 2001 From: chenjian2664 Date: Mon, 13 Apr 2026 14:49:22 +0800 Subject: [PATCH 07/10] Remove redundant call in `TestTransactionLogAccess` --- .../io/trino/plugin/deltalake/TestTransactionLogAccess.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java index 10e1224bafd2..21b2e184316d 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java @@ -337,7 +337,6 @@ private void testAllGetMetadataEntry(String tableName, String resourcePath) { setupTransactionLogAccessFromResources(tableName, resourcePath); - transactionLogAccess.getMetadataEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); MetadataEntry metadataEntry = transactionLogAccess.getMetadataEntry(SESSION, tracingFileSystemFactory.create(SESSION, tableLocation), tableSnapshot); assertThat(metadataEntry.getOriginalPartitionColumns()).containsOnly("age"); @@ -363,7 +362,6 @@ private void testGetMetadataAndProtocolEntry(String tableName, 
String resourcePa setupTransactionLogAccessFromResources(tableName, resourcePath); TrinoFileSystem fileSystem = tracingFileSystemFactory.create(SESSION, tableLocation); - transactionLogAccess.getMetadataAndProtocolEntry(SESSION, fileSystem, tableSnapshot); MetadataAndProtocolEntries logEntries = transactionLogAccess.getMetadataAndProtocolEntry(SESSION, fileSystem, tableSnapshot); MetadataEntry metadataEntry = logEntries.metadata().orElseThrow(); From 439f1745303bf19bceb104637476ef20b6faf0fe Mon Sep 17 00:00:00 2001 From: chenjian2664 Date: Tue, 14 Apr 2026 11:50:35 +0800 Subject: [PATCH 08/10] Store extended statistics filename in Delta Lake `CommitInfoEntry` Previously, extended statistics were always written to a fixed path (`_delta_log/_trino_meta/extended_stats.json`), which made updates non-atomic with respect to the transaction log: a crash between writing the stats file and committing the transaction could leave stale or inconsistent statistics visible to readers. This change records the extended statistics filename in the `commitInfo`, when updating extended stats, we first write the statistics file, then leverage the commit log mechanism to handle concurrent conflicts. 
--- .../CorruptedDeltaLakeTableHandle.java | 4 +- .../deltalake/DeltaLakeInsertTableHandle.java | 5 +- .../plugin/deltalake/DeltaLakeMetadata.java | 262 ++++++++++-------- .../deltalake/DeltaLakeTableHandle.java | 35 ++- .../plugin/deltalake/LocatedTableHandle.java | 4 + .../procedure/DeltaTableOptimizeHandle.java | 17 +- .../procedure/DropExtendedStatsProcedure.java | 3 +- .../CachingExtendedStatisticsAccess.java | 31 ++- .../statistics/ExtendedStatisticsAccess.java | 23 +- .../FileBasedTableStatisticsProvider.java | 3 +- .../statistics/MetaDirStatisticsAccess.java | 44 ++- .../transactionlog/CommitInfoEntry.java | 8 +- .../MetadataAndProtocolEntries.java | 27 +- .../transactionlog/TransactionLogAccess.java | 41 ++- ...stDeltaLakeAlluxioCacheFileOperations.java | 109 +++++--- .../deltalake/TestDeltaLakeMetadata.java | 3 +- ...DeltaLakeNodeLocalDynamicSplitPruning.java | 6 +- .../deltalake/TestDeltaLakeSplitManager.java | 3 +- .../deltalake/TestTransactionLogAccess.java | 3 +- .../deltalake/TestingDeltaLakeUtils.java | 6 +- ...aLakeFileBasedTableStatisticsProvider.java | 14 +- .../TestMetaDirStatisticsAccess.java | 184 ++++++++++++ 22 files changed, 615 insertions(+), 220 deletions(-) create mode 100644 plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/statistics/TestMetaDirStatisticsAccess.java diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/CorruptedDeltaLakeTableHandle.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/CorruptedDeltaLakeTableHandle.java index 6c9dc6839d8e..3542cc5215a1 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/CorruptedDeltaLakeTableHandle.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/CorruptedDeltaLakeTableHandle.java @@ -26,7 +26,8 @@ public record CorruptedDeltaLakeTableHandle( boolean catalogOwned, boolean managed, String location, - TrinoException originalException) + TrinoException originalException, + Optional 
extendedStatsFile) implements LocatedTableHandle { public CorruptedDeltaLakeTableHandle @@ -34,6 +35,7 @@ public record CorruptedDeltaLakeTableHandle( requireNonNull(schemaTableName, "schemaTableName is null"); requireNonNull(location, "location is null"); requireNonNull(originalException, "originalException is null"); + requireNonNull(extendedStatsFile, "extendedStatsFile is null"); } public TrinoException createException() diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeInsertTableHandle.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeInsertTableHandle.java index 60f703ef0454..41047d9fe1f1 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeInsertTableHandle.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeInsertTableHandle.java @@ -21,6 +21,7 @@ import io.trino.spi.connector.SchemaTableName; import java.util.List; +import java.util.Optional; import static java.util.Objects.requireNonNull; @@ -32,7 +33,8 @@ public record DeltaLakeInsertTableHandle( List inputColumns, long readVersion, boolean retriesEnabled, - VendedCredentialsHandle credentialsHandle) + VendedCredentialsHandle credentialsHandle, + Optional extendedStatisticsFile) implements ConnectorInsertTableHandle { public DeltaLakeInsertTableHandle @@ -43,6 +45,7 @@ public record DeltaLakeInsertTableHandle( inputColumns = ImmutableList.copyOf(requireNonNull(inputColumns, "inputColumns is null")); requireNonNull(location, "location is null"); requireNonNull(credentialsHandle, "credentialsHandle is null"); + requireNonNull(extendedStatisticsFile, "extendedStatisticsFile is null"); } @Override diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java index 27080fa2448d..27da4958adcd 100644 --- 
a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java @@ -268,6 +268,7 @@ import static io.trino.plugin.deltalake.metastore.HiveMetastoreBackedDeltaLakeMetastore.convertToDeltaMetastoreTable; import static io.trino.plugin.deltalake.metastore.HiveMetastoreBackedDeltaLakeMetastore.verifyDeltaLakeTable; import static io.trino.plugin.deltalake.procedure.DeltaLakeTableProcedureId.OPTIMIZE; +import static io.trino.plugin.deltalake.statistics.MetaDirStatisticsAccess.STATISTICS_FILE; import static io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.APPEND_ONLY_CONFIGURATION_KEY; import static io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.COLUMN_MAPPING_PHYSICAL_NAME_CONFIGURATION_KEY; import static io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.ColumnMappingMode.ID; @@ -401,6 +402,7 @@ public class DeltaLakeMetadata public static final String OPTIMIZE_OPERATION = "OPTIMIZE"; public static final String SET_TBLPROPERTIES_OPERATION = "SET TBLPROPERTIES"; public static final String CHANGE_COLUMN_OPERATION = "CHANGE COLUMN"; + public static final String ANALYZE = "ANALYZE"; public static final int DEFAULT_READER_VERSION = 1; public static final int DEFAULT_WRITER_VERSION = 2; // The highest reader and writer versions Trino supports @@ -727,18 +729,30 @@ public LocatedTableHandle getTableHandle( } catch (TrinoException e) { if (e.getErrorCode().equals(DELTA_LAKE_INVALID_SCHEMA.toErrorCode())) { - return new CorruptedDeltaLakeTableHandle(tableName, table.catalogOwned(), managed, tableLocation, e); + return new CorruptedDeltaLakeTableHandle(tableName, table.catalogOwned(), managed, tableLocation, e, Optional.empty()); } throw e; } MetadataEntry metadataEntry = logEntries.metadata().orElse(null); if (metadataEntry == null) { - return new CorruptedDeltaLakeTableHandle(tableName, table.catalogOwned(), managed, 
tableLocation, new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable())); + return new CorruptedDeltaLakeTableHandle( + tableName, + table.catalogOwned(), + managed, + tableLocation, + new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable()), + logEntries.commitInfo().flatMap(CommitInfoEntry::extendedStatsFile)); } ProtocolEntry protocolEntry = logEntries.protocol().orElse(null); if (protocolEntry == null) { - return new CorruptedDeltaLakeTableHandle(tableName, table.catalogOwned(), managed, tableLocation, new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable())); + return new CorruptedDeltaLakeTableHandle( + tableName, + table.catalogOwned(), + managed, + tableLocation, + new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Protocol not found in transaction log for " + tableSnapshot.getTable()), + logEntries.commitInfo().flatMap(CommitInfoEntry::extendedStatsFile)); } if (protocolEntry.minReaderVersion() > MAX_READER_VERSION) { LOG.debug("Skip %s because the reader version is unsupported: %d", tableName, protocolEntry.minReaderVersion()); @@ -768,7 +782,8 @@ public LocatedTableHandle getTableHandle( Optional.empty(), Optional.empty(), tableSnapshot.getVersion(), - endVersion.isPresent()); + endVersion.isPresent(), + logEntries.commitInfo().flatMap(CommitInfoEntry::extendedStatsFile)); } @Override @@ -1357,7 +1372,7 @@ public void createTable(ConnectorSession session, ConnectorTableMetadata tableMe } } protocolEntry = protocolEntryForTable(ProtocolEntry.builder(tableHandle.getProtocolEntry()), containsTimestampType, tableMetadata.getProperties()); - statisticsAccess.deleteExtendedStatistics(session, schemaTableName, location, tableHandle.toCredentialsHandle()); + statisticsAccess.deleteExtendedStatistics(session, schemaTableName, location, 
tableHandle.extendedStatsFile().orElse(STATISTICS_FILE), tableHandle.toCredentialsHandle()); } else { setRollback(() -> deleteRecursivelyIfExists(fileSystem, deltaLogDirectory)); @@ -1376,7 +1391,8 @@ public void createTable(ConnectorSession session, ConnectorTableMetadata tableMe saveMode == SaveMode.REPLACE ? CREATE_OR_REPLACE_TABLE_OPERATION : CREATE_TABLE_OPERATION, session, protocolEntry, - metadataEntry); + metadataEntry, + Optional.empty()); transactionLogWriter.flush(); @@ -1731,6 +1747,30 @@ public Optional finishCreateTable( } transactionLogWriter = transactionLogWriterFactory.createFileSystemWriter(session, location, VendedCredentialsHandle.empty(location)); } + + Optional extendedStatsFile = Optional.empty(); + if (isCollectExtendedStatisticsColumnStatisticsOnWrite(session) && !computedStatistics.isEmpty()) { + Optional maxFileModificationTime = dataFileInfos.stream() + .map(DataFileInfo::creationTime) + .max(Long::compare) + .map(Instant::ofEpochMilli); + Map physicalColumnMapping = DeltaLakeSchemaSupport.getColumnMetadata(schemaString, typeManager, columnMappingMode, handle.partitionedBy()).stream() + .map(e -> Map.entry(e.name(), e.physicalName())) + .collect(toImmutableMap(Entry::getKey, Entry::getValue)); + + extendedStatsFile = Optional.of(updateTableStatistics( + session, + Optional.empty(), + schemaTableName, + location, + handle.toCredentialsHandle(), + maxFileModificationTime, + computedStatistics, + columnNames, + Optional.of(physicalColumnMapping), + Optional.empty())); + } + appendTableEntries( commitVersion, transactionLogWriter, @@ -1742,7 +1782,8 @@ public Optional finishCreateTable( .setSchemaString(schemaString) .setPartitionColumns(handle.partitionedBy()) .setConfiguration(configurationForNewTable(handle.checkpointInterval(), handle.changeDataFeedEnabled(), handle.deletionVectorsEnabled(), columnMappingMode, handle.maxColumnId())) - .build()); + .build(), + extendedStatsFile); appendAddFileEntries(transactionLogWriter, dataFileInfos, 
physicalPartitionNames, columnNames, true); if (handle.readVersion().isPresent()) { long writeTimestamp = Instant.now().toEpochMilli(); @@ -1762,28 +1803,6 @@ public Optional finishCreateTable( writeCheckpointIfNeeded(session, schemaTableName, handle.location(), handle.toCredentialsHandle(), handle.readVersion().getAsLong(), handle.checkpointInterval(), commitVersion, handle.existingColumns(), Optional.of(handle.inputColumns())); } - if (isCollectExtendedStatisticsColumnStatisticsOnWrite(session) && !computedStatistics.isEmpty()) { - Optional maxFileModificationTime = dataFileInfos.stream() - .map(DataFileInfo::creationTime) - .max(Long::compare) - .map(Instant::ofEpochMilli); - Map physicalColumnMapping = DeltaLakeSchemaSupport.getColumnMetadata(schemaString, typeManager, columnMappingMode, handle.partitionedBy()).stream() - .map(e -> Map.entry(e.name(), e.physicalName())) - .collect(toImmutableMap(Entry::getKey, Entry::getValue)); - - updateTableStatistics( - session, - Optional.empty(), - schemaTableName, - location, - handle.toCredentialsHandle(), - maxFileModificationTime, - computedStatistics, - columnNames, - Optional.of(physicalColumnMapping), - true); - } - Table table = buildTable(session, schemaTableName, location, handle.external(), handle.comment(), commitVersion, handle.schemaString()); PrincipalPrivileges principalPrivileges = buildInitialPrivilegeSet(table.getOwner().orElseThrow()); @@ -1844,7 +1863,8 @@ public void setTableComment(ConnectorSession session, ConnectorTableHandle table protocolEntry, MetadataEntry.builder(handle.getMetadataEntry()) .setDescription(comment) - .build()); + .build(), + handle.extendedStatsFile()); transactionLogWriter.flush(); enqueueUpdateInfo(session, handle.getSchemaName(), handle.getTableName(), commitVersion, metadataEntry.getSchemaString(), comment); } @@ -1884,7 +1904,8 @@ public void setColumnComment(ConnectorSession session, ConnectorTableHandle tabl protocolEntry, 
MetadataEntry.builder(deltaLakeTableHandle.getMetadataEntry()) .setSchemaString(schemaString) - .build()); + .build(), + deltaLakeTableHandle.extendedStatsFile()); transactionLogWriter.flush(); enqueueUpdateInfo( session, @@ -1987,7 +2008,8 @@ public void addColumn(ConnectorSession session, ConnectorTableHandle tableHandle MetadataEntry.builder(handle.getMetadataEntry()) .setSchemaString(schemaString) .setConfiguration(configuration) - .build()); + .build(), + handle.extendedStatsFile()); transactionLogWriter.flush(); enqueueUpdateInfo( session, @@ -2049,6 +2071,17 @@ public void dropColumn(ConnectorSession session, ConnectorTableHandle tableHandl throw new TrinoException(NOT_SUPPORTED, "Dropping the last non-partition column is unsupported"); } + Optional extendedStatsFile = statisticsAccess.readExtendedStatistics(session, table.getSchemaTableName(), table.getLocation(), table.extendedStatsFile().orElse(STATISTICS_FILE), table.toCredentialsHandle()).map(existingStatistics -> { + ExtendedStatistics statistics = new ExtendedStatistics( + existingStatistics.getAlreadyAnalyzedModifiedTimeMax(), + existingStatistics.getColumnStatistics().entrySet().stream() + .filter(stats -> !stats.getKey().equalsIgnoreCase(toPhysicalColumnName(dropColumnName, lowerCaseToExactColumnNames, Optional.of(physicalColumnNameMapping)))) + .collect(toImmutableMap(Entry::getKey, Entry::getValue)), + existingStatistics.getAnalyzedColumns() + .map(analyzedColumns -> analyzedColumns.stream().filter(column -> !column.equalsIgnoreCase(dropColumnName)).collect(toImmutableSet()))); + return statisticsAccess.updateExtendedStatistics(session, table.getSchemaTableName(), table.getLocation(), table.extendedStatsFile(), table.toCredentialsHandle(), statistics); + }); + String schemaString = serializeSchemaAsJson(deltaTable); try { TransactionLogWriter transactionLogWriter = transactionLogWriterFactory.createWriter(session, table); @@ -2060,29 +2093,14 @@ public void dropColumn(ConnectorSession session, 
ConnectorTableHandle tableHandl protocolEntry, MetadataEntry.builder(metadataEntry) .setSchemaString(schemaString) - .build()); + .build(), + extendedStatsFile); transactionLogWriter.flush(); enqueueUpdateInfo(session, table.getSchemaName(), table.getTableName(), commitVersion, schemaString, Optional.ofNullable(metadataEntry.getDescription())); } catch (Exception e) { throw new TrinoException(DELTA_LAKE_BAD_WRITE, format("Unable to drop '%s' column from: %s.%s", dropColumnName, table.getSchemaName(), table.getTableName()), e); } - - try { - statisticsAccess.readExtendedStatistics(session, table.getSchemaTableName(), table.getLocation(), table.toCredentialsHandle()).ifPresent(existingStatistics -> { - ExtendedStatistics statistics = new ExtendedStatistics( - existingStatistics.getAlreadyAnalyzedModifiedTimeMax(), - existingStatistics.getColumnStatistics().entrySet().stream() - .filter(stats -> !stats.getKey().equalsIgnoreCase(toPhysicalColumnName(dropColumnName, lowerCaseToExactColumnNames, Optional.of(physicalColumnNameMapping)))) - .collect(toImmutableMap(Entry::getKey, Entry::getValue)), - existingStatistics.getAnalyzedColumns() - .map(analyzedColumns -> analyzedColumns.stream().filter(column -> !column.equalsIgnoreCase(dropColumnName)).collect(toImmutableSet()))); - statisticsAccess.updateExtendedStatistics(session, table.getSchemaTableName(), table.getLocation(), table.toCredentialsHandle(), statistics); - }); - } - catch (Exception e) { - LOG.warn(e, "Failed to update extended statistics when dropping %s column from %s table", dropColumnName, table.schemaTableName()); - } } @Override @@ -2129,7 +2147,8 @@ public void renameColumn(ConnectorSession session, ConnectorTableHandle tableHan MetadataEntry.builder(metadataEntry) .setSchemaString(schemaString) .setPartitionColumns(partitionColumns) - .build()); + .build(), + table.extendedStatsFile()); transactionLogWriter.flush(); enqueueUpdateInfo(session, table.getSchemaName(), table.getTableName(), commitVersion, 
schemaString, Optional.ofNullable(metadataEntry.getDescription())); // Don't update extended statistics because it uses physical column names internally @@ -2167,7 +2186,8 @@ public void dropNotNullConstraint(ConnectorSession session, ConnectorTableHandle protocolEntry, MetadataEntry.builder(metadataEntry) .setSchemaString(schemaString) - .build()); + .build(), + table.extendedStatsFile()); transactionLogWriter.flush(); enqueueUpdateInfo(session, table.getSchemaName(), table.getTableName(), commitVersion, schemaString, Optional.ofNullable(metadataEntry.getDescription())); } @@ -2182,9 +2202,10 @@ private void appendTableEntries( String operation, ConnectorSession session, ProtocolEntry protocolEntry, - MetadataEntry metadataEntry) + MetadataEntry metadataEntry, + Optional extendedStatsFile) { - transactionLogWriter.appendCommitInfoEntry(getCommitInfoEntry(session, IsolationLevel.WRITESERIALIZABLE, commitVersion, metadataEntry.getCreatedTime(), operation, 0, true)); + transactionLogWriter.appendCommitInfoEntry(getCommitInfoEntry(session, IsolationLevel.WRITESERIALIZABLE, commitVersion, metadataEntry.getCreatedTime(), operation, 0, true, extendedStatsFile)); transactionLogWriter.appendProtocolEntry(protocolEntry); transactionLogWriter.appendMetadataEntry(metadataEntry); @@ -2278,7 +2299,8 @@ private DeltaLakeInsertTableHandle createInsertHandle(RetryMode retryMode, Delta inputColumns, table.getReadVersion(), retryMode != NO_RETRIES, - table.toCredentialsHandle()); + table.toCredentialsHandle(), + table.extendedStatsFile()); } private void checkAllColumnsPassedOnInsert(ConnectorTableMetadata tableMetadata, List insertColumns) @@ -2315,35 +2337,16 @@ public Optional finishInsert( cleanExtraOutputFiles(fileSystemFactory.create(session, handle.credentialsHandle()), session.getQueryId(), Location.of(handle.location()), dataFileInfos); } + Optional extendedStatsFile = getExtendedStatsFileForInsert(session, computedStatistics, dataFileInfos, handle); boolean writeCommitted 
= false; try { IsolationLevel isolationLevel = getIsolationLevel(handle.metadataEntry()); AtomicReference readVersion = new AtomicReference<>(handle.readVersion()); long commitVersion = Failsafe.with(TRANSACTION_CONFLICT_RETRY_POLICY) - .get(context -> commitInsertOperation(session, handle, sourceTableHandles, isolationLevel, dataFileInfos, readVersion, context.getAttemptCount())); + .get(context -> commitInsertOperation(session, handle, sourceTableHandles, isolationLevel, dataFileInfos, readVersion, context.getAttemptCount(), extendedStatsFile)); writeCommitted = true; writeCheckpointIfNeeded(session, handle.tableName(), handle.location(), handle.credentialsHandle(), handle.readVersion(), handle.metadataEntry().getCheckpointInterval(), commitVersion, Optional.empty(), Optional.empty()); enqueueUpdateInfo(session, handle.tableName().getSchemaName(), handle.tableName().getTableName(), commitVersion, handle.metadataEntry().getSchemaString(), Optional.ofNullable(handle.metadataEntry().getDescription())); - - if (isCollectExtendedStatisticsColumnStatisticsOnWrite(session) && !computedStatistics.isEmpty() && !dataFileInfos.isEmpty()) { - // TODO (https://github.com/trinodb/trino/issues/16088) Add synchronization when version conflict for INSERT is resolved. 
- Optional maxFileModificationTime = dataFileInfos.stream() - .map(DataFileInfo::creationTime) - .max(Long::compare) - .map(Instant::ofEpochMilli); - updateTableStatistics( - session, - Optional.empty(), - handle.tableName(), - handle.location(), - handle.credentialsHandle(), - maxFileModificationTime, - computedStatistics, - getExactColumnNames(handle.metadataEntry()), - Optional.of(extractSchema(handle.metadataEntry(), handle.protocolEntry(), typeManager).stream() - .collect(toImmutableMap(DeltaLakeColumnMetadata::name, DeltaLakeColumnMetadata::physicalName))), - true); - } } catch (Exception e) { if (!writeCommitted) { @@ -2356,6 +2359,31 @@ public Optional finishInsert( return Optional.empty(); } + private Optional getExtendedStatsFileForInsert(ConnectorSession session, Collection computedStatistics, List dataFileInfos, DeltaLakeInsertTableHandle handle) + { + Optional extendedStatsFile = Optional.empty(); + if (isCollectExtendedStatisticsColumnStatisticsOnWrite(session) && !computedStatistics.isEmpty() && !dataFileInfos.isEmpty()) { + // TODO (https://github.com/trinodb/trino/issues/16088) Add synchronization when version conflict for INSERT is resolved. 
+ Optional maxFileModificationTime = dataFileInfos.stream() + .map(DataFileInfo::creationTime) + .max(Long::compare) + .map(Instant::ofEpochMilli); + extendedStatsFile = Optional.of(updateTableStatistics( + session, + Optional.empty(), + handle.tableName(), + handle.location(), + handle.credentialsHandle(), + maxFileModificationTime, + computedStatistics, + getExactColumnNames(handle.metadataEntry()), + Optional.of(extractSchema(handle.metadataEntry(), handle.protocolEntry(), typeManager).stream() + .collect(toImmutableMap(DeltaLakeColumnMetadata::name, DeltaLakeColumnMetadata::physicalName))), + handle.extendedStatisticsFile())); + } + return extendedStatsFile; + } + private long commitInsertOperation( ConnectorSession session, DeltaLakeInsertTableHandle handle, @@ -2363,7 +2391,8 @@ private long commitInsertOperation( IsolationLevel isolationLevel, List dataFileInfos, AtomicReference readVersion, - int attemptCount) + int attemptCount, + Optional extendedStatsFile) throws IOException { TrinoFileSystem fileSystem = fileSystemFactory.create(session, handle.credentialsHandle()); @@ -2375,7 +2404,7 @@ private long commitInsertOperation( .collect(toImmutableList()); checkForConcurrentTransactionConflicts(session, fileSystem, enforcedSourcePartitionConstraints, isolationLevel, currentVersion, readVersion, handle.location(), attemptCount); long commitVersion = currentVersion + 1; - writeTransactionLogForInsertOperation(session, handle, sameAsTargetSourceTableHandles.isEmpty(), isolationLevel, dataFileInfos, commitVersion, currentVersion); + writeTransactionLogForInsertOperation(session, handle, sameAsTargetSourceTableHandles.isEmpty(), isolationLevel, dataFileInfos, commitVersion, currentVersion, extendedStatsFile); return commitVersion; } @@ -2516,12 +2545,13 @@ private void writeTransactionLogForInsertOperation( IsolationLevel isolationLevel, List dataFileInfos, long commitVersion, - long currentVersion) + long currentVersion, + Optional extendedStatsFile) throws 
IOException { // it is not obvious why we need to persist this readVersion TransactionLogWriter transactionLogWriter = transactionLogWriterFactory.createWriter(session, insertTableHandle.location(), insertTableHandle.metadataEntry(), insertTableHandle.protocolEntry(), insertTableHandle.credentialsHandle()); - transactionLogWriter.appendCommitInfoEntry(getCommitInfoEntry(session, isolationLevel, commitVersion, Instant.now().toEpochMilli(), INSERT_OPERATION, currentVersion, isBlindAppend)); + transactionLogWriter.appendCommitInfoEntry(getCommitInfoEntry(session, isolationLevel, commitVersion, Instant.now().toEpochMilli(), INSERT_OPERATION, currentVersion, isBlindAppend, extendedStatsFile)); ColumnMappingMode columnMappingMode = getColumnMappingMode(insertTableHandle.metadataEntry(), insertTableHandle.protocolEntry()); List partitionColumns = getPartitionColumns( @@ -2756,7 +2786,8 @@ private long commitMergeOperation( checkForConcurrentTransactionConflicts(session, fileSystem, enforcedSourcePartitionConstraints, isolationLevel, currentVersion, readVersion, handle.getLocation(), attemptCount); long commitVersion = currentVersion + 1; - transactionLogWriter.appendCommitInfoEntry(getCommitInfoEntry(session, isolationLevel, commitVersion, createdTime, MERGE_OPERATION, handle.getReadVersion(), sameAsTargetSourceTableHandles.isEmpty())); + // TODO: we should update the extended stats for MERGE operation + transactionLogWriter.appendCommitInfoEntry(getCommitInfoEntry(session, isolationLevel, commitVersion, createdTime, MERGE_OPERATION, handle.getReadVersion(), sameAsTargetSourceTableHandles.isEmpty(), handle.extendedStatsFile())); // TODO: Delta writes another field "operationMetrics" (https://github.com/trinodb/trino/issues/12005) long writeTimestamp = Instant.now().toEpochMilli(); @@ -2879,7 +2910,8 @@ private Optional getTableHandleForOptimize(DeltaLak Optional.empty(), retryMode != NO_RETRIES, tableHandle.getEnforcedPartitionConstraint(), - 
tableHandle.toCredentialsHandle()), + tableHandle.toCredentialsHandle(), + tableHandle.extendedStatsFile()), tableHandle.getLocation())); } @@ -3032,7 +3064,8 @@ private long commitOptimizeOperation( createdTime, OPTIMIZE_OPERATION, optimizeHandle.getCurrentVersion().orElseThrow(() -> new IllegalArgumentException("currentVersion not set")), - false)); + false, + optimizeHandle.getExtendedStatsFile())); // TODO: Delta writes another field "operationMetrics" that I haven't // seen before. It contains delete/update metrics. Investigate/include it. @@ -3336,7 +3369,8 @@ private CommitInfoEntry getCommitInfoEntry( long createdTime, String operation, long readVersion, - boolean isBlindAppend) + boolean isBlindAppend, + Optional extendedStatsFile) { return new CommitInfoEntry( commitVersion, @@ -3352,7 +3386,8 @@ private CommitInfoEntry getCommitInfoEntry( readVersion, isolationLevel.getValue(), Optional.of(isBlindAppend), - ImmutableMap.of()); + ImmutableMap.of(), + extendedStatsFile.or(() -> Optional.of(STATISTICS_FILE))); } @Override @@ -3396,7 +3431,7 @@ public void setTableProperties(ConnectorSession session, ConnectorTableHandle ta try { TransactionLogWriter transactionLogWriter = transactionLogWriterFactory.createWriter(session, handle); - transactionLogWriter.appendCommitInfoEntry(getCommitInfoEntry(session, IsolationLevel.WRITESERIALIZABLE, commitVersion, createdTime, SET_TBLPROPERTIES_OPERATION, readVersion, true)); + transactionLogWriter.appendCommitInfoEntry(getCommitInfoEntry(session, IsolationLevel.WRITESERIALIZABLE, commitVersion, createdTime, SET_TBLPROPERTIES_OPERATION, readVersion, true, handle.extendedStatsFile())); protocolEntry.ifPresent(transactionLogWriter::appendProtocolEntry); metadataEntry.ifPresent(transactionLogWriter::appendMetadataEntry); @@ -3589,7 +3624,8 @@ else if (!partitionColumns.contains(column)) { false, Optional.empty(), tableHandle.getReadVersion(), - tableHandle.isTimeTravel()); + tableHandle.isTimeTravel(), + 
tableHandle.extendedStatsFile()); if (tableHandle.getEnforcedPartitionConstraint().equals(newHandle.getEnforcedPartitionConstraint()) && tableHandle.getNonPartitionConstraint().equals(newHandle.getNonPartitionConstraint()) && @@ -3824,7 +3860,7 @@ public ConnectorAnalyzeMetadata getStatisticsCollectionMetadata(ConnectorSession Optional statistics = Optional.empty(); if (analyzeMode == INCREMENTAL) { - statistics = statisticsAccess.readExtendedStatistics(session, handle.getSchemaTableName(), handle.getLocation(), handle.toCredentialsHandle()); + statistics = statisticsAccess.readExtendedStatistics(session, handle.getSchemaTableName(), handle.getLocation(), handle.extendedStatsFile().orElse(STATISTICS_FILE), handle.toCredentialsHandle()); } Optional alreadyAnalyzedModifiedTimeMax = statistics.map(ExtendedStatistics::getAlreadyAnalyzedModifiedTimeMax); @@ -3878,7 +3914,8 @@ public ConnectorAnalyzeMetadata getStatisticsCollectionMetadata(ConnectorSession Optional.empty(), Optional.of(analyzeHandle), handle.getReadVersion(), - handle.isTimeTravel()); + handle.isTimeTravel(), + handle.extendedStatsFile()); TableStatisticsMetadata statisticsMetadata = getStatisticsCollectionMetadata( columnsMetadata.stream().map(DeltaLakeColumnMetadata::columnMetadata).collect(toImmutableList()), analyzeColumnNames.orElse(allColumnNames), @@ -3906,7 +3943,8 @@ public TableStatisticsMetadata getStatisticsCollectionMetadataForWrite(Connector LocatedTableHandle table = getTableHandle(session, tableMetadata.getTable(), Optional.empty(), Optional.empty()); VendedCredentialsHandle credentialsHandle = Optional.ofNullable(table).map(LocatedTableHandle::toCredentialsHandle) .orElseGet(() -> VendedCredentialsHandle.empty(tableLocation)); - existingStatistics = statisticsAccess.readExtendedStatistics(session, tableMetadata.getTable(), tableLocation, credentialsHandle); + Optional extendedStatsFile = Optional.ofNullable(table).flatMap(LocatedTableHandle::extendedStatsFile); + existingStatistics = 
statisticsAccess.readExtendedStatistics(session, tableMetadata.getTable(), tableLocation, extendedStatsFile.orElse(STATISTICS_FILE), credentialsHandle); analyzeColumnNames = existingStatistics.flatMap(ExtendedStatistics::getAnalyzedColumns); } @@ -3997,7 +4035,7 @@ public void finishStatisticsCollection(ConnectorSession session, ConnectorTableH Optional maxFileModificationTime = getMaxFileModificationTime(computedStatistics); Map physicalColumnNameMapping = extractSchema(tableHandle.getMetadataEntry(), tableHandle.getProtocolEntry(), typeManager).stream() .collect(toImmutableMap(DeltaLakeColumnMetadata::name, DeltaLakeColumnMetadata::physicalName)); - updateTableStatistics( + String extendedStatsFile = updateTableStatistics( session, Optional.of(analyzeHandle), tableHandle.getSchemaTableName(), @@ -4007,7 +4045,26 @@ public void finishStatisticsCollection(ConnectorSession session, ConnectorTableH computedStatistics, getExactColumnNames(tableHandle.getMetadataEntry()), Optional.of(physicalColumnNameMapping), - false); + tableHandle.extendedStatsFile()); + + try { + Failsafe.with(TRANSACTION_CONFLICT_RETRY_POLICY) + .run(() -> commitAnalyzeOperation(session, extendedStatsFile, tableHandle)); + } + catch (Exception e) { + throw new TrinoException(DELTA_LAKE_BAD_WRITE, format("Unable to analyze on table: %s", tableHandle.getSchemaName()), e); + } + } + + private void commitAnalyzeOperation(ConnectorSession session, String extendedStatsFile, DeltaLakeTableHandle tableHandle) + throws IOException + { + TransactionLogWriter writer = transactionLogWriterFactory.createWriter(session, tableHandle); + TrinoFileSystem fileSystem = fileSystemFactory.create(session, tableHandle); + long currentVersion = getMandatoryCurrentVersion(fileSystem, tableHandle.getLocation(), tableHandle.getReadVersion()); + writer.appendCommitInfoEntry( + getCommitInfoEntry(session, IsolationLevel.WRITESERIALIZABLE, currentVersion + 1, Instant.now().toEpochMilli(), ANALYZE, 
tableHandle.getReadVersion(), false, Optional.of(extendedStatsFile))); + writer.flush(); } private void generateMissingFileStatistics(ConnectorSession session, DeltaLakeTableHandle tableHandle, Collection computedStatistics) @@ -4060,7 +4117,7 @@ private void generateMissingFileStatistics(ConnectorSession session, DeltaLakeTa long readVersion = tableHandle.getReadVersion(); long commitVersion = readVersion + 1; TransactionLogWriter transactionLogWriter = transactionLogWriterFactory.createWriter(session, tableHandle); - transactionLogWriter.appendCommitInfoEntry(getCommitInfoEntry(session, IsolationLevel.WRITESERIALIZABLE, commitVersion, createdTime, OPTIMIZE_OPERATION, readVersion, true)); + transactionLogWriter.appendCommitInfoEntry(getCommitInfoEntry(session, IsolationLevel.WRITESERIALIZABLE, commitVersion, createdTime, OPTIMIZE_OPERATION, readVersion, true, tableHandle.extendedStatsFile())); updatedAddFileEntries.forEach(transactionLogWriter::appendAddFileEntry); transactionLogWriter.flush(); } @@ -4089,7 +4146,7 @@ private AddFileEntry prepareUpdatedAddFileEntry(ComputedStatistics stats, AddFil } } - private void updateTableStatistics( + private String updateTableStatistics( ConnectorSession session, Optional analyzeHandle, SchemaTableName schemaTableName, @@ -4099,12 +4156,12 @@ private void updateTableStatistics( Collection computedStatistics, List originalColumnNames, Optional> physicalColumnNameMapping, - boolean ignoreFailure) + Optional previousExtendedStatsFile) { Optional oldStatistics = Optional.empty(); boolean loadExistingStats = analyzeHandle.isEmpty() || analyzeHandle.get().analyzeMode() == INCREMENTAL; if (loadExistingStats) { - oldStatistics = statisticsAccess.readExtendedStatistics(session, schemaTableName, location, credentialsHandle); + oldStatistics = statisticsAccess.readExtendedStatistics(session, schemaTableName, location, previousExtendedStatsFile.orElse(STATISTICS_FILE), credentialsHandle); } // more elaborate logic for handling 
statistics model evaluation may need to be introduced in the future @@ -4170,18 +4227,7 @@ private void updateTableStatistics( mergedColumnStatistics, analyzedColumns); - try { - statisticsAccess.updateExtendedStatistics(session, schemaTableName, location, credentialsHandle, mergedExtendedStatistics); - } - catch (Exception e) { - if (ignoreFailure) { - // We can't fail here as transaction was already committed - LOG.error(e, "Failed to write extended statistics for the table %s", schemaTableName); - } - else { - throw e; - } - } + return statisticsAccess.updateExtendedStatistics(session, schemaTableName, location, previousExtendedStatsFile, credentialsHandle, mergedExtendedStatistics); } private static String toPhysicalColumnName(String columnName, Map lowerCaseToExactColumnNames, Optional> physicalColumnNameMapping) @@ -4394,7 +4440,7 @@ private CommitDeleteOperationResult commitDeleteOperation( long currentVersion = getMandatoryCurrentVersion(fileSystem, tableLocation, readVersion.get()); checkForConcurrentTransactionConflicts(session, fileSystem, ImmutableList.of(tableHandle.getEnforcedPartitionConstraint()), isolationLevel, currentVersion, readVersion, tableHandle.getLocation(), attemptCount); long commitVersion = currentVersion + 1; - transactionLogWriter.appendCommitInfoEntry(getCommitInfoEntry(session, isolationLevel, commitVersion, writeTimestamp, operation, tableHandle.getReadVersion(), false)); + transactionLogWriter.appendCommitInfoEntry(getCommitInfoEntry(session, isolationLevel, commitVersion, writeTimestamp, operation, tableHandle.getReadVersion(), false, tableHandle.extendedStatsFile())); Domain pathDomain = getPathDomain(tableHandle.getNonPartitionConstraint()); Domain fileModifiedDomain = getFileModifiedTimeDomain(tableHandle.getNonPartitionConstraint()); diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeTableHandle.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeTableHandle.java 
index 12af6a3dcf20..8f2049c27cd0 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeTableHandle.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeTableHandle.java @@ -57,6 +57,9 @@ public class DeltaLakeTableHandle // Used only for validation when config property delta.query-partition-filter-required is enabled. private final Set constraintColumns; + // For extended stats + private final Optional extendedStatsFile; + @JsonCreator public DeltaLakeTableHandle( @JsonProperty("schemaName") String schemaName, @@ -71,7 +74,8 @@ public DeltaLakeTableHandle( @JsonProperty("projectedColumns") Optional> projectedColumns, @JsonProperty("analyzeHandle") Optional analyzeHandle, @JsonProperty("readVersion") long readVersion, - @JsonProperty("timeTravel") boolean timeTravel) + @JsonProperty("timeTravel") boolean timeTravel, + @JsonProperty("extendedStatsFile") Optional extendedStatsFile) { this( schemaName, @@ -90,7 +94,8 @@ public DeltaLakeTableHandle( false, Optional.empty(), readVersion, - timeTravel); + timeTravel, + extendedStatsFile); } public DeltaLakeTableHandle( @@ -110,7 +115,8 @@ public DeltaLakeTableHandle( boolean isOptimize, Optional maxScannedFileSize, long readVersion, - boolean timeTravel) + boolean timeTravel, + Optional extendedStatsFile) { this.schemaName = requireNonNull(schemaName, "schemaName is null"); this.tableName = requireNonNull(tableName, "tableName is null"); @@ -129,6 +135,7 @@ public DeltaLakeTableHandle( this.readVersion = readVersion; this.timeTravel = timeTravel; this.constraintColumns = ImmutableSet.copyOf(requireNonNull(constraintColumns, "constraintColumns is null")); + this.extendedStatsFile = requireNonNull(extendedStatsFile, "extendedStatsFile is null"); } public DeltaLakeTableHandle withProjectedColumns(Set projectedColumns) @@ -150,7 +157,8 @@ public DeltaLakeTableHandle withProjectedColumns(Set proj isOptimize, maxScannedFileSize, readVersion, - timeTravel); + 
timeTravel, + extendedStatsFile); } public DeltaLakeTableHandle forOptimize(boolean recordScannedFiles, DataSize maxScannedFileSize) @@ -172,7 +180,8 @@ public DeltaLakeTableHandle forOptimize(boolean recordScannedFiles, DataSize max true, Optional.of(maxScannedFileSize), readVersion, - timeTravel); + timeTravel, + extendedStatsFile); } public DeltaLakeTableHandle forMerge() @@ -194,7 +203,8 @@ public DeltaLakeTableHandle forMerge() isOptimize, maxScannedFileSize, readVersion, - timeTravel); + timeTravel, + extendedStatsFile); } @Override @@ -329,6 +339,13 @@ public boolean isTimeTravel() return timeTravel; } + @Override + @JsonProperty("extendedStatsFile") + public Optional extendedStatsFile() + { + return extendedStatsFile; + } + @Override public String toString() { @@ -361,7 +378,8 @@ public boolean equals(Object o) isOptimize == that.isOptimize && Objects.equals(maxScannedFileSize, that.maxScannedFileSize) && readVersion == that.readVersion && - timeTravel == that.timeTravel; + timeTravel == that.timeTravel && + Objects.equals(extendedStatsFile, that.extendedStatsFile); } @Override @@ -383,6 +401,7 @@ public int hashCode() isOptimize, maxScannedFileSize, readVersion, - timeTravel); + timeTravel, + extendedStatsFile); } } diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/LocatedTableHandle.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/LocatedTableHandle.java index 177dfe1abf5e..cdf019c3d04d 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/LocatedTableHandle.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/LocatedTableHandle.java @@ -17,6 +17,8 @@ import io.trino.spi.connector.ConnectorTableHandle; import io.trino.spi.connector.SchemaTableName; +import java.util.Optional; + public interface LocatedTableHandle extends ConnectorTableHandle { @@ -27,4 +29,6 @@ public interface LocatedTableHandle String location(); VendedCredentialsHandle toCredentialsHandle(); + 
+ Optional extendedStatsFile(); } diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/procedure/DeltaTableOptimizeHandle.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/procedure/DeltaTableOptimizeHandle.java index 475e99257b06..8ad9568067ee 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/procedure/DeltaTableOptimizeHandle.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/procedure/DeltaTableOptimizeHandle.java @@ -41,6 +41,7 @@ public class DeltaTableOptimizeHandle private final boolean retriesEnabled; private final TupleDomain enforcedPartitionConstraint; private final VendedCredentialsHandle credentialsHandle; + private final Optional extendedStatsFile; @JsonCreator public DeltaTableOptimizeHandle( @@ -52,7 +53,8 @@ public DeltaTableOptimizeHandle( Optional currentVersion, boolean retriesEnabled, TupleDomain enforcedPartitionConstraint, - VendedCredentialsHandle credentialsHandle) + VendedCredentialsHandle credentialsHandle, + Optional extendedStatsFile) { this.metadataEntry = requireNonNull(metadataEntry, "metadataEntry is null"); this.protocolEntry = requireNonNull(protocolEntry, "protocolEntry is null"); @@ -63,6 +65,7 @@ public DeltaTableOptimizeHandle( this.retriesEnabled = retriesEnabled; this.enforcedPartitionConstraint = requireNonNull(enforcedPartitionConstraint, "enforcedPartitionConstraint is null"); this.credentialsHandle = requireNonNull(credentialsHandle, "credentialsHandle is null"); + this.extendedStatsFile = requireNonNull(extendedStatsFile, "extendedStatsFile is null"); } public DeltaTableOptimizeHandle withCurrentVersion(long currentVersion) @@ -77,7 +80,8 @@ public DeltaTableOptimizeHandle withCurrentVersion(long currentVersion) Optional.of(currentVersion), retriesEnabled, enforcedPartitionConstraint, - credentialsHandle); + credentialsHandle, + extendedStatsFile); } public DeltaTableOptimizeHandle withEnforcedPartitionConstraint(TupleDomain 
enforcedPartitionConstraint) @@ -91,7 +95,8 @@ public DeltaTableOptimizeHandle withEnforcedPartitionConstraint(TupleDomain getExtendedStatsFile() + { + return extendedStatsFile; + } } diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/procedure/DropExtendedStatsProcedure.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/procedure/DropExtendedStatsProcedure.java index 46c788f42a30..d99deee90528 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/procedure/DropExtendedStatsProcedure.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/procedure/DropExtendedStatsProcedure.java @@ -32,6 +32,7 @@ import java.util.Optional; import static io.trino.plugin.base.util.Procedures.checkProcedureArgument; +import static io.trino.plugin.deltalake.statistics.MetaDirStatisticsAccess.STATISTICS_FILE; import static io.trino.spi.StandardErrorCode.INVALID_PROCEDURE_ARGUMENT; import static io.trino.spi.type.VarcharType.VARCHAR; import static java.lang.String.format; @@ -88,7 +89,7 @@ public void dropStats(ConnectorSession session, ConnectorAccessControl accessCon throw new TrinoException(INVALID_PROCEDURE_ARGUMENT, format("Table '%s' does not exist", name)); } accessControl.checkCanInsertIntoTable(null, name); - statsAccess.deleteExtendedStatistics(session, name, tableHandle.location(), tableHandle.toCredentialsHandle()); + statsAccess.deleteExtendedStatistics(session, name, tableHandle.location(), tableHandle.extendedStatsFile().orElse(STATISTICS_FILE), tableHandle.toCredentialsHandle()); } } } diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/CachingExtendedStatisticsAccess.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/CachingExtendedStatisticsAccess.java index 58328f3c806b..46daea8efc0f 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/CachingExtendedStatisticsAccess.java +++ 
b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/CachingExtendedStatisticsAccess.java @@ -58,10 +58,15 @@ public CachingExtendedStatisticsAccess(@ForCachingExtendedStatisticsAccess Exten } @Override - public Optional readExtendedStatistics(ConnectorSession session, SchemaTableName schemaTableName, String tableLocation, VendedCredentialsHandle credentialsHandle) + public Optional readExtendedStatistics( + ConnectorSession session, + SchemaTableName schemaTableName, + String tableLocation, + String extendedStatsFile, + VendedCredentialsHandle credentialsHandle) { try { - return uncheckedCacheGet(cache, new CacheKey(schemaTableName, tableLocation), () -> delegate.readExtendedStatistics(session, schemaTableName, tableLocation, credentialsHandle)); + return uncheckedCacheGet(cache, new CacheKey(schemaTableName, tableLocation, extendedStatsFile), () -> delegate.readExtendedStatistics(session, schemaTableName, tableLocation, extendedStatsFile, credentialsHandle)); } catch (UncheckedExecutionException e) { throwIfInstanceOf(e.getCause(), TrinoException.class); @@ -70,17 +75,24 @@ public Optional readExtendedStatistics(ConnectorSession sess } @Override - public void updateExtendedStatistics(ConnectorSession session, SchemaTableName schemaTableName, String tableLocation, VendedCredentialsHandle credentialsHandle, ExtendedStatistics statistics) + public String updateExtendedStatistics( + ConnectorSession session, + SchemaTableName schemaTableName, + String tableLocation, + Optional previousExtendedStatsFile, + VendedCredentialsHandle credentialsHandle, + ExtendedStatistics statistics) { - delegate.updateExtendedStatistics(session, schemaTableName, tableLocation, credentialsHandle, statistics); - cache.invalidate(new CacheKey(schemaTableName, tableLocation)); + String extendedStatsFile = delegate.updateExtendedStatistics(session, schemaTableName, tableLocation, previousExtendedStatsFile, credentialsHandle, statistics); + // invalidate the key readers may have cached (the previous stats file); the new file was never cached + previousExtendedStatsFile.ifPresent(previousFile -> cache.invalidate(new 
CacheKey(schemaTableName, tableLocation, previousFile))); + return extendedStatsFile; } @Override - public void deleteExtendedStatistics(ConnectorSession session, SchemaTableName schemaTableName, String tableLocation, VendedCredentialsHandle credentialsHandle) + public void deleteExtendedStatistics(ConnectorSession session, SchemaTableName schemaTableName, String tableLocation, String extendedStatsFile, VendedCredentialsHandle credentialsHandle) { - delegate.deleteExtendedStatistics(session, schemaTableName, tableLocation, credentialsHandle); - cache.invalidate(new CacheKey(schemaTableName, tableLocation)); + delegate.deleteExtendedStatistics(session, schemaTableName, tableLocation, extendedStatsFile, credentialsHandle); + cache.invalidate(new CacheKey(schemaTableName, tableLocation, extendedStatsFile)); } public void invalidateCache() @@ -102,12 +114,13 @@ public void invalidateCache(SchemaTableName schemaTableName, Optional ta @BindingAnnotation public @interface ForCachingExtendedStatisticsAccess {} - private record CacheKey(SchemaTableName tableName, String location) + private record CacheKey(SchemaTableName tableName, String location, String extendedStatsFile) { CacheKey { requireNonNull(tableName, "tableName is null"); requireNonNull(location, "location is null"); + requireNonNull(extendedStatsFile, "extendedStatsFile is null"); } } } diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/ExtendedStatisticsAccess.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/ExtendedStatisticsAccess.java index 1edfee17e908..2d522358aaf0 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/ExtendedStatisticsAccess.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/ExtendedStatisticsAccess.java @@ -21,22 +21,43 @@ public interface ExtendedStatisticsAccess { + /** + * Reads the extended statistics for a table from the given stats file. 
+ * + * @return the extended statistics, or {@link Optional#empty()} if the file does not exist + */ Optional readExtendedStatistics( ConnectorSession session, SchemaTableName schemaTableName, String tableLocation, + String extendedStatsFile, VendedCredentialsHandle credentialsHandle); - void updateExtendedStatistics( + /** + * Writes the given extended statistics to a new file under the table's stats directory, + * then deletes the previous stats file if one existed. + *

+ * The caller must persist the returned filename in the + * commitInfo + * entry of the transaction log; otherwise the written statistics will not be discoverable on subsequent reads. + * + * @return the name of the newly written stats file + */ + String updateExtendedStatistics( ConnectorSession session, SchemaTableName schemaTableName, String tableLocation, + Optional previousExtendedStatsFile, VendedCredentialsHandle credentialsHandle, ExtendedStatistics statistics); + /** + * Deletes the extended statistics file for a table. No-op if the file does not exist. + */ void deleteExtendedStatistics( ConnectorSession session, SchemaTableName schemaTableName, String tableLocation, + String extendedStatsFile, VendedCredentialsHandle credentialsHandle); } diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/FileBasedTableStatisticsProvider.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/FileBasedTableStatisticsProvider.java index a92426382ec7..0ccb52722e80 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/FileBasedTableStatisticsProvider.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/FileBasedTableStatisticsProvider.java @@ -50,6 +50,7 @@ import static io.trino.plugin.deltalake.DeltaLakeMetadata.createStatisticsPredicate; import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.isExtendedStatisticsEnabled; import static io.trino.plugin.deltalake.DeltaLakeSplitManager.buildSplitPath; +import static io.trino.plugin.deltalake.statistics.MetaDirStatisticsAccess.STATISTICS_FILE; import static io.trino.plugin.deltalake.util.DeltaLakeDomains.fileModifiedTimeMatchesPredicate; import static io.trino.plugin.deltalake.util.DeltaLakeDomains.fileSizeMatchesPredicate; import static io.trino.plugin.deltalake.util.DeltaLakeDomains.getFileModifiedTimeDomain; @@ -212,7 +213,7 @@ public TableStatistics 
getTableStatistics(ConnectorSession session, DeltaLakeTab Optional statistics = Optional.empty(); if (isExtendedStatisticsEnabled(session)) { - statistics = statisticsAccess.readExtendedStatistics(session, tableHandle.getSchemaTableName(), tableHandle.getLocation(), tableHandle.toCredentialsHandle()); + statistics = statisticsAccess.readExtendedStatistics(session, tableHandle.getSchemaTableName(), tableHandle.getLocation(), tableHandle.extendedStatsFile().orElse(STATISTICS_FILE), tableHandle.toCredentialsHandle()); } for (DeltaLakeColumnHandle column : columns) { diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/MetaDirStatisticsAccess.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/MetaDirStatisticsAccess.java index cbd00e206776..6da2de6a1d86 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/MetaDirStatisticsAccess.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/MetaDirStatisticsAccess.java @@ -29,6 +29,7 @@ import java.io.IOException; import java.io.InputStream; import java.util.Optional; +import java.util.UUID; import static io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_FILESYSTEM_ERROR; import static io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.TRANSACTION_LOG_DIRECTORY; @@ -40,10 +41,12 @@ public class MetaDirStatisticsAccess implements ExtendedStatisticsAccess { public static final String STATISTICS_META_DIR = TRANSACTION_LOG_DIRECTORY + "/_trino_meta"; // store inside TL directory so it is not deleted by VACUUM - private static final String STATISTICS_FILE = "extended_stats.json"; + // For backward compatibility + @Deprecated + public static final String STATISTICS_FILE = "extended_stats.json"; + @Deprecated public static final String STARBURST_META_DIR = TRANSACTION_LOG_DIRECTORY + "/_starburst_meta"; - private static final String STARBURST_STATISTICS_FILE = "extendeded_stats.json"; 
private final DeltaLakeFileSystemFactory fileSystemFactory; private final JsonCodec statisticsCodec; @@ -62,22 +65,11 @@ public Optional readExtendedStatistics( ConnectorSession session, SchemaTableName schemaTableName, String tableLocation, + String extendedStatsFile, VendedCredentialsHandle credentialsHandle) { - Location location = Location.of(tableLocation); - return readExtendedStatistics(session, location, credentialsHandle, STATISTICS_META_DIR, STATISTICS_FILE) - .or(() -> readExtendedStatistics(session, location, credentialsHandle, STARBURST_META_DIR, STARBURST_STATISTICS_FILE)); - } - - private Optional readExtendedStatistics( - ConnectorSession session, - Location tableLocation, - VendedCredentialsHandle credentialsHandle, - String statisticsDirectory, - String statisticsFile) - { + Location statisticsPath = resolveStatsFileLocation(tableLocation, extendedStatsFile); try { - Location statisticsPath = tableLocation.appendPath(statisticsDirectory).appendPath(statisticsFile); TrinoInputFile inputFile = fileSystemFactory.create(session, credentialsHandle).newInputFile(statisticsPath); try (InputStream inputStream = inputFile.newStream()) { return Optional.of(decodeAndRethrowIfNotFound(statisticsCodec, inputStream)); @@ -92,24 +84,23 @@ private Optional readExtendedStatistics( } @Override - public void updateExtendedStatistics( + public String updateExtendedStatistics( ConnectorSession session, SchemaTableName schemaTableName, String tableLocation, + Optional previousStatsFile, VendedCredentialsHandle credentialsHandle, ExtendedStatistics statistics) { + String currentStatsFile = UUID.randomUUID().toString() + ".extended_stats.json"; try { - Location statisticsPath = Location.of(tableLocation).appendPath(STATISTICS_META_DIR).appendPath(STATISTICS_FILE); + Location statisticsPath = resolveStatsFileLocation(tableLocation, currentStatsFile); TrinoFileSystem fileSystem = fileSystemFactory.create(session, credentialsHandle); 
fileSystem.newOutputFile(statisticsPath).createOrOverwrite(statisticsCodec.toJsonBytes(statistics)); - // Remove outdated Starburst stats file, if it exists. - Location starburstStatisticsPath = Location.of(tableLocation).appendPath(STARBURST_META_DIR).appendPath(STARBURST_STATISTICS_FILE); - if (fileSystem.newInputFile(starburstStatisticsPath).exists()) { - fileSystem.deleteFile(starburstStatisticsPath); - } + previousStatsFile.ifPresent(previousFile -> deleteExtendedStatistics(session, schemaTableName, tableLocation, previousFile, credentialsHandle)); + return currentStatsFile; } catch (IOException e) { throw new TrinoException(DELTA_LAKE_FILESYSTEM_ERROR, "Failed to store statistics with table location: " + tableLocation, e); @@ -117,9 +108,9 @@ public void updateExtendedStatistics( } @Override - public void deleteExtendedStatistics(ConnectorSession session, SchemaTableName schemaTableName, String tableLocation, VendedCredentialsHandle credentialsHandle) + public void deleteExtendedStatistics(ConnectorSession session, SchemaTableName schemaTableName, String tableLocation, String extendedStatsFile, VendedCredentialsHandle credentialsHandle) { - Location statisticsPath = Location.of(tableLocation).appendPath(STATISTICS_META_DIR).appendPath(STATISTICS_FILE); + Location statisticsPath = resolveStatsFileLocation(tableLocation, extendedStatsFile); try { TrinoFileSystem fileSystem = fileSystemFactory.create(session, credentialsHandle); if (fileSystem.newInputFile(statisticsPath).exists()) { @@ -142,4 +133,9 @@ public static T decodeAndRethrowIfNotFound(JsonCodec codec, InputStream s throw new RuntimeException("Failed to decode JSON", e); } } + + private static Location resolveStatsFileLocation(String tableLocation, String statsFile) + { + return Location.of(tableLocation).appendPath(STATISTICS_META_DIR).appendPath(statsFile); + } } diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/CommitInfoEntry.java 
b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/CommitInfoEntry.java index 2e9e29194c74..3433ce636d36 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/CommitInfoEntry.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/CommitInfoEntry.java @@ -39,7 +39,8 @@ public record CommitInfoEntry( long readVersion, String isolationLevel, Optional isBlindAppend, - Map operationMetrics) + Map operationMetrics, + Optional extendedStatsFile) { private static final int INSTANCE_SIZE = instanceSize(CommitInfoEntry.class); @@ -50,7 +51,7 @@ public record CommitInfoEntry( public CommitInfoEntry withVersion(long version) { - return new CommitInfoEntry(version, inCommitTimestamp, timestamp, userId, userName, operation, operationParameters, job, notebook, clusterId, readVersion, isolationLevel, isBlindAppend, operationMetrics); + return new CommitInfoEntry(version, inCommitTimestamp, timestamp, userId, userName, operation, operationParameters, job, notebook, clusterId, readVersion, isolationLevel, isBlindAppend, operationMetrics, extendedStatsFile); } public long getRetainedSizeInBytes() @@ -69,7 +70,8 @@ public long getRetainedSizeInBytes() + SIZE_OF_LONG + estimatedSizeOf(isolationLevel) + sizeOf(isBlindAppend, SizeOf::sizeOf) - + estimatedSizeOf(operationMetrics, SizeOf::estimatedSizeOf, SizeOf::estimatedSizeOf); + + estimatedSizeOf(operationMetrics, SizeOf::estimatedSizeOf, SizeOf::estimatedSizeOf) + + sizeOf(extendedStatsFile, SizeOf::estimatedSizeOf); } public record Job(String jobId, String jobName, String runId, String jobOwnerId, String triggerType) diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/MetadataAndProtocolEntries.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/MetadataAndProtocolEntries.java index 707686f4bcba..82fa7c11fbb8 100644 --- 
a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/MetadataAndProtocolEntries.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/MetadataAndProtocolEntries.java @@ -18,7 +18,7 @@ import static io.airlift.slice.SizeOf.instanceSize; import static io.airlift.slice.SizeOf.sizeOf; -public record MetadataAndProtocolEntries(Optional metadata, Optional protocol) +public record MetadataAndProtocolEntries(Optional metadata, Optional protocol, Optional commitInfo) { private static final int INSTANCE_SIZE = instanceSize(MetadataAndProtocolEntries.class); @@ -26,7 +26,8 @@ public long getRetainedSizeInBytes() { return INSTANCE_SIZE + sizeOf(metadata, MetadataEntry::getRetainedSizeInBytes) - + sizeOf(protocol, ProtocolEntry::getRetainedSizeInBytes); + + sizeOf(protocol, ProtocolEntry::getRetainedSizeInBytes) + + sizeOf(commitInfo, CommitInfoEntry::getRetainedSizeInBytes); } public static Builder builder() @@ -38,6 +39,7 @@ public static final class Builder { private Optional metadataEntry = Optional.empty(); private Optional protocolEntry = Optional.empty(); + private Optional commitInfoEntry = Optional.empty(); public boolean hasMetadata() { @@ -49,9 +51,14 @@ public boolean hasProtocol() return protocolEntry.isPresent(); } + public boolean hasCommitInfo() + { + return commitInfoEntry.isPresent(); + } + public boolean isFull() { - return hasMetadata() && hasProtocol(); + return hasMetadata() && hasProtocol() && hasCommitInfo(); } public Builder withMetadataEntry(MetadataEntry metadataEntry) @@ -66,6 +73,12 @@ public Builder withProtocolEntry(ProtocolEntry protocolEntry) return this; } + public Builder withCommitInfo(CommitInfoEntry commitInfoEntry) + { + this.commitInfoEntry = Optional.of(commitInfoEntry); + return this; + } + public Builder withEntries(MetadataAndProtocolEntries entries) { if (!hasMetadata() && entries.metadata().isPresent()) { @@ -74,6 +87,9 @@ public Builder 
withEntries(MetadataAndProtocolEntries entries) if (!hasProtocol() && entries.protocol().isPresent()) { withProtocolEntry(entries.protocol().get()); } + if (!hasCommitInfo() && entries.commitInfo().isPresent()) { + withCommitInfo(entries.commitInfo().get()); + } return this; } @@ -85,12 +101,15 @@ public Builder withTransactionLogEntry(DeltaLakeTransactionLogEntry transactionL if (protocolEntry.isEmpty() && transactionLogEntry.getProtocol() != null) { withProtocolEntry(transactionLogEntry.getProtocol()); } + if (commitInfoEntry.isEmpty() && transactionLogEntry.getCommitInfo() != null) { + withCommitInfo(transactionLogEntry.getCommitInfo()); + } return this; } public MetadataAndProtocolEntries build() { - return new MetadataAndProtocolEntries(metadataEntry, protocolEntry); + return new MetadataAndProtocolEntries(metadataEntry, protocolEntry, commitInfoEntry); } } } diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java index 42dd21452c3f..738dc4984b9e 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogAccess.java @@ -449,7 +449,7 @@ private Stream activeAddEntries(Stream activeJsonEntries = new LinkedHashMap<>(); HashSet removedFiles = new HashSet<>(); - // The json entries containing the last few entries in the log need to be applied on top of the parquet snapshot: + // The JSON entries containing the last few entries in the log need to be applied on top of the parquet snapshot: // - Any files which have been removed need to be excluded // - Any files with newer add actions need to be updated with the most recent metadata transactions.forEach(transaction -> { @@ -509,15 +509,25 @@ public MetadataAndProtocolEntries 
getMetadataAndProtocolEntry(ConnectorSession s return entries; } + tryUpdateCommitInfoIfPossible(builder, fileSystem, tableSnapshot); + + Set entryTypes = resolveCheckpointReadTypes(builder); + if (entryTypes.isEmpty()) { + MetadataAndProtocolEntries entries = builder.build(); + tableSnapshot.updateCachedEntries(entries); + return entries; + } + MetadataAndProtocolEntries checkpointEntries = getCheckpointEntry( session, tableSnapshot, - resolveCheckpointReadTypes(builder), + entryTypes, checkpointStream -> { for (Iterator it = checkpointStream.iterator(); it.hasNext(); ) { DeltaLakeTransactionLogEntry transactionLogEntry = it.next(); builder.withTransactionLogEntry(transactionLogEntry); - if (builder.isFull()) { + // checkpoints don't contain a commitInfo entry + if (builder.hasMetadata() && builder.hasProtocol()) { return Optional.of(builder.build()); } } @@ -547,6 +557,31 @@ private static Set resolveCheckpointReadTypes return typesForRead.build(); } + private void tryUpdateCommitInfoIfPossible(MetadataAndProtocolEntries.Builder builder, TrinoFileSystem fileSystem, TableSnapshot tableSnapshot) + { + if (builder.hasCommitInfo()) { + return; + } + + // the transaction log tail has already been searched and no commitInfo was found + if (!tableSnapshot.getTransactions().isEmpty()) { + return; + } + + // no transaction log entries were scanned, probably because there are no transactions in the log (table was just created), + // or logs exist but a checkpoint was just written and checkpoints don't persist the commitInfo; in these cases we should + // look at the latest log file to try to find the commitInfo. 
+ String tableLocation = tableSnapshot.getTableLocation(); + String transactionLogDir = getTransactionLogDir(tableLocation); + try (Stream entryStream = getJsonEntries(fileSystem, transactionLogDir, ImmutableList.of(tableSnapshot.getVersion()))) { + // TODO: optimize the logic for inCommitTimestamp https://github.com/delta-io/delta/blob/master/PROTOCOL.md#writer-requirements-for-in-commit-timestamps + entryStream.map(DeltaLakeTransactionLogEntry::getCommitInfo) + .filter(Objects::nonNull) + .findFirst() + .ifPresent(builder::withCommitInfo); + } + } + private Optional getCheckpointEntry( ConnectorSession session, TableSnapshot tableSnapshot, diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheFileOperations.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheFileOperations.java index 480ed8e57972..628261ca8fb0 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheFileOperations.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheFileOperations.java @@ -84,15 +84,15 @@ public void testCacheFileOperations() assertFileSystemAccesses( "SELECT * FROM test_cache_file_operations", ImmutableMultiset.builder() - .add(new CacheOperation("Alluxio.readCached", "00000000000000000000.json", 0, 816)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000000.json", 0, 858)) .add(new CacheOperation("InputFile.length", "00000000000000000000.json")) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 658)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 737)) .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 658)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 737)) .add(new 
CacheOperation("InputFile.length", "00000000000000000002.json")) - .add(new CacheOperation("Alluxio.readExternalStream", "00000000000000000002.json", 0, 658)) + .add(new CacheOperation("Alluxio.readExternalStream", "00000000000000000002.json", 0, 737)) .add(new CacheOperation("InputFile.newStream", "00000000000000000002.json")) - .add(new CacheOperation("Alluxio.writeCache", "00000000000000000002.json", 0, 658)) + .add(new CacheOperation("Alluxio.writeCache", "00000000000000000002.json", 0, 737)) .add(new CacheOperation("InputFile.length", "00000000000000000003.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) .add(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 229)) @@ -105,11 +105,11 @@ public void testCacheFileOperations() assertFileSystemAccesses( "SELECT * FROM test_cache_file_operations", ImmutableMultiset.builder() - .add(new CacheOperation("Alluxio.readCached", "00000000000000000000.json", 0, 816)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000000.json", 0, 858)) .add(new CacheOperation("InputFile.length", "00000000000000000000.json")) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 658)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 737)) .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 658)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 737)) .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) .add(new CacheOperation("InputFile.length", "00000000000000000003.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) @@ -122,19 +122,19 @@ public void testCacheFileOperations() assertFileSystemAccesses( "SELECT * FROM test_cache_file_operations", ImmutableMultiset.builder() - .add(new CacheOperation("Alluxio.readCached", "00000000000000000000.json", 0, 816)) + 
.add(new CacheOperation("Alluxio.readCached", "00000000000000000000.json", 0, 858)) .add(new CacheOperation("InputFile.length", "00000000000000000000.json")) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 658)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 737)) .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 658)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 737)) .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000003.json", 0, 658)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000003.json", 0, 737)) .add(new CacheOperation("InputFile.length", "00000000000000000003.json")) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000004.json", 0, 658)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000004.json", 0, 737)) .add(new CacheOperation("InputFile.length", "00000000000000000004.json")) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000005.json", 0, 658)) - .add(new CacheOperation("Alluxio.readExternalStream", "00000000000000000005.json", 0, 658)) - .add(new CacheOperation("Alluxio.writeCache", "00000000000000000005.json", 0, 658)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000005.json", 0, 737)) + .add(new CacheOperation("Alluxio.readExternalStream", "00000000000000000005.json", 0, 737)) + .add(new CacheOperation("Alluxio.writeCache", "00000000000000000005.json", 0, 737)) .add(new CacheOperation("InputFile.newStream", "00000000000000000005.json")) .add(new CacheOperation("InputFile.length", "00000000000000000005.json")) .add(new CacheOperation("InputFile.length", "00000000000000000006.json")) @@ -154,17 +154,17 @@ public void testCacheFileOperations() assertFileSystemAccesses( "SELECT * FROM 
test_cache_file_operations", ImmutableMultiset.builder() - .add(new CacheOperation("Alluxio.readCached", "00000000000000000000.json", 0, 816)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000000.json", 0, 858)) .add(new CacheOperation("InputFile.length", "00000000000000000000.json")) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 658)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 737)) .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 658)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 737)) .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000003.json", 0, 658)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000003.json", 0, 737)) .add(new CacheOperation("InputFile.length", "00000000000000000003.json")) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000004.json", 0, 658)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000004.json", 0, 737)) .add(new CacheOperation("InputFile.length", "00000000000000000004.json")) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000005.json", 0, 658)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000005.json", 0, 737)) .add(new CacheOperation("InputFile.length", "00000000000000000005.json")) .add(new CacheOperation("InputFile.length", "00000000000000000006.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) @@ -186,6 +186,11 @@ public void testCacheCheckpointAndExtendedStatsFileOperations() ImmutableMultiset.builder() .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000003.checkpoint.parquet", 0, 7077), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000003.checkpoint.parquet"), 2) + 
.add(new CacheOperation("Alluxio.readExternalStream", "00000000000000000003.json", 0, 722)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000003.json", 0, 722)) + .add(new CacheOperation("Alluxio.writeCache", "00000000000000000003.json", 0, 722)) + .add(new CacheOperation("InputFile.length", "00000000000000000003.json")) + .add(new CacheOperation("InputFile.newStream", "00000000000000000003.json")) .add(new CacheOperation("InputFile.length", "00000000000000000004.json")) .addAll(Stream.of("int_part=10/string_part=part1/", "int_part=20/string_part=part2/", "int_part=__HIVE_DEFAULT_PARTITION__/string_part=__HIVE_DEFAULT_PARTITION__/") .flatMap(fileId -> @@ -201,6 +206,8 @@ public void testCacheCheckpointAndExtendedStatsFileOperations() ImmutableMultiset.builder() .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000003.checkpoint.parquet", 0, 7077), 3) .addCopies(new CacheOperation("InputFile.length", "00000000000000000003.checkpoint.parquet"), 3) + .add(new CacheOperation("InputFile.length", "00000000000000000003.json")) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000003.json", 0, 722)) .add(new CacheOperation("InputFile.length", "00000000000000000004.json")) .addAll(Stream.of("int_part=10/string_part=part1/", "int_part=20/string_part=part2/", "int_part=__HIVE_DEFAULT_PARTITION__/string_part=__HIVE_DEFAULT_PARTITION__/") .flatMap(fileId -> Stream.of(new CacheOperation("Alluxio.readCached", fileId, 0, 199))) @@ -248,7 +255,6 @@ public void testCacheDeletionVectorsFileOperations() .add(new CacheOperation("Alluxio.readCached", "deletion_vector", 1, 42)) .add(new CacheOperation("InputFile.length", "deletion_vector")) .add(new CacheOperation("InputFile.newStream", "extended_stats.json")) - .add(new CacheOperation("InputFile.newStream", "extendeded_stats.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) .build()); } @@ -323,6 +329,11 @@ public void testTimeTravelWithLastCheckpoint() 
ImmutableMultiset.builder() .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000002.checkpoint.parquet", 0, 5884), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) + .add(new CacheOperation("Alluxio.writeCache", "00000000000000000002.json", 0, 613)) + .add(new CacheOperation("Alluxio.readExternalStream", "00000000000000000002.json", 0, 613)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 613)) + .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) + .add(new CacheOperation("InputFile.newStream", "00000000000000000002.json")) .add(new CacheOperation("InputFile.exists", "00000000000000000002.json")) .add(new CacheOperation("Input.readFully", "data", 0, 199)) .add(new CacheOperation("Alluxio.writeCache", "data", 0, 199)) @@ -335,6 +346,8 @@ public void testTimeTravelWithLastCheckpoint() .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000002.checkpoint.parquet", 0, 5884), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) .addCopies(new CacheOperation("Alluxio.readCached", "data", 0, 199), 3) + .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 613)) .add(new CacheOperation("InputFile.exists", "00000000000000000002.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) .build()); @@ -357,11 +370,8 @@ public void testTimeTravelWithLastCheckpointUsingTemporalVersion() .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 613), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000001.json"), 3) .addCopies(new CacheOperation("InputFile.newStream", "_last_checkpoint"), 2) - .add(new CacheOperation("Alluxio.writeCache", "00000000000000000002.json", 0, 613)) .add(new CacheOperation("Alluxio.readCached", 
"00000000000000000002.json", 0, 613)) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.json"), 2) - .add(new CacheOperation("Alluxio.readExternalStream", "00000000000000000002.json", 0, 613)) - .add(new CacheOperation("InputFile.newStream", "00000000000000000002.json")) .addCopies(new CacheOperation("Alluxio.readCached", "data", 0, 199), 2) .build()); assertFileSystemAccesses( @@ -371,8 +381,8 @@ public void testTimeTravelWithLastCheckpointUsingTemporalVersion() .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) .addCopies(new CacheOperation("Alluxio.readCached", "data", 0, 199), 3) .addCopies(new CacheOperation("InputFile.newStream", "_last_checkpoint"), 2) - .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.json"), 2) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 613)) + .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.json"), 3) + .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 613), 2) .build()); assertFileSystemAccesses( "SELECT * FROM time_travel_with_last_checkpoint_using_temporal FOR TIMESTAMP AS OF TIMESTAMP '2024-03-13 04:33:19.342 UTC'", // version 2 @@ -381,8 +391,8 @@ public void testTimeTravelWithLastCheckpointUsingTemporalVersion() .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) .addCopies(new CacheOperation("Alluxio.readCached", "data", 0, 199), 3) .addCopies(new CacheOperation("InputFile.newStream", "_last_checkpoint"), 2) - .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.json"), 2) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 613)) + .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.json"), 3) + .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 613), 2) .build()); assertUpdate("DROP TABLE 
time_travel_with_last_checkpoint_using_temporal"); @@ -421,6 +431,8 @@ public void testTimeTravelWithoutLastCheckpoint() .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000002.checkpoint.parquet", 0, 5884), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) .add(new CacheOperation("InputFile.exists", "00000000000000000002.json")) + .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 613)) .add(new CacheOperation("Input.readFully", "data", 0, 199)) .add(new CacheOperation("Alluxio.writeCache", "data", 0, 199)) .addCopies(new CacheOperation("Alluxio.readCached", "data", 0, 199), 3) @@ -432,6 +444,8 @@ public void testTimeTravelWithoutLastCheckpoint() .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000002.checkpoint.parquet", 0, 5884), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) .add(new CacheOperation("InputFile.exists", "00000000000000000002.json")) + .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 613)) .addCopies(new CacheOperation("Alluxio.readCached", "data", 0, 199), 3) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) .build()); @@ -477,6 +491,8 @@ public void testTimeTravelWithoutLastCheckpointUsingTemporal() .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 613)) .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) + .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 613)) .add(new CacheOperation("Input.readFully", "data", 0, 199)) .add(new 
CacheOperation("Alluxio.writeCache", "data", 0, 199)) .addCopies(new CacheOperation("Alluxio.readCached", "data", 0, 199), 3) @@ -503,8 +519,8 @@ public void testTimeTravelWithoutLastCheckpointUsingTemporal() .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) .addCopies(new CacheOperation("Alluxio.readCached", "data", 0, 199), 3) .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000002.checkpoint.parquet", 0, 5884), 2) - .add(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 613)) - .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.json"), 2) + .addCopies(new CacheOperation("Alluxio.readCached", "00000000000000000002.json", 0, 613), 2) + .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.json"), 3) .addCopies(new CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) .addCopies(new CacheOperation("InputFile.length", "00000000000000000003.json"), 2) .add(new CacheOperation("Alluxio.readCached", "00000000000000000003.json", 0, 613)) @@ -539,6 +555,11 @@ public void testReadV2CheckpointJson() .add(new CacheOperation("Input.readFully", "00000000000000000001.checkpoint.0000000001.0000000001.90cf4e21-dbaa-41d6-8ae5-6709cfbfbfe0.parquet", 0, 9176)) .add(new CacheOperation("Alluxio.writeCache", "00000000000000000001.checkpoint.0000000001.0000000001.90cf4e21-dbaa-41d6-8ae5-6709cfbfbfe0.parquet", 0, 9176)) .add(new CacheOperation("InputFile.length", "00000000000000000001.checkpoint.0000000001.0000000001.90cf4e21-dbaa-41d6-8ae5-6709cfbfbfe0.parquet")) + .add(new CacheOperation("Alluxio.writeCache", "00000000000000000001.json", 0, 663)) + .add(new CacheOperation("Alluxio.readExternalStream", "00000000000000000001.json", 0, 663)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 663)) + .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) + .add(new CacheOperation("InputFile.newStream", 
"00000000000000000001.json")) .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) .add(new CacheOperation("Alluxio.writeCache", "data", 0, 666)) @@ -552,6 +573,8 @@ public void testReadV2CheckpointJson() .addCopies(new CacheOperation("InputFile.length", "00000000000000000001.checkpoint.73a4ddb8-2bfc-40d8-b09f-1b6a0abdfb04.json"), 2) .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.checkpoint.0000000001.0000000001.90cf4e21-dbaa-41d6-8ae5-6709cfbfbfe0.parquet", 0, 9176)) .add(new CacheOperation("InputFile.length", "00000000000000000001.checkpoint.0000000001.0000000001.90cf4e21-dbaa-41d6-8ae5-6709cfbfbfe0.parquet")) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 663)) + .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) .add(new CacheOperation("Alluxio.readCached", "data", 0, 666)) @@ -583,6 +606,11 @@ public void testReadV2CheckpointParquet() .add(new CacheOperation("InputFile.length", "00000000000000000001.checkpoint.0000000001.0000000001.03288d7e-af16-44ed-829c-196064a71812.parquet")) .add(new CacheOperation("Input.readFully", "00000000000000000001.checkpoint.0000000001.0000000001.03288d7e-af16-44ed-829c-196064a71812.parquet", 0, 9415)) .add(new CacheOperation("Alluxio.writeCache", "00000000000000000001.checkpoint.0000000001.0000000001.03288d7e-af16-44ed-829c-196064a71812.parquet", 0, 9415)) + .add(new CacheOperation("Alluxio.writeCache", "00000000000000000001.json", 0, 663)) + .add(new CacheOperation("Alluxio.readExternalStream", "00000000000000000001.json", 0, 663)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 663)) + .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) + .add(new 
CacheOperation("InputFile.newStream", "00000000000000000001.json")) .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) .add(new CacheOperation("Alluxio.writeCache", "data", 0, 666)) .add(new CacheOperation("Alluxio.readCached", "data", 0, 666)) @@ -597,6 +625,8 @@ public void testReadV2CheckpointParquet() .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.checkpoint.0000000001.0000000001.03288d7e-af16-44ed-829c-196064a71812.parquet", 0, 9415)) .add(new CacheOperation("InputFile.length", "00000000000000000001.checkpoint.0000000001.0000000001.03288d7e-af16-44ed-829c-196064a71812.parquet")) .add(new CacheOperation("InputFile.length", "00000000000000000002.json")) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000001.json", 0, 663)) + .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) .add(new CacheOperation("Alluxio.readCached", "data", 0, 666)) .build()); @@ -609,10 +639,10 @@ public void testCreateOrReplaceTable() assertFileSystemAccesses("CREATE OR REPLACE TABLE test_create_or_replace (id VARCHAR, age INT)", ImmutableMultiset.builder() - .add(new CacheOperation("Alluxio.readCached", "00000000000000000000.json", 0, 821)) - .add(new CacheOperation("Alluxio.readExternalStream", "00000000000000000000.json", 0, 821)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000000.json", 0, 863)) + .add(new CacheOperation("Alluxio.readExternalStream", "00000000000000000000.json", 0, 863)) .add(new CacheOperation("InputFile.newStream", "00000000000000000000.json")) - .add(new CacheOperation("Alluxio.writeCache", "00000000000000000000.json", 0, 821)) + .add(new CacheOperation("Alluxio.writeCache", "00000000000000000000.json", 0, 863)) .add(new CacheOperation("InputFile.length", "00000000000000000000.json")) .add(new CacheOperation("InputFile.exists", "00000000000000000001.json")) .add(new CacheOperation("InputFile.length", 
"00000000000000000001.json")) @@ -628,22 +658,19 @@ public void testCreateOrReplaceTableAsSelect() assertFileSystemAccesses( "CREATE OR REPLACE TABLE test_create_or_replace_as_select AS SELECT 1 col_name", ImmutableMultiset.builder() - .add(new CacheOperation("InputFile.exists", "extendeded_stats.json")) - .add(new CacheOperation("InputFile.newStream", "extendeded_stats.json")) .add(new CacheOperation("InputFile.newStream", "extended_stats.json")) .build()); assertFileSystemAccesses( "CREATE OR REPLACE TABLE test_create_or_replace_as_select AS SELECT 1 col_name", ImmutableMultiset.builder() - .add(new CacheOperation("Alluxio.readCached", "00000000000000000000.json", 0, 1063)) - .add(new CacheOperation("Alluxio.readExternalStream", "00000000000000000000.json", 0, 1063)) - .add(new CacheOperation("Alluxio.writeCache", "00000000000000000000.json", 0, 1063)) + .add(new CacheOperation("Alluxio.readCached", "00000000000000000000.json", 0, 1142)) + .add(new CacheOperation("Alluxio.readExternalStream", "00000000000000000000.json", 0, 1142)) + .add(new CacheOperation("Alluxio.writeCache", "00000000000000000000.json", 0, 1142)) .add(new CacheOperation("InputFile.newStream", "00000000000000000000.json")) .add(new CacheOperation("InputFile.length", "00000000000000000000.json")) .add(new CacheOperation("InputFile.length", "00000000000000000001.json")) .add(new CacheOperation("InputFile.exists", "00000000000000000001.json")) - .add(new CacheOperation("InputFile.exists", "extendeded_stats.json")) .add(new CacheOperation("InputFile.newStream", "extended_stats.json")) .add(new CacheOperation("InputFile.newStream", "_last_checkpoint")) .build()); diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeMetadata.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeMetadata.java index e96e7afc6087..159d224588d6 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeMetadata.java +++ 
b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeMetadata.java @@ -499,7 +499,8 @@ private static DeltaLakeTableHandle createDeltaLakeTableHandle(Set createConstrainedColumnsTuple( diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeNodeLocalDynamicSplitPruning.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeNodeLocalDynamicSplitPruning.java index ffa215cbb70d..5d4c2dbbd29d 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeNodeLocalDynamicSplitPruning.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeNodeLocalDynamicSplitPruning.java @@ -146,7 +146,8 @@ public void testDynamicSplitPruningOnUnpartitionedTable() Optional.of(Set.of(keyColumnHandle, dataColumnHandle)), Optional.empty(), 0, - false), + false, + Optional.empty()), transaction); TupleDomain splitPruningPredicate = TupleDomain.withColumnDomains( @@ -244,7 +245,8 @@ public void testDynamicSplitPruningWithExplicitPartitionFilter() Optional.of(Set.of(dateColumnHandle, receiptColumnHandle, amountColumnHandle)), Optional.empty(), 0, - false), + false, + Optional.empty()), transaction); // Simulate situations where the dynamic filter (e.g.: while performing a JOIN with another table) reduces considerably diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeSplitManager.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeSplitManager.java index 5d99949ee86a..85c9a635fbff 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeSplitManager.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeSplitManager.java @@ -99,7 +99,8 @@ public class TestDeltaLakeSplitManager Optional.empty(), Optional.empty(), 0, - false); + false, + Optional.empty()); private final HiveTransactionHandle transactionHandle = 
new HiveTransactionHandle(true); @Test diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java index 21b2e184316d..c0e6e6f02668 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestTransactionLogAccess.java @@ -627,7 +627,8 @@ public void testIncrementalCacheUpdates() Optional.empty(), Optional.empty(), 0, - false), + false, + Optional.empty()), updatedTableSnapshot, TupleDomain.all(), alwaysTrue())) { diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestingDeltaLakeUtils.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestingDeltaLakeUtils.java index e02b5bbced30..2e3dcaebb540 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestingDeltaLakeUtils.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestingDeltaLakeUtils.java @@ -71,7 +71,8 @@ public static DeltaLakeTableHandle createTable(SchemaTableName schemaTableName, Optional.empty(), Optional.empty(), 0, - false); + false, + Optional.empty()); } public static DeltaLakeTableHandle createTable(MetadataEntry metadataEntry, ProtocolEntry protocolEntry) @@ -89,7 +90,8 @@ public static DeltaLakeTableHandle createTable(MetadataEntry metadataEntry, Prot Optional.empty(), Optional.empty(), 0, - false); + false, + Optional.empty()); } public static List getTableActiveFiles(TransactionLogAccess transactionLogAccess, TrinoFileSystemFactory fileSystemFactory, String tableLocation) diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/statistics/TestDeltaLakeFileBasedTableStatisticsProvider.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/statistics/TestDeltaLakeFileBasedTableStatisticsProvider.java index 
5727ce4a4724..0aca4061de00 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/statistics/TestDeltaLakeFileBasedTableStatisticsProvider.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/statistics/TestDeltaLakeFileBasedTableStatisticsProvider.java @@ -138,7 +138,8 @@ private DeltaLakeTableHandle registerTable(String tableName, String directoryNam Optional.empty(), Optional.empty(), 0, - false); + false, + Optional.empty()); } @Test @@ -268,7 +269,8 @@ public void testStatisticsMultipleFiles() tableHandle.getProjectedColumns(), tableHandle.getAnalyzeHandle(), 0, - tableHandle.isTimeTravel()); + tableHandle.isTimeTravel(), + tableHandle.extendedStatsFile()); stats = getTableStatistics(SESSION, tableHandleWithUnenforcedConstraint); columnStatistics = stats.getColumnStatistics().get(COLUMN_HANDLE); assertThat(columnStatistics.getRange().get().getMin()).isEqualTo(0.0); @@ -292,7 +294,8 @@ public void testStatisticsNoRecords() tableHandle.getProjectedColumns(), tableHandle.getAnalyzeHandle(), 0, - tableHandle.isTimeTravel()); + tableHandle.isTimeTravel(), + tableHandle.extendedStatsFile()); DeltaLakeTableHandle tableHandleWithNoneUnenforcedConstraint = new DeltaLakeTableHandle( tableHandle.getSchemaName(), tableHandle.getTableName(), @@ -306,7 +309,8 @@ public void testStatisticsNoRecords() tableHandle.getProjectedColumns(), tableHandle.getAnalyzeHandle(), 0, - tableHandle.isTimeTravel()); + tableHandle.isTimeTravel(), + tableHandle.extendedStatsFile()); // If either the table handle's constraint or the provided Constraint are none, it will cause a 0 record count to be reported assertEmptyStats(getTableStatistics(SESSION, tableHandleWithNoneEnforcedConstraint)); assertEmptyStats(getTableStatistics(SESSION, tableHandleWithNoneUnenforcedConstraint)); @@ -491,6 +495,6 @@ private Optional readExtendedStatisticsFromTableResource(Str { SchemaTableName name = new SchemaTableName("some_ignored_schema", "some_ignored_name"); 
String tableLocation = Resources.getResource(tableLocationResourceName).toExternalForm(); - return statistics.readExtendedStatistics(SESSION, name, tableLocation, VendedCredentialsHandle.empty(tableLocation)); + return statistics.readExtendedStatistics(SESSION, name, tableLocation, "extended_stats.json", VendedCredentialsHandle.empty(tableLocation)); } } diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/statistics/TestMetaDirStatisticsAccess.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/statistics/TestMetaDirStatisticsAccess.java new file mode 100644 index 000000000000..41d25dc37fa9 --- /dev/null +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/statistics/TestMetaDirStatisticsAccess.java @@ -0,0 +1,184 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.deltalake.statistics; + +import com.google.common.collect.ImmutableSet; +import io.airlift.json.JsonCodecFactory; +import io.trino.filesystem.local.LocalFileSystemFactory; +import io.trino.plugin.deltalake.DefaultDeltaLakeFileSystemFactory; +import io.trino.plugin.deltalake.metastore.NoOpVendedCredentialsProvider; +import io.trino.plugin.deltalake.metastore.VendedCredentialsHandle; +import io.trino.spi.connector.SchemaTableName; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static io.trino.plugin.deltalake.DeltaTestingConnectorSession.SESSION; +import static io.trino.plugin.deltalake.statistics.MetaDirStatisticsAccess.STATISTICS_META_DIR; +import static org.assertj.core.api.Assertions.assertThat; + +final class TestMetaDirStatisticsAccess +{ + private static final SchemaTableName TABLE_NAME = new SchemaTableName("test_schema", "test_table"); + private static final String TABLE_LOCATION = "local:///test_extended_stats"; + + @TempDir + Path tempDir; + + @Test + void testReadMissingFileReturnsEmpty() + { + MetaDirStatisticsAccess access = createAccess(); + Optional result = access.readExtendedStatistics( + SESSION, TABLE_NAME, TABLE_LOCATION, "nonexistent.extended_stats.json", credentials()); + assertThat(result).isEmpty(); + } + + @Test + void testUpdateAndRead() + { + MetaDirStatisticsAccess access = createAccess(); + ExtendedStatistics initialStats = new ExtendedStatistics(Instant.ofEpochMilli(1_000_000), Map.of(), Optional.empty()); + + // no previous file + String firstFile = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), initialStats); + + assertThat(firstFile).isNotBlank(); + Optional firstRead = access.readExtendedStatistics(SESSION, TABLE_NAME, 
TABLE_LOCATION, firstFile, credentials()); + assertThat(firstRead).isPresent(); + assertThat(firstRead.get().getAlreadyAnalyzedModifiedTimeMax()).isEqualTo(initialStats.getAlreadyAnalyzedModifiedTimeMax()); + assertThat(firstRead.get().getColumnStatistics()).isEqualTo(initialStats.getColumnStatistics()); + + // provide previous file so it gets replaced + ExtendedStatistics updatedStats = new ExtendedStatistics(Instant.ofEpochMilli(2_000_000), Map.of(), Optional.of(ImmutableSet.of("col_a"))); + String secondFile = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.of(firstFile), credentials(), updatedStats); + + // new file differ from the old one + assertThat(secondFile).isNotEqualTo(firstFile); + assertThat(statsFilePath(firstFile)).doesNotExist(); + assertThat(statsFilePath(secondFile)).exists(); + + // Reading with the new filename returns the updated content, not the old one + Optional secondRead = access.readExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, secondFile, credentials()); + assertThat(secondRead).isPresent(); + assertThat(secondRead.get().getAlreadyAnalyzedModifiedTimeMax()).isEqualTo(updatedStats.getAlreadyAnalyzedModifiedTimeMax()); + assertThat(secondRead.get().getAnalyzedColumns()).isEqualTo(updatedStats.getAnalyzedColumns()); + + assertThat(access.readExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, firstFile, credentials())).isEmpty(); + } + + @Test + void testUpdateCreatesFile() + { + MetaDirStatisticsAccess access = createAccess(); + String statsFile = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); + assertThat(statsFilePath(statsFile)).exists(); + } + + @Test + void testUpdateReturnsUniqueFilename() + { + MetaDirStatisticsAccess access = createAccess(); + String file1 = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); + String file2 = 
access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); + assertThat(file1).isNotEqualTo(file2); + } + + @Test + void testUpdateWithNoPreviousFile() + throws IOException + { + MetaDirStatisticsAccess access = createAccess(); + String statsFile = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); + + Path statsDir = statsFilePath(statsFile).getParent(); + try (var files = Files.list(statsDir)) { + List listed = files.toList(); + assertThat(listed).hasSize(1); + assertThat(listed.getFirst().getFileName().toString()).isEqualTo(statsFile); + } + } + + @Test + void testUpdateDeletesPreviousFile() + { + MetaDirStatisticsAccess access = createAccess(); + + String oldFile = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); + assertThat(statsFilePath(oldFile)).exists(); + + String newFile = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.of(oldFile), credentials(), sampleStatistics()); + + assertThat(statsFilePath(oldFile)).doesNotExist(); + assertThat(statsFilePath(newFile)).exists(); + } + + @Test + void testDeleteExistingFile() + { + MetaDirStatisticsAccess access = createAccess(); + String statsFile = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); + assertThat(statsFilePath(statsFile)).exists(); + + access.deleteExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, statsFile, credentials()); + + assertThat(statsFilePath(statsFile)).doesNotExist(); + } + + @Test + void testDeleteMissingFileIsNoOp() + { + // Must not throw + createAccess().deleteExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, "nonexistent.extended_stats.json", credentials()); + } + + @Test + void testFilenameContainsExpectedSuffix() + { + MetaDirStatisticsAccess access = 
createAccess(); + String statsFile = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); + assertThat(statsFile).endsWith(".extended_stats.json"); + } + + private MetaDirStatisticsAccess createAccess() + { + return new MetaDirStatisticsAccess( + new DefaultDeltaLakeFileSystemFactory( + new LocalFileSystemFactory(tempDir), + new NoOpVendedCredentialsProvider()), + new JsonCodecFactory().jsonCodec(ExtendedStatistics.class)); + } + + private static VendedCredentialsHandle credentials() + { + return VendedCredentialsHandle.empty(TABLE_LOCATION); + } + + private static ExtendedStatistics sampleStatistics() + { + return new ExtendedStatistics(Instant.ofEpochMilli(1_000_000), Map.of(), Optional.empty()); + } + + private Path statsFilePath(String statsFile) + { + return tempDir.resolve("test_extended_stats").resolve(STATISTICS_META_DIR).resolve(statsFile); + } +} From d91281c054aa25c4c47fcacb260f7f8b8ba2dd8f Mon Sep 17 00:00:00 2001 From: chenjian2664 Date: Tue, 14 Apr 2026 19:11:59 +0800 Subject: [PATCH 09/10] fixup! 
Store extended statistics filename in Delta Lake `CommitInfoEntry` --- .../plugin/deltalake/DeltaLakeMetadata.java | 6 ++-- .../CachingExtendedStatisticsAccess.java | 4 +-- .../statistics/ExtendedStatisticsAccess.java | 2 +- .../statistics/MetaDirStatisticsAccess.java | 4 +-- ...LakeAlluxioCacheMutableTransactionLog.java | 4 +++ .../TestDeltaLakeFileOperations.java | 28 +++++++++++-------- .../TestMetaDirStatisticsAccess.java | 20 ++++++------- 7 files changed, 38 insertions(+), 30 deletions(-) diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java index 27da4958adcd..c9d8bce647a8 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaLakeMetadata.java @@ -2079,7 +2079,7 @@ public void dropColumn(ConnectorSession session, ConnectorTableHandle tableHandl .collect(toImmutableMap(Entry::getKey, Entry::getValue)), existingStatistics.getAnalyzedColumns() .map(analyzedColumns -> analyzedColumns.stream().filter(column -> !column.equalsIgnoreCase(dropColumnName)).collect(toImmutableSet()))); - return statisticsAccess.updateExtendedStatistics(session, table.getSchemaTableName(), table.getLocation(), table.extendedStatsFile(), table.toCredentialsHandle(), statistics); + return statisticsAccess.writeExtendedStatistics(session, table.getSchemaTableName(), table.getLocation(), table.extendedStatsFile(), table.toCredentialsHandle(), statistics); }); String schemaString = serializeSchemaAsJson(deltaTable); @@ -2361,7 +2361,7 @@ public Optional finishInsert( private Optional getExtendedStatsFileForInsert(ConnectorSession session, Collection computedStatistics, List dataFileInfos, DeltaLakeInsertTableHandle handle) { - Optional extendedStatsFile = Optional.empty(); + Optional extendedStatsFile = handle.extendedStatisticsFile(); if 
(isCollectExtendedStatisticsColumnStatisticsOnWrite(session) && !computedStatistics.isEmpty() && !dataFileInfos.isEmpty()) { // TODO (https://github.com/trinodb/trino/issues/16088) Add synchronization when version conflict for INSERT is resolved. Optional maxFileModificationTime = dataFileInfos.stream() @@ -4227,7 +4227,7 @@ private String updateTableStatistics( mergedColumnStatistics, analyzedColumns); - return statisticsAccess.updateExtendedStatistics(session, schemaTableName, location, previousExtendedStatsFile, credentialsHandle, mergedExtendedStatistics); + return statisticsAccess.writeExtendedStatistics(session, schemaTableName, location, previousExtendedStatsFile, credentialsHandle, mergedExtendedStatistics); } private static String toPhysicalColumnName(String columnName, Map lowerCaseToExactColumnNames, Optional> physicalColumnNameMapping) diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/CachingExtendedStatisticsAccess.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/CachingExtendedStatisticsAccess.java index 46daea8efc0f..bd63923abac3 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/CachingExtendedStatisticsAccess.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/CachingExtendedStatisticsAccess.java @@ -75,7 +75,7 @@ public Optional readExtendedStatistics( } @Override - public String updateExtendedStatistics( + public String writeExtendedStatistics( ConnectorSession session, SchemaTableName schemaTableName, String tableLocation, @@ -83,7 +83,7 @@ public String updateExtendedStatistics( VendedCredentialsHandle credentialsHandle, ExtendedStatistics statistics) { - String extendedStatsFile = delegate.updateExtendedStatistics(session, schemaTableName, tableLocation, previousExtendedStatsFile, credentialsHandle, statistics); + String extendedStatsFile = delegate.writeExtendedStatistics(session, schemaTableName, 
tableLocation, previousExtendedStatsFile, credentialsHandle, statistics); cache.invalidate(new CacheKey(schemaTableName, tableLocation, extendedStatsFile)); return extendedStatsFile; } diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/ExtendedStatisticsAccess.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/ExtendedStatisticsAccess.java index 2d522358aaf0..5ba6f2cb2647 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/ExtendedStatisticsAccess.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/ExtendedStatisticsAccess.java @@ -43,7 +43,7 @@ Optional readExtendedStatistics( * * @return the name of the newly written stats file */ - String updateExtendedStatistics( + String writeExtendedStatistics( ConnectorSession session, SchemaTableName schemaTableName, String tableLocation, diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/MetaDirStatisticsAccess.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/MetaDirStatisticsAccess.java index 6da2de6a1d86..6f2931732472 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/MetaDirStatisticsAccess.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/statistics/MetaDirStatisticsAccess.java @@ -84,7 +84,7 @@ public Optional readExtendedStatistics( } @Override - public String updateExtendedStatistics( + public String writeExtendedStatistics( ConnectorSession session, SchemaTableName schemaTableName, String tableLocation, @@ -92,7 +92,7 @@ public String updateExtendedStatistics( VendedCredentialsHandle credentialsHandle, ExtendedStatistics statistics) { - String currentStatsFile = UUID.randomUUID().toString() + ".extended_stats.json"; + String currentStatsFile = UUID.randomUUID() + ".extended_stats.json"; try { Location statisticsPath = 
resolveStatsFileLocation(tableLocation, currentStatsFile); diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheMutableTransactionLog.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheMutableTransactionLog.java index a69d2df8a19d..c1d2f3226998 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheMutableTransactionLog.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheMutableTransactionLog.java @@ -77,6 +77,8 @@ public void testTableDataCachedWhileTransactionLogNotCached() ImmutableMultiset.builder() .addCopies(new CacheFileSystemTraceUtils.CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) .addCopies(new CacheFileSystemTraceUtils.CacheOperation("Input.readTail", "00000000000000000002.checkpoint.parquet"), 2) + .add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.length", "00000000000000000002.json")) + .add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.newStream", "00000000000000000002.json")) .add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.length", "00000000000000000003.json")) .add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.newStream", "_last_checkpoint")) .add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p1/", 0, 229)) @@ -91,6 +93,8 @@ public void testTableDataCachedWhileTransactionLogNotCached() ImmutableMultiset.builder() .addCopies(new CacheFileSystemTraceUtils.CacheOperation("InputFile.length", "00000000000000000002.checkpoint.parquet"), 2) .addCopies(new CacheFileSystemTraceUtils.CacheOperation("Input.readTail", "00000000000000000002.checkpoint.parquet"), 2) + .add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.length", "00000000000000000002.json")) + .add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.newStream", "00000000000000000002.json")) 
.add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.length", "00000000000000000003.json")) .add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.newStream", "_last_checkpoint")) .add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p1/", 0, 229)) diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeFileOperations.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeFileOperations.java index d121092392fc..ccbd2bafd59c 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeFileOperations.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeFileOperations.java @@ -110,8 +110,6 @@ public void testCreateTableAsSelect() assertFileSystemAccesses( "CREATE TABLE test_create_as_select AS SELECT 1 col_name", ImmutableMultiset.builder() - .add(new FileOperation(STARBURST_EXTENDED_STATS_JSON, "extendeded_stats.json", "InputFile.newStream")) - .add(new FileOperation(STARBURST_EXTENDED_STATS_JSON, "extendeded_stats.json", "InputFile.exists")) .add(new FileOperation(TRINO_EXTENDED_STATS_JSON, "extended_stats.json", "InputFile.newStream")) .add(new FileOperation(TRINO_EXTENDED_STATS_JSON, "extended_stats.json", "OutputFile.createOrOverwrite")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "OutputFile.create")) @@ -122,8 +120,6 @@ public void testCreateTableAsSelect() assertFileSystemAccesses( "CREATE TABLE test_create_partitioned_as_select WITH (partitioned_by=ARRAY['key']) AS SELECT * FROM (VALUES (1, 'a'), (2, 'b')) t(key, col)", ImmutableMultiset.builder() - .add(new FileOperation(STARBURST_EXTENDED_STATS_JSON, "extendeded_stats.json", "InputFile.newStream")) - .add(new FileOperation(STARBURST_EXTENDED_STATS_JSON, "extendeded_stats.json", "InputFile.exists")) .add(new FileOperation(TRINO_EXTENDED_STATS_JSON, "extended_stats.json", "InputFile.newStream")) .add(new 
FileOperation(TRINO_EXTENDED_STATS_JSON, "extended_stats.json", "OutputFile.createOrOverwrite")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "OutputFile.create")) @@ -160,8 +156,6 @@ public void testCreateOrReplaceTableAsSelect() assertFileSystemAccesses( "CREATE OR REPLACE TABLE test_create_or_replace_as_select AS SELECT 1 col_name", ImmutableMultiset.builder() - .add(new FileOperation(STARBURST_EXTENDED_STATS_JSON, "extendeded_stats.json", "InputFile.newStream")) - .add(new FileOperation(STARBURST_EXTENDED_STATS_JSON, "extendeded_stats.json", "InputFile.exists")) .add(new FileOperation(TRINO_EXTENDED_STATS_JSON, "extended_stats.json", "InputFile.newStream")) .add(new FileOperation(TRINO_EXTENDED_STATS_JSON, "extended_stats.json", "OutputFile.createOrOverwrite")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "OutputFile.create")) @@ -171,7 +165,6 @@ public void testCreateOrReplaceTableAsSelect() assertFileSystemAccesses( "CREATE OR REPLACE TABLE test_create_or_replace_as_select AS SELECT 1 col_name", ImmutableMultiset.builder() - .add(new FileOperation(STARBURST_EXTENDED_STATS_JSON, "extendeded_stats.json", "InputFile.exists")) .add(new FileOperation(TRINO_EXTENDED_STATS_JSON, "extended_stats.json", "InputFile.newStream")) .add(new FileOperation(TRINO_EXTENDED_STATS_JSON, "extended_stats.json", "OutputFile.createOrOverwrite")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000000.json", "InputFile.newStream")) @@ -247,6 +240,8 @@ public void testReadTableCheckpointInterval() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) .addCopies(new FileOperation(CHECKPOINT, "00000000000000000002.checkpoint.parquet", "InputFile.length"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .addCopies(new FileOperation(CHECKPOINT, "00000000000000000002.checkpoint.parquet", "InputFile.newInput"), 2) // TODO 
(https://github.com/trinodb/trino/issues/18916) should be checked once per query + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "InputFile.length")) .addCopies(new FileOperation(DATA, "no partition", "InputFile.newInput"), 2) .build()); @@ -270,6 +265,8 @@ public void testReadPartitionTableWithCheckpointFiltering() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) .addCopies(new FileOperation(CHECKPOINT, "00000000000000000002.checkpoint.parquet", "InputFile.length"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .addCopies(new FileOperation(CHECKPOINT, "00000000000000000002.checkpoint.parquet", "InputFile.newInput"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.newStream")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "InputFile.length")) .addCopies(new FileOperation(DATA, "key=p1/", "InputFile.newInput"), 2) .addCopies(new FileOperation(DATA, "key=p2/", "InputFile.newInput"), 2) @@ -538,6 +535,8 @@ public void testSelectFromVersionedTable() ImmutableMultiset.builder() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000020.json", "InputFile.exists")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000020.json", "InputFile.newStream")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000020.json", "InputFile.length")) .addCopies(new FileOperation(CHECKPOINT, 
"00000000000000000020.checkpoint.parquet", "InputFile.length"), 2) .addCopies(new FileOperation(CHECKPOINT, "00000000000000000020.checkpoint.parquet", "InputFile.newInput"), 2) .addCopies(new FileOperation(DATA, "no partition", "InputFile.newInput"), 20) @@ -1155,8 +1154,8 @@ public void testReadMultipartCheckpoint() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) .addCopies(new FileOperation(CHECKPOINT, "00000000000000000006.checkpoint.0000000001.0000000002.parquet", "InputFile.length"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .addCopies(new FileOperation(CHECKPOINT, "00000000000000000006.checkpoint.0000000001.0000000002.parquet", "InputFile.newInput"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query - .addCopies(new FileOperation(CHECKPOINT, "00000000000000000006.checkpoint.0000000002.0000000002.parquet", "InputFile.length"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query - .addCopies(new FileOperation(CHECKPOINT, "00000000000000000006.checkpoint.0000000002.0000000002.parquet", "InputFile.newInput"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query + .add(new FileOperation(CHECKPOINT, "00000000000000000006.checkpoint.0000000002.0000000002.parquet", "InputFile.length")) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query + .add(new FileOperation(CHECKPOINT, "00000000000000000006.checkpoint.0000000002.0000000002.parquet", "InputFile.newInput"))// TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000007.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000007.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000008.json", "InputFile.length")) 
@@ -1178,6 +1177,8 @@ public void testReadMultipartV2Checkpoint() .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", "InputFile.newStream")) .addCopies(new FileOperation(CHECKPOINT, "00000000000000000004.checkpoint.42f48375-5c72-4d2f-8dcc-7ce4d45e2d8c.json", "InputFile.length"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .addCopies(new FileOperation(CHECKPOINT, "00000000000000000004.checkpoint.42f48375-5c72-4d2f-8dcc-7ce4d45e2d8c.json", "InputFile.newStream"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000004.json", "InputFile.length")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000004.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000005.json", "InputFile.length")) .add(new FileOperation(CHECKPOINT, "00000000000000000004.checkpoint.0000000001.0000000004.9f573e40-495e-4fb5-9a86-638dbdf0909e.parquet", "InputFile.newInput")) .add(new FileOperation(CHECKPOINT, "00000000000000000004.checkpoint.0000000001.0000000004.9f573e40-495e-4fb5-9a86-638dbdf0909e.parquet", "InputFile.length")) @@ -1214,6 +1215,8 @@ public void testV2CheckpointJson() .addCopies(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.73a4ddb8-2bfc-40d8-b09f-1b6a0abdfb04.json", "InputFile.newStream"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .add(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.0000000001.0000000001.90cf4e21-dbaa-41d6-8ae5-6709cfbfbfe0.parquet", "InputFile.length")) .add(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.0000000001.0000000001.90cf4e21-dbaa-41d6-8ae5-6709cfbfbfe0.parquet", "InputFile.newInput")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) + .add(new FileOperation(TRANSACTION_LOG_JSON, 
"00000000000000000001.json", "InputFile.newStream")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) // TransactionLogTail.getEntriesFromJson access non-existing file as end of tail .add(new FileOperation(DATA, "no partition", "InputFile.newInput")) .build()); @@ -1244,6 +1247,8 @@ public void testV2CheckpointParquet() .addCopies(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.156b3304-76b2-49c3-a9a1-626f07df27c9.parquet", "InputFile.newInput"), 2) // TODO (https://github.com/trinodb/trino/issues/18916) should be checked once per query .add(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.0000000001.0000000001.03288d7e-af16-44ed-829c-196064a71812.parquet", "InputFile.length")) .add(new FileOperation(CHECKPOINT, "00000000000000000001.checkpoint.0000000001.0000000001.03288d7e-af16-44ed-829c-196064a71812.parquet", "InputFile.newInput")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.newStream")) + .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) // TransactionLogTail.getEntriesFromJson access non-existing file as end of tail .add(new FileOperation(DATA, "no partition", "InputFile.newInput")) .build()); @@ -1281,7 +1286,6 @@ public void testDeletionVectors() .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000001.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000002.json", "InputFile.length")) .add(new FileOperation(TRANSACTION_LOG_JSON, "00000000000000000003.json", "InputFile.length")) - .add(new FileOperation(STARBURST_EXTENDED_STATS_JSON, "extendeded_stats.json", "InputFile.newStream")) .add(new FileOperation(TRINO_EXTENDED_STATS_JSON, "extended_stats.json", "InputFile.newStream")) .add(new FileOperation(LAST_CHECKPOINT, "_last_checkpoint", 
"InputFile.newStream")) .add(new FileOperation(DELETION_VECTOR, "deletion_vector_a52eda8c-0a57-4636-814b-9c165388f7ca.bin", "InputFile.newInput")) @@ -1400,8 +1404,8 @@ public static FileOperation create(String path, String operationType) if (path.matches(".*/_delta_log/\\d+\\.json")) { return new FileOperation(TRANSACTION_LOG_JSON, fileName, operationType); } - if (path.matches(".*/_delta_log/_trino_meta/extended_stats.json")) { - return new FileOperation(TRINO_EXTENDED_STATS_JSON, fileName, operationType); + if (path.matches(".*/_delta_log/_trino_meta/.*extended_stats.json")) { + return new FileOperation(TRINO_EXTENDED_STATS_JSON, "extended_stats.json", operationType); } if (path.matches(".*/_delta_log/_starburst_meta/extendeded_stats.json")) { return new FileOperation(STARBURST_EXTENDED_STATS_JSON, fileName, operationType); diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/statistics/TestMetaDirStatisticsAccess.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/statistics/TestMetaDirStatisticsAccess.java index 41d25dc37fa9..218651c3efe5 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/statistics/TestMetaDirStatisticsAccess.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/statistics/TestMetaDirStatisticsAccess.java @@ -59,7 +59,7 @@ void testUpdateAndRead() ExtendedStatistics initialStats = new ExtendedStatistics(Instant.ofEpochMilli(1_000_000), Map.of(), Optional.empty()); // no previous file - String firstFile = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), initialStats); + String firstFile = access.writeExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), initialStats); assertThat(firstFile).isNotBlank(); Optional firstRead = access.readExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, firstFile, credentials()); @@ -69,7 +69,7 @@ void testUpdateAndRead() // provide 
previous file so it gets replaced ExtendedStatistics updatedStats = new ExtendedStatistics(Instant.ofEpochMilli(2_000_000), Map.of(), Optional.of(ImmutableSet.of("col_a"))); - String secondFile = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.of(firstFile), credentials(), updatedStats); + String secondFile = access.writeExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.of(firstFile), credentials(), updatedStats); // new file differ from the old one assertThat(secondFile).isNotEqualTo(firstFile); @@ -89,7 +89,7 @@ void testUpdateAndRead() void testUpdateCreatesFile() { MetaDirStatisticsAccess access = createAccess(); - String statsFile = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); + String statsFile = access.writeExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); assertThat(statsFilePath(statsFile)).exists(); } @@ -97,8 +97,8 @@ void testUpdateCreatesFile() void testUpdateReturnsUniqueFilename() { MetaDirStatisticsAccess access = createAccess(); - String file1 = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); - String file2 = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); + String file1 = access.writeExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); + String file2 = access.writeExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); assertThat(file1).isNotEqualTo(file2); } @@ -107,7 +107,7 @@ void testUpdateWithNoPreviousFile() throws IOException { MetaDirStatisticsAccess access = createAccess(); - String statsFile = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), 
sampleStatistics()); + String statsFile = access.writeExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); Path statsDir = statsFilePath(statsFile).getParent(); try (var files = Files.list(statsDir)) { @@ -122,10 +122,10 @@ void testUpdateDeletesPreviousFile() { MetaDirStatisticsAccess access = createAccess(); - String oldFile = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); + String oldFile = access.writeExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); assertThat(statsFilePath(oldFile)).exists(); - String newFile = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.of(oldFile), credentials(), sampleStatistics()); + String newFile = access.writeExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.of(oldFile), credentials(), sampleStatistics()); assertThat(statsFilePath(oldFile)).doesNotExist(); assertThat(statsFilePath(newFile)).exists(); @@ -135,7 +135,7 @@ void testUpdateDeletesPreviousFile() void testDeleteExistingFile() { MetaDirStatisticsAccess access = createAccess(); - String statsFile = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); + String statsFile = access.writeExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); assertThat(statsFilePath(statsFile)).exists(); access.deleteExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, statsFile, credentials()); @@ -154,7 +154,7 @@ void testDeleteMissingFileIsNoOp() void testFilenameContainsExpectedSuffix() { MetaDirStatisticsAccess access = createAccess(); - String statsFile = access.updateExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); + String statsFile = 
access.writeExtendedStatistics(SESSION, TABLE_NAME, TABLE_LOCATION, Optional.empty(), credentials(), sampleStatistics()); assertThat(statsFile).endsWith(".extended_stats.json"); } From 1857e837fe413aa8bac396fb45241fbdf107cea7 Mon Sep 17 00:00:00 2001 From: chenjian2664 Date: Wed, 15 Apr 2026 10:50:53 +0800 Subject: [PATCH 10/10] fixup! Store extended statistics filename in Delta Lake `CommitInfoEntry` --- ...stDeltaLakeMinioAndLockBasedSynchronizerSmokeTest.java | 8 ++++++-- .../deltalake/TestDeltaLakeCheckpointsCompatibility.java | 4 ++-- .../product/deltalake/TestDeltaLakeColumnMappingMode.java | 4 ++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeMinioAndLockBasedSynchronizerSmokeTest.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeMinioAndLockBasedSynchronizerSmokeTest.java index 8a93d317cba0..a3cb9481d5f9 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeMinioAndLockBasedSynchronizerSmokeTest.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeMinioAndLockBasedSynchronizerSmokeTest.java @@ -36,6 +36,7 @@ import java.util.concurrent.TimeUnit; import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableSet.toImmutableSet; import static io.trino.plugin.base.util.Closables.closeAllSuppress; import static io.trino.plugin.deltalake.DeltaLakeQueryRunner.DELTA_CATALOG; import static io.trino.testing.TestingNames.randomNameSuffix; @@ -138,7 +139,9 @@ private void testWritesLocked(String writeStatement) tableName), 2); - Set originalFiles = ImmutableSet.copyOf(getTableFiles(tableName)); + Set originalFiles = ImmutableSet.copyOf(getTableFiles(tableName)).stream() + .filter(path -> !path.endsWith(".extended_stats.json")) + .collect(toImmutableSet()); assertThat(originalFiles).isNotEmpty(); // sanity check 
String lockFilePath = lockTable(tableName, java.time.Duration.ofMinutes(5)); @@ -153,7 +156,8 @@ private void testWritesLocked(String writeStatement) .build(); assertEventually( new Duration(5, TimeUnit.SECONDS), - () -> assertThat(getTableFiles(tableName)).containsExactlyInAnyOrderElementsOf(expectedFiles)); + () -> assertThat(getTableFiles(tableName).stream().filter(path -> !path.endsWith(".extended_stats.json")).collect(toImmutableSet())) + .containsExactlyInAnyOrderElementsOf(expectedFiles)); } finally { assertUpdate("DROP TABLE " + tableName); diff --git a/testing/trino-product-tests/src/main/java/io/trino/tests/product/deltalake/TestDeltaLakeCheckpointsCompatibility.java b/testing/trino-product-tests/src/main/java/io/trino/tests/product/deltalake/TestDeltaLakeCheckpointsCompatibility.java index 5f7d27731425..1d924c7e014f 100644 --- a/testing/trino-product-tests/src/main/java/io/trino/tests/product/deltalake/TestDeltaLakeCheckpointsCompatibility.java +++ b/testing/trino-product-tests/src/main/java/io/trino/tests/product/deltalake/TestDeltaLakeCheckpointsCompatibility.java @@ -598,8 +598,8 @@ public Object[][] testTrinoCheckpointWriteStatsAsJson() {"double", "1.0", null, 1.0, 0.0, "1.0"}, {"decimal(3,2)", "3.14", null, 1.0, 0.0, "3.14"}, {"decimal(30,1)", "12345", null, 1.0, 0.0, "12345.0"}, - {"string", "'test'", 12.0, 1.0, 0.0, null}, - {"binary", "X'65683F'", 9.0, 1.0, 0.0, null}, + {"string", "'test'", 4.0, 1.0, 0.0, null}, // we lose tracking of the previous extended statistics after other engine commits + {"binary", "X'65683F'", 3.0, 1.0, 0.0, null}, // we lose tracking of the previous extended statistics after other engine commits {"date", "date '2021-02-03'", null, 1.0, 0.0, "2021-02-03"}, {"timestamp", "timestamp '2001-08-22 11:04:05.321 UTC'", null, 1.0, 0.0, "2001-08-22 11:04:05.321 UTC"}, {"array", "array[1]", null, null, null, null}, diff --git 
a/testing/trino-product-tests/src/main/java/io/trino/tests/product/deltalake/TestDeltaLakeColumnMappingMode.java b/testing/trino-product-tests/src/main/java/io/trino/tests/product/deltalake/TestDeltaLakeColumnMappingMode.java index 979cb9baa7f5..c99f78246fa9 100644 --- a/testing/trino-product-tests/src/main/java/io/trino/tests/product/deltalake/TestDeltaLakeColumnMappingMode.java +++ b/testing/trino-product-tests/src/main/java/io/trino/tests/product/deltalake/TestDeltaLakeColumnMappingMode.java @@ -737,7 +737,7 @@ public void testChangeColumnMappingAndShowStatsForColumnMappingMode() // Ensure SHOW STATS doesn't return stats for the restored column assertThat(onTrino().executeQuery("SHOW STATS FOR delta.default." + tableName)) .containsOnly( - row("a_number", null, 2.0, 0.33333333333, null, "1", "2"), + row("a_number", null, null, 0.33333333333, null, "1", "2"), // we lose tracking of the previous extended statistics after other engine commits row("b_number", null, null, null, null, null, null), row(null, null, null, null, 3.0, null, null)); @@ -1074,7 +1074,7 @@ public void testDropNonLowercaseColumnWithColumnMappingMode(String mode) onDelta().executeQuery("ALTER TABLE default." + tableName + " ADD COLUMN UPPER_DATA INT"); assertThat(onTrino().executeQuery("SHOW STATS FOR delta.default." + tableName)) .containsOnly(ImmutableList.of( - row("upper_id", null, 1.0, 0.0, null, "1", "1"), + row("upper_id", null, null, 0.0, null, "1", "1"), // we lose tracking of the previous extended statistics after other engine commits row("upper_data", null, null, null, null, null, null), row("upper_part", null, 1.0, 0.0, null, null, null), row(null, null, null, null, 1.0, null, null)));