Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@
package io.trino.plugin.iceberg;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.AbstractSequentialIterator;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import io.airlift.log.Logger;
import io.trino.spi.TrinoException;
import io.trino.spi.connector.ColumnHandle;
import io.trino.spi.connector.ConnectorSession;
import io.trino.spi.predicate.TupleDomain;
Expand All @@ -44,12 +44,12 @@

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
Expand All @@ -58,7 +58,9 @@
import static com.google.common.base.Verify.verifyNotNull;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static com.google.common.collect.Iterables.getOnlyElement;
import static com.google.common.collect.Streams.stream;
import static io.trino.plugin.iceberg.ExpressionConverter.toIcebergExpression;
import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_INVALID_METADATA;
import static io.trino.plugin.iceberg.IcebergSessionProperties.isExtendedStatisticsEnabled;
import static io.trino.plugin.iceberg.IcebergUtil.getColumns;
import static io.trino.spi.type.VarbinaryType.VARBINARY;
Expand Down Expand Up @@ -230,10 +232,7 @@ private static Map<Integer, Long> readNdvs(Table icebergTable, long snapshotId,
ImmutableMap.Builder<Integer, Long> ndvByColumnId = ImmutableMap.builder();
Set<Integer> remainingColumnIds = new HashSet<>(columnIds);
Comment thread
findepi marked this conversation as resolved.
Outdated

Iterator<StatisticsFile> statisticsFiles = walkStatisticsFiles(icebergTable, snapshotId);
while (!remainingColumnIds.isEmpty() && statisticsFiles.hasNext()) {
StatisticsFile statisticsFile = statisticsFiles.next();

getLatestStatisticsFile(icebergTable, snapshotId).ifPresent(statisticsFile -> {
Map<Integer, BlobMetadata> thetaBlobsByFieldId = statisticsFile.blobMetadata().stream()
.filter(blobMetadata -> blobMetadata.type().equals(StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1))
.filter(blobMetadata -> blobMetadata.fields().size() == 1)
Expand All @@ -254,7 +253,7 @@ private static Map<Integer, Long> readNdvs(Table icebergTable, long snapshotId,
ndvByColumnId.put(fieldId, parseLong(ndv));
}
}
}
});

// TODO (https://github.com/trinodb/trino/issues/15397): remove support for Trino-specific statistics properties
Iterator<Entry<String, String>> properties = icebergTable.properties().entrySet().iterator();
Expand All @@ -278,41 +277,29 @@ private static Map<Integer, Long> readNdvs(Table icebergTable, long snapshotId,
}

/**
* Iterates over existing statistics files present for parent snapshot chain, starting at {@code startingSnapshotId} (inclusive).
* Returns the most recent statistics file for the given {@code snapshotId}, searching the parent snapshot chain starting at that snapshot (inclusive).
Comment thread
findepi marked this conversation as resolved.
Outdated
*/
public static Iterator<StatisticsFile> walkStatisticsFiles(Table icebergTable, long startingSnapshotId)
public static Optional<StatisticsFile> getLatestStatisticsFile(Table icebergTable, long snapshotId)
{
return new AbstractIterator<>()
{
private final Map<Long, StatisticsFile> statsFileBySnapshot = icebergTable.statisticsFiles().stream()
.collect(toMap(
StatisticsFile::snapshotId,
identity(),
(a, b) -> {
throw new IllegalStateException("Unexpected duplicate statistics files %s, %s".formatted(a, b));
},
HashMap::new));

private final Iterator<Long> snapshots = walkSnapshots(icebergTable, startingSnapshotId);
if (icebergTable.statisticsFiles().isEmpty()) {
return Optional.empty();
}

@Override
protected StatisticsFile computeNext()
{
if (statsFileBySnapshot.isEmpty()) {
// Already found all statistics files
return endOfData();
}
Map<Long, StatisticsFile> statsFileBySnapshot = icebergTable.statisticsFiles().stream()
.collect(toMap(
StatisticsFile::snapshotId,
identity(),
(file1, file2) -> {
throw new TrinoException(
ICEBERG_INVALID_METADATA,
"Table '%s' has duplicate statistics files '%s' and '%s' for snapshot ID %s"
Comment thread
findepi marked this conversation as resolved.
Outdated
.formatted(icebergTable, file1.path(), file2.path(), file1.snapshotId()));
}));

while (snapshots.hasNext()) {
long snapshotId = snapshots.next();
StatisticsFile statisticsFile = statsFileBySnapshot.remove(snapshotId);
if (statisticsFile != null) {
return statisticsFile;
}
}
return endOfData();
}
};
return stream(walkSnapshots(icebergTable, snapshotId))
.map(statsFileBySnapshot::get)
.filter(Objects::nonNull)
.findFirst();
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.ByteBuffer;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
Expand All @@ -61,11 +59,10 @@
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.ImmutableSet.toImmutableSet;
import static com.google.common.collect.Iterables.getOnlyElement;
import static com.google.common.collect.MoreCollectors.toOptional;
import static com.google.common.collect.Streams.stream;
import static io.trino.plugin.base.util.Closables.closeAllSuppress;
import static io.trino.plugin.iceberg.TableStatisticsReader.APACHE_DATASKETCHES_THETA_V1_NDV_PROPERTY;
import static io.trino.plugin.iceberg.TableStatisticsReader.walkStatisticsFiles;
import static io.trino.plugin.iceberg.TableStatisticsReader.getLatestStatisticsFile;
import static io.trino.plugin.iceberg.TableStatisticsWriter.StatsUpdateMode.INCREMENTAL_UPDATE;
import static io.trino.plugin.iceberg.TableStatisticsWriter.StatsUpdateMode.REPLACE;
import static java.lang.String.format;
Expand Down Expand Up @@ -137,9 +134,7 @@ public StatisticsFile writeStatisticsFile(
try (PuffinWriter writer = Puffin.write(outputFile)
.createdBy("Trino version " + trinoVersion)
.build()) {
table.statisticsFiles().stream()
.filter(statisticsFile -> statisticsFile.snapshotId() == snapshotId)
.collect(toOptional())
getLatestStatisticsFile(table, snapshotId)
.ifPresent(previousStatisticsFile -> copyRetainedStatistics(fileIO, previousStatisticsFile, validFieldIds, ndvSketches.keySet(), writer));

ndvSketches.entrySet().stream()
Expand Down Expand Up @@ -198,18 +193,16 @@ private CollectedStatistics mergeStatisticsIfNecessary(
return switch (updateMode) {
case REPLACE -> collectedStatistics;
case INCREMENTAL_UPDATE -> {
Map<Integer, CompactSketch> collectedNdvSketches = collectedStatistics.ndvSketches();
Optional<StatisticsFile> latestStatisticsFile = getLatestStatisticsFile(table, snapshotId);
ImmutableMap.Builder<Integer, CompactSketch> ndvSketches = ImmutableMap.builder();

Set<Integer> pendingPreviousNdvSketches = new HashSet<>(collectedNdvSketches.keySet());
Iterator<StatisticsFile> statisticsFiles = walkStatisticsFiles(table, snapshotId);
while (!pendingPreviousNdvSketches.isEmpty() && statisticsFiles.hasNext()) {
StatisticsFile statisticsFile = statisticsFiles.next();

if (latestStatisticsFile.isPresent()) {
Map<Integer, CompactSketch> collectedNdvSketches = collectedStatistics.ndvSketches();
Set<Integer> columnsWithRecentlyComputedStats = collectedNdvSketches.keySet();
StatisticsFile statisticsFile = latestStatisticsFile.get();
boolean hasUsefulData = statisticsFile.blobMetadata().stream()
.filter(blobMetadata -> blobMetadata.type().equals(StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1))
.filter(blobMetadata -> blobMetadata.fields().size() == 1)
.anyMatch(blobMetadata -> pendingPreviousNdvSketches.contains(getOnlyElement(blobMetadata.fields())));
.anyMatch(blobMetadata -> columnsWithRecentlyComputedStats.contains(getOnlyElement(blobMetadata.fields())));

if (hasUsefulData) {
try (PuffinReader reader = Puffin.read(fileIO.newInputFile(statisticsFile.path()))
Expand All @@ -219,11 +212,10 @@ private CollectedStatistics mergeStatisticsIfNecessary(
List<BlobMetadata> toRead = reader.fileMetadata().blobs().stream()
.filter(blobMetadata -> blobMetadata.type().equals(APACHE_DATASKETCHES_THETA_V1))
.filter(blobMetadata -> blobMetadata.inputFields().size() == 1)
.filter(blobMetadata -> pendingPreviousNdvSketches.contains(getOnlyElement(blobMetadata.inputFields())))
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be the same as hasUsefulData above, i.e.

.filter(blobMetadata -> columnsWithRecentlyComputedStats.contains(getOnlyElement(blobMetadata.fields())))

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I fail to understand why this .filter(blobMetadata -> pendingPreviousNdvSketches.contains(getOnlyElement(blobMetadata.inputFields()))) was removed.
Can you please explain?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I probably lost it while changing the implementation a few times. Thanks for catching it.

.filter(blobMetadata -> columnsWithRecentlyComputedStats.contains(getOnlyElement(blobMetadata.inputFields())))
.collect(toImmutableList());
for (Pair<BlobMetadata, ByteBuffer> read : reader.readAll(toRead)) {
Integer fieldId = getOnlyElement(read.first().inputFields());
checkState(pendingPreviousNdvSketches.remove(fieldId), "Unwanted read of stats for field %s", fieldId);
Memory memory = Memory.wrap(ByteBuffers.getBytes(read.second())); // Memory.wrap(ByteBuffer) results in a different deserialized state
CompactSketch previousSketch = CompactSketch.wrap(memory);
CompactSketch newSketch = requireNonNull(collectedNdvSketches.get(fieldId), "ndvSketches.get(fieldId) is null");
Expand All @@ -235,7 +227,6 @@ private CollectedStatistics mergeStatisticsIfNecessary(
}
}
}

yield new CollectedStatistics(ndvSketches.buildOrThrow());
}
};
Expand Down