-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Use unenforced constraint when building Iceberg stats #16244
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ | |
| */ | ||
| package io.trino.plugin.iceberg; | ||
|
|
||
| import com.google.common.annotations.VisibleForTesting; | ||
| import com.google.common.collect.AbstractIterator; | ||
| import com.google.common.collect.AbstractSequentialIterator; | ||
| import com.google.common.collect.ImmutableMap; | ||
|
|
@@ -66,8 +67,10 @@ | |
| import static java.util.stream.Collectors.toMap; | ||
| import static java.util.stream.Collectors.toUnmodifiableMap; | ||
|
|
||
| public class TableStatisticsReader | ||
| public final class TableStatisticsReader | ||
| { | ||
| private TableStatisticsReader() {} | ||
|
|
||
| private static final Logger log = Logger.get(TableStatisticsReader.class); | ||
|
|
||
| // TODO (https://github.com/trinodb/trino/issues/15397): remove support for Trino-specific statistics properties | ||
|
|
@@ -85,35 +88,39 @@ public class TableStatisticsReader | |
|
|
||
| public static final String APACHE_DATASKETCHES_THETA_V1_NDV_PROPERTY = "ndv"; | ||
|
|
||
| private final TypeManager typeManager; | ||
| private final ConnectorSession session; | ||
| private final Table icebergTable; | ||
|
|
||
| private TableStatisticsReader(TypeManager typeManager, ConnectorSession session, Table icebergTable) | ||
| { | ||
| this.typeManager = typeManager; | ||
| this.session = session; | ||
| this.icebergTable = icebergTable; | ||
| } | ||
|
|
||
| public static TableStatistics getTableStatistics(TypeManager typeManager, ConnectorSession session, IcebergTableHandle tableHandle, Table icebergTable) | ||
| { | ||
| return new TableStatisticsReader(typeManager, session, icebergTable).makeTableStatistics(tableHandle); | ||
| return makeTableStatistics( | ||
| typeManager, | ||
| icebergTable, | ||
| tableHandle.getSnapshotId(), | ||
| tableHandle.getEnforcedPredicate(), | ||
| tableHandle.getUnenforcedPredicate(), | ||
| isExtendedStatisticsEnabled(session)); | ||
| } | ||
|
|
||
| private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle) | ||
| @VisibleForTesting | ||
| public static TableStatistics makeTableStatistics( | ||
| TypeManager typeManager, | ||
| Table icebergTable, | ||
| Optional<Long> snapshot, | ||
| TupleDomain<IcebergColumnHandle> enforcedConstraint, | ||
| TupleDomain<IcebergColumnHandle> unenforcedConstraint, | ||
| boolean extendedStatisticsEnabled) | ||
| { | ||
| if (tableHandle.getSnapshotId().isEmpty()) { | ||
| if (snapshot.isEmpty()) { | ||
| // No snapshot, so no data. | ||
| return TableStatistics.builder() | ||
| .setRowCount(Estimate.of(0)) | ||
| .build(); | ||
| } | ||
| long snapshotId = tableHandle.getSnapshotId().get(); | ||
|
|
||
| TupleDomain<IcebergColumnHandle> enforcedPredicate = tableHandle.getEnforcedPredicate(); | ||
| long snapshotId = snapshot.get(); | ||
|
|
||
| if (enforcedPredicate.isNone()) { | ||
| // Including both enforced and unenforced constraint matches how Splits will eventually be generated and allows | ||
| // us to provide more accurate estimates. Stats will be estimated again by FilterStatsCalculator based on the | ||
| // unenforced constraint. | ||
| TupleDomain<IcebergColumnHandle> effectivePredicate = enforcedConstraint.intersect(unenforcedConstraint); | ||
| if (effectivePredicate.isNone()) { | ||
| return TableStatistics.builder() | ||
| .setRowCount(Estimate.of(0)) | ||
| .build(); | ||
|
|
@@ -130,7 +137,7 @@ private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle) | |
| .collect(toUnmodifiableMap(Map.Entry::getKey, Map.Entry::getValue)); | ||
|
|
||
| TableScan tableScan = icebergTable.newScan() | ||
| .filter(toIcebergExpression(enforcedPredicate)) | ||
| .filter(toIcebergExpression(effectivePredicate)) | ||
| .useSnapshot(snapshotId) | ||
| .includeColumnStats(); | ||
|
|
||
|
|
@@ -151,10 +158,12 @@ private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle) | |
| } | ||
|
|
||
| Map<Integer, Long> ndvs = readNdvs( | ||
| icebergTable, | ||
| snapshotId, | ||
| // TODO We don't need NDV information for columns not involved in filters/joins. Engine should provide set of columns | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Regarding this TODO (not related to current PR), the engine does provide the columns required for the query (various connectors save this in their ConnectorTableHandle after applyProjection). Do we try to parse all columns here or only the columns accessed by the query ? |
||
| // it makes sense to find NDV information for. | ||
| idToColumnHandle.keySet()); | ||
| idToColumnHandle.keySet(), | ||
| extendedStatisticsEnabled); | ||
|
|
||
| ImmutableMap.Builder<ColumnHandle, ColumnStatistics> columnHandleBuilder = ImmutableMap.builder(); | ||
| double recordCount = summary.getRecordCount(); | ||
|
|
@@ -210,9 +219,9 @@ else if (columnHandle.getBaseType() == VARBINARY) { | |
| return new TableStatistics(Estimate.of(recordCount), columnHandleBuilder.buildOrThrow()); | ||
| } | ||
|
|
||
| private Map<Integer, Long> readNdvs(long snapshotId, Set<Integer> columnIds) | ||
| private static Map<Integer, Long> readNdvs(Table icebergTable, long snapshotId, Set<Integer> columnIds, boolean extendedStatisticsEnabled) | ||
| { | ||
| if (!isExtendedStatisticsEnabled(session)) { | ||
|
findepi marked this conversation as resolved.
Outdated
|
||
| if (!extendedStatisticsEnabled) { | ||
| return ImmutableMap.of(); | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,11 @@ cross join: | |
| cross join: | ||
| cross join: | ||
| cross join: | ||
| final aggregation over () | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @findepi @raunaqmorarka is there a good way to tell if these changes are good or not?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, just run the benchmarks :)
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @alexjo2144 do you have the results? |
||
| local exchange (GATHER, SINGLE, []) | ||
| remote exchange (GATHER, SINGLE, []) | ||
| partial aggregation over () | ||
| scan store_sales | ||
| cross join: | ||
| final aggregation over () | ||
| local exchange (GATHER, SINGLE, []) | ||
|
|
@@ -21,11 +26,6 @@ cross join: | |
| local exchange (GATHER, SINGLE, []) | ||
| remote exchange (GATHER, SINGLE, []) | ||
| scan reason | ||
| final aggregation over () | ||
| local exchange (GATHER, SINGLE, []) | ||
| remote exchange (GATHER, SINGLE, []) | ||
| partial aggregation over () | ||
| scan store_sales | ||
| final aggregation over () | ||
| local exchange (GATHER, SINGLE, []) | ||
| remote exchange (GATHER, SINGLE, []) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
let's capture some info from #16244 (comment) conversation
like that this matches split gen
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I tried to sum it up in the commit message, let me know if theres anything missing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
code comments are easier to notice (and serve slightly different purpose)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added a code comment as well