Add aggregation pushdown support for count using Iceberg Metrics #15832
@@ -32,14 +32,19 @@
import io.trino.filesystem.FileIterator;
import io.trino.filesystem.Location;
import io.trino.filesystem.TrinoFileSystem;
import io.trino.plugin.base.aggregation.AggregateFunctionRewriter;
import io.trino.plugin.base.aggregation.AggregateFunctionRule;
import io.trino.plugin.base.classloader.ClassLoaderSafeSystemTable;
import io.trino.plugin.base.expression.ConnectorExpressionRewriter;
import io.trino.plugin.base.filter.UtcConstraintExtractor;
import io.trino.plugin.base.projection.ApplyProjectionUtil;
import io.trino.plugin.base.projection.ApplyProjectionUtil.ProjectedColumnRepresentation;
import io.trino.plugin.hive.HiveWrittenPartitions;
import io.trino.plugin.hive.metastore.TableInfo;
import io.trino.plugin.iceberg.aggregation.AggregateExpression;
import io.trino.plugin.iceberg.aggregation.DataSketchStateSerializer;
import io.trino.plugin.iceberg.aggregation.IcebergThetaSketchForStats;
import io.trino.plugin.iceberg.aggregation.ImplementCountAll;
import io.trino.plugin.iceberg.catalog.TrinoCatalog;
import io.trino.plugin.iceberg.procedure.IcebergDropExtendedStatsHandle;
import io.trino.plugin.iceberg.procedure.IcebergExpireSnapshotsHandle;

@@ -51,6 +56,8 @@
import io.trino.spi.ErrorCode;
import io.trino.spi.TrinoException;
import io.trino.spi.block.Block;
import io.trino.spi.connector.AggregateFunction;
import io.trino.spi.connector.AggregationApplicationResult;
import io.trino.spi.connector.Assignment;
import io.trino.spi.connector.BeginTableExecuteResult;
import io.trino.spi.connector.CatalogHandle;

@@ -135,6 +142,7 @@
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.SnapshotRef;
import org.apache.iceberg.SnapshotSummary;
import org.apache.iceberg.SortField;
import org.apache.iceberg.SortOrder;
import org.apache.iceberg.StatisticsFile;

@@ -220,6 +228,7 @@
import static io.trino.plugin.iceberg.IcebergSessionProperties.getExpireSnapshotMinRetention;
import static io.trino.plugin.iceberg.IcebergSessionProperties.getHiveCatalogName;
import static io.trino.plugin.iceberg.IcebergSessionProperties.getRemoveOrphanFilesMinRetention;
import static io.trino.plugin.iceberg.IcebergSessionProperties.isAggregationPushdownEnabled;
import static io.trino.plugin.iceberg.IcebergSessionProperties.isCollectExtendedStatisticsOnWrite;
import static io.trino.plugin.iceberg.IcebergSessionProperties.isExtendedStatisticsEnabled;
import static io.trino.plugin.iceberg.IcebergSessionProperties.isMergeManifestsOnWrite;

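The hunk above references `isAggregationPushdownEnabled(session)`, a new session-property accessor; the property definition itself is not part of this excerpt. Below is a minimal sketch of how such a toggle is typically wired up in `IcebergSessionProperties`, assuming a property named `aggregation_pushdown_enabled` backed by an `IcebergConfig` getter — both names are assumptions, not copied from this PR.

```java
// Sketch only: the property name and the IcebergConfig getter are assumptions.
private static final String AGGREGATION_PUSHDOWN_ENABLED = "aggregation_pushdown_enabled";

// Registered alongside the other session properties in the IcebergSessionProperties constructor:
// booleanProperty(
//         AGGREGATION_PUSHDOWN_ENABLED,
//         "Use Iceberg snapshot metadata to answer count(*) aggregations",
//         icebergConfig.isAggregationPushdownEnabled(),
//         false)

public static boolean isAggregationPushdownEnabled(ConnectorSession session)
{
    return session.getProperty(AGGREGATION_PUSHDOWN_ENABLED, Boolean.class);
}
```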
@@ -327,6 +336,7 @@ public class IcebergMetadata
    private final TrinoCatalog catalog;
    private final IcebergFileSystemFactory fileSystemFactory;
    private final TableStatisticsWriter tableStatisticsWriter;
    private final AggregateFunctionRewriter<AggregateExpression, Void> aggregateFunctionRewriter;

    private final Map<IcebergTableHandle, TableStatistics> tableStatisticsCache = new ConcurrentHashMap<>();

@@ -346,6 +356,12 @@ public IcebergMetadata(
        this.catalog = requireNonNull(catalog, "catalog is null");
        this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null");
        this.tableStatisticsWriter = requireNonNull(tableStatisticsWriter, "tableStatisticsWriter is null");

        this.aggregateFunctionRewriter = new AggregateFunctionRewriter(
                new ConnectorExpressionRewriter<>(ImmutableSet.of()),
                ImmutableSet.<AggregateFunctionRule<AggregateExpression, Void>>builder()
                        .add(new ImplementCountAll())
                        .build());
    }

    @Override

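The constructor wires a single rule, `ImplementCountAll`, into the `AggregateFunctionRewriter`, so only a bare `count(*)` (no argument, no `DISTINCT`, no filter) can be rewritten. The rule class is added elsewhere in this PR and is not shown in this excerpt; the following is a rough sketch of what such a rule could look like, assuming the `AggregateFunctionRule` pattern helpers used by the JDBC connectors and an assumed `AggregateExpression` constructor.

```java
import io.trino.matching.Captures;
import io.trino.matching.Pattern;
import io.trino.spi.connector.AggregateFunction;

import java.util.List;
import java.util.Optional;

import static com.google.common.base.Verify.verify;
import static io.trino.plugin.base.aggregation.AggregateFunctionPatterns.arguments;
import static io.trino.plugin.base.aggregation.AggregateFunctionPatterns.basicAggregation;
import static io.trino.plugin.base.aggregation.AggregateFunctionPatterns.functionName;
import static io.trino.spi.type.BigintType.BIGINT;

// Sketch only: AggregateExpression is introduced by this PR and its constructor is assumed here;
// the pattern helpers are assumed to match io.trino.plugin.base.aggregation.AggregateFunctionPatterns.
public class ImplementCountAll
        implements AggregateFunctionRule<AggregateExpression, Void>
{
    @Override
    public Pattern<AggregateFunction> getPattern()
    {
        // match count() with no arguments, no DISTINCT and no filter
        return basicAggregation()
                .with(functionName().equalTo("count"))
                .with(arguments().equalTo(List.of()));
    }

    @Override
    public Optional<AggregateExpression> rewrite(AggregateFunction aggregateFunction, Captures captures, RewriteContext<Void> context)
    {
        verify(aggregateFunction.getOutputType() == BIGINT);
        return Optional.of(new AggregateExpression("count", BIGINT)); // assumed constructor
    }
}
```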
@@ -2653,6 +2669,101 @@ else if (isMetadataColumnId(columnHandle.getId())) {
                false));
    }

    @Override
    public Optional<AggregationApplicationResult<ConnectorTableHandle>> applyAggregation(
            ConnectorSession session,
            ConnectorTableHandle handle,
            List<AggregateFunction> aggregates,
            Map<String, ColumnHandle> assignments,
            List<List<ColumnHandle>> groupingSets)
    {
        IcebergTableHandle tableHandle = (IcebergTableHandle) handle;

        // Iceberg metadata cannot be used to compute the aggregation when equality deletes are present,
        // because equality deletes are not reflected in the metadata-level record counts.
        if (hasEqualityDeletes(session, tableHandle)) {
            return Optional.empty();
        }

        if (!isAggregationPushdownEnabled(session)) {
            return Optional.empty();
        }

        // unenforced predicates are not supported
        if (!tableHandle.getUnenforcedPredicate().isNone()
                && tableHandle.getUnenforcedPredicate().getDomains().isPresent()
                && !tableHandle.getUnenforcedPredicate().getDomains().get().isEmpty()) {
            return Optional.empty();
        }

        // GROUP BY is not supported; only the single, global grouping set is handled
        if (!groupingSets.equals(List.of(List.of()))) {
            return Optional.empty();
        }

        ImmutableList.Builder<ConnectorExpression> projections = ImmutableList.builder();
        ImmutableList.Builder<Assignment> resultAssignments = ImmutableList.builder();
        ImmutableList.Builder<IcebergColumnHandle> aggregateColumnsBuilder = ImmutableList.builder();

        Set<IcebergColumnHandle> projectionsSet = new HashSet<>();

        if (aggregates.size() != 1) {
            // not handling multiple aggregations for now
            return Optional.empty();
        }

        AggregateFunction aggregate = aggregates.get(0);

        Optional<AggregateExpression> rewriteResult = aggregateFunctionRewriter.rewrite(session, aggregate, assignments);
        if (rewriteResult.isEmpty()) {
            return Optional.empty();
        }
        AggregateExpression aggregateExpression = rewriteResult.get();

        if (aggregateExpression.getFunction().startsWith("count")) {
            IcebergColumnHandle aggregateIcebergColumnHandle = new IcebergColumnHandle(
                    aggregateExpression.toColumnIdentity(AggregateExpression.COUNT_AGGREGATE_COLUMN_ID),
                    aggregate.getOutputType(),
                    List.of(),
                    aggregate.getOutputType(),
                    false,
                    Optional.empty());
            aggregateColumnsBuilder.add(aggregateIcebergColumnHandle);
            projections.add(new Variable(aggregateIcebergColumnHandle.getName(), aggregateIcebergColumnHandle.getType()));
            projectionsSet.add(aggregateIcebergColumnHandle);
            resultAssignments.add(new Assignment(aggregateIcebergColumnHandle.getName(), aggregateIcebergColumnHandle, aggregateIcebergColumnHandle.getType()));
        }

        IcebergTableHandle tableHandleTemp = new IcebergTableHandle(
                tableHandle.getCatalog(),
                tableHandle.getSchemaName(),
                tableHandle.getTableName(),
                tableHandle.getTableType(),
                tableHandle.getSnapshotId(),
                tableHandle.getTableSchemaJson(),
                tableHandle.getPartitionSpecJson(),
                tableHandle.getFormatVersion(),
                tableHandle.getUnenforcedPredicate(),
                tableHandle.getEnforcedPredicate(),
                tableHandle.getLimit(),
                projectionsSet,
                tableHandle.getNameMappingJson(),
                tableHandle.getTableLocation(),
                tableHandle.getStorageProperties(),
                tableHandle.isRecordScannedFiles(),
                tableHandle.getMaxScannedFileSize(),
                tableHandle.getConstraintColumns(),
                tableHandle.getForAnalyze());

        return Optional.of(new AggregationApplicationResult<>(tableHandleTemp, projections.build(), resultAssignments.build(), ImmutableMap.of(), false));
    }

    private boolean hasEqualityDeletes(ConnectorSession session, IcebergTableHandle tableHandle)
    {
        Table icebergTable = catalog.loadTable(session, tableHandle.getSchemaTableName());

        if (icebergTable.currentSnapshot().summary().containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP)) {
            return Long.parseLong(icebergTable.currentSnapshot().summary().get(SnapshotSummary.TOTAL_EQ_DELETES_PROP)) > 0;
        }

        return false;
    }

    private static Set<Integer> identityPartitionColumnsInAllSpecs(Table table)
    {
        // Extract identity partition column source ids common to ALL specs

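`hasEqualityDeletes` guards the pushdown: equality deletes are not reflected in snapshot-level record counters, so the optimization backs off whenever any are present. Position deletes identify individual deleted rows, so a total row count can still be derived from the snapshot summary. The split source that actually computes the count in this PR is not part of this excerpt; the following is one plausible sketch of that derivation, assuming the standard `org.apache.iceberg.SnapshotSummary` property names.

```java
import java.util.Map;

import org.apache.iceberg.SnapshotSummary;
import org.apache.iceberg.Table;

// Sketch only: one way a total row count could be read from Iceberg snapshot metadata.
// Whether the PR computes it exactly this way is not visible in this excerpt.
static long countFromSnapshotSummary(Table icebergTable)
{
    Map<String, String> summary = icebergTable.currentSnapshot().summary();
    long totalRecords = Long.parseLong(summary.getOrDefault(SnapshotSummary.TOTAL_RECORDS_PROP, "0"));
    long positionDeletes = Long.parseLong(summary.getOrDefault(SnapshotSummary.TOTAL_POS_DELETES_PROP, "0"));
    // valid only when there are no equality deletes (see hasEqualityDeletes above)
    return totalRecords - positionDeletes;
}
```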
@@ -57,6 +57,8 @@
import io.trino.plugin.hive.parquet.ParquetPageSource;
import io.trino.plugin.hive.parquet.ParquetReaderConfig;
import io.trino.plugin.iceberg.IcebergParquetColumnIOConverter.FieldContext;
import io.trino.plugin.iceberg.aggregation.AggregateIcebergSplit;
import io.trino.plugin.iceberg.aggregation.AggregatePageSource;
import io.trino.plugin.iceberg.delete.DeleteFile;
import io.trino.plugin.iceberg.delete.DeleteFilter;
import io.trino.plugin.iceberg.delete.EqualityDeleteFilter;

@@ -251,36 +253,43 @@ public ConnectorPageSource createPageSource(
            List<ColumnHandle> columns,
            DynamicFilter dynamicFilter)
    {
        IcebergSplit split = (IcebergSplit) connectorSplit;
        List<IcebergColumnHandle> icebergColumns = columns.stream()
                .map(IcebergColumnHandle.class::cast)
                .collect(toImmutableList());
        IcebergTableHandle tableHandle = (IcebergTableHandle) connectorTable;
        Schema schema = SchemaParser.fromJson(tableHandle.getTableSchemaJson());
        PartitionSpec partitionSpec = PartitionSpecParser.fromJson(schema, split.getPartitionSpecJson());
        org.apache.iceberg.types.Type[] partitionColumnTypes = partitionSpec.fields().stream()
                .map(field -> field.transform().getResultType(schema.findType(field.sourceId())))
                .toArray(org.apache.iceberg.types.Type[]::new);

        return createPageSource(
                session,
                icebergColumns,
                schema,
                partitionSpec,
                PartitionData.fromJson(split.getPartitionDataJson(), partitionColumnTypes),
                split.getDeletes(),
                dynamicFilter,
                tableHandle.getUnenforcedPredicate(),
                split.getFileStatisticsDomain(),
                split.getPath(),
                split.getStart(),
                split.getLength(),
                split.getFileSize(),
                split.getFileRecordCount(),
                split.getPartitionDataJson(),
                split.getFileFormat(),
                split.getFileIoProperties(),
                tableHandle.getNameMappingJson().map(NameMappingParser::fromJson));

        if (shouldHandleAggregatePushDown(icebergColumns)) {
            AggregateIcebergSplit aggregateIcebergSplit = (AggregateIcebergSplit) connectorSplit;
            return new AggregatePageSource(icebergColumns, aggregateIcebergSplit.getTotalCount());
        }
        else {
            IcebergSplit split = (IcebergSplit) connectorSplit;
            IcebergTableHandle tableHandle = (IcebergTableHandle) connectorTable;
            Schema schema = SchemaParser.fromJson(tableHandle.getTableSchemaJson());
            PartitionSpec partitionSpec = PartitionSpecParser.fromJson(schema, split.getPartitionSpecJson());
            org.apache.iceberg.types.Type[] partitionColumnTypes = partitionSpec.fields().stream()
                    .map(field -> field.transform().getResultType(schema.findType(field.sourceId())))
                    .toArray(org.apache.iceberg.types.Type[]::new);

            return createPageSource(
                    session,
                    icebergColumns,
                    schema,
                    partitionSpec,
                    PartitionData.fromJson(split.getPartitionDataJson(), partitionColumnTypes),
                    split.getDeletes(),
                    dynamicFilter,
                    tableHandle.getUnenforcedPredicate(),
                    split.getFileStatisticsDomain(),
                    split.getPath(),
                    split.getStart(),
                    split.getLength(),
                    split.getFileSize(),
                    split.getFileRecordCount(),
                    split.getPartitionDataJson(),
                    split.getFileFormat(),
                    split.getFileIoProperties(),
                    tableHandle.getNameMappingJson().map(NameMappingParser::fromJson));
        }
    }

    public ConnectorPageSource createPageSource(

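When the projected columns consist solely of the synthetic aggregate column, the split is expected to be an `AggregateIcebergSplit` carrying the precomputed total, and the returned `AggregatePageSource` simply emits that value. Neither class appears in this excerpt; below is a minimal sketch of the single-row, single-column page such a page source could produce — an assumption about its behavior, not its actual code.

```java
import io.trino.spi.Page;
import io.trino.spi.block.BlockBuilder;

import static io.trino.spi.type.BigintType.BIGINT;

// Sketch only: build a page with one BIGINT position holding the pushed-down count.
static Page buildCountPage(long totalCount)
{
    BlockBuilder builder = BIGINT.createBlockBuilder(null, 1);
    BIGINT.writeLong(builder, totalCount);
    return new Page(1, builder.build());
}
```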
@@ -1542,6 +1551,11 @@ private static TrinoException handleException(ParquetDataSourceId dataSourceId,
        return new TrinoException(ICEBERG_CURSOR_ERROR, format("Failed to read Parquet file: %s", dataSourceId), exception);
    }

    private static boolean shouldHandleAggregatePushDown(List<IcebergColumnHandle> columns)
    {
        return columns.size() == 1 && columns.get(0).isAggregateColumn();
    }

    public static final class ReaderPageSourceWithRowPositions
    {
        private final ReaderPageSource readerPageSource;

Generally, an improvement that is not enabled benefits only those who know about it (those who added it, plus a small group of other people). If there is something preventing the feature from being enabled, it should be documented in the code as a // TODO comment, with a link to an issue focused solely on that (e.g. #10974 as it stands today is too broad).
Even if this doesn't get enabled by default, it should be enabled in at least one test class extending from BaseConnectorTest, to get reasonable test coverage.
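A hypothetical sketch of what such coverage could look like in a BaseConnectorTest subclass, using the standard pushdown assertion — the test name and query are illustrative, not part of this PR:

```java
// Hypothetical test sketch: verifies count(*) is answered from metadata instead of a table scan.
@Test
public void testCountAggregationPushdown()
{
    assertThat(query("SELECT count(*) FROM nation"))
            .isFullyPushedDown();
}
```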
It should be enabled by default, since it's only about count, which according to the spec should be accurate in the metadata files.
Agreed. However, we need to understand what the approach would look like for supporting aggregations other than count, since the input information and the guarantees are so different.