diff --git a/presto-docs/src/main/sphinx/connector/iceberg.rst b/presto-docs/src/main/sphinx/connector/iceberg.rst index a51b9f0169d2e..8c38bf77ecef5 100644 --- a/presto-docs/src/main/sphinx/connector/iceberg.rst +++ b/presto-docs/src/main/sphinx/connector/iceberg.rst @@ -323,6 +323,8 @@ Property Name Description ``iceberg.metrics-max-inferred-column`` The maximum number of columns for which metrics ``100`` are collected. +``iceberg.max-statistics-file-cache-size`` Maximum size in bytes that should be consumed by the ``256MB`` + statistics file cache. ======================================================= ============================================================= ============ Table Properties diff --git a/presto-iceberg/pom.xml b/presto-iceberg/pom.xml index f8d65d59164df..1550d47846ac1 100644 --- a/presto-iceberg/pom.xml +++ b/presto-iceberg/pom.xml @@ -187,6 +187,11 @@ provided + + com.facebook.airlift + stats + + com.facebook.drift drift-api diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java index be35159d509f7..eb7b5df390b78 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java @@ -25,6 +25,7 @@ import com.facebook.presto.hive.HiveWrittenPartitions; import com.facebook.presto.hive.NodeVersion; import com.facebook.presto.iceberg.changelog.ChangelogUtil; +import com.facebook.presto.iceberg.statistics.StatisticsFileCache; import com.facebook.presto.spi.ColumnHandle; import com.facebook.presto.spi.ColumnMetadata; import com.facebook.presto.spi.ConnectorInsertTableHandle; @@ -182,6 +183,7 @@ public abstract class IcebergAbstractMetadata protected final RowExpressionService rowExpressionService; protected final FilterStatsCalculatorService filterStatsCalculatorService; protected Transaction transaction; + protected final StatisticsFileCache statisticsFileCache; private final StandardFunctionResolution functionResolution; private final ConcurrentMap icebergTables = new ConcurrentHashMap<>(); @@ -192,7 +194,8 @@ public IcebergAbstractMetadata( RowExpressionService rowExpressionService, JsonCodec commitTaskCodec, NodeVersion nodeVersion, - FilterStatsCalculatorService filterStatsCalculatorService) + FilterStatsCalculatorService filterStatsCalculatorService, + StatisticsFileCache statisticsFileCache) { this.typeManager = requireNonNull(typeManager, "typeManager is null"); this.commitTaskCodec = requireNonNull(commitTaskCodec, "commitTaskCodec is null"); @@ -200,6 +203,7 @@ public IcebergAbstractMetadata( this.rowExpressionService = requireNonNull(rowExpressionService, "rowExpressionService is null"); this.nodeVersion = requireNonNull(nodeVersion, "nodeVersion is null"); this.filterStatsCalculatorService = requireNonNull(filterStatsCalculatorService, "filterStatsCalculatorService is null"); + this.statisticsFileCache = requireNonNull(statisticsFileCache, "statisticsFileCache is null"); } protected final Table getIcebergTable(ConnectorSession session, SchemaTableName schemaTableName) @@ -733,7 +737,7 @@ public Map getColumnHandles(ConnectorSession session, Conn @Override public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTableHandle tableHandle, Optional tableLayoutHandle, List columnHandles, Constraint constraint) { - TableStatistics baseStatistics = calculateBaseTableStatistics(this, typeManager, session, (IcebergTableHandle) tableHandle, tableLayoutHandle, columnHandles, constraint); + TableStatistics baseStatistics = calculateBaseTableStatistics(this, typeManager, session, statisticsFileCache, (IcebergTableHandle) tableHandle, tableLayoutHandle, columnHandles, constraint); return calculateStatisticsConsideringLayout(filterStatsCalculatorService, rowExpressionService, baseStatistics, session, tableLayoutHandle); } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergCommonModule.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergCommonModule.java index 66d06f3a31061..1ea2d23ef9a93 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergCommonModule.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergCommonModule.java @@ -44,6 +44,8 @@ import com.facebook.presto.iceberg.procedure.RemoveOrphanFiles; import com.facebook.presto.iceberg.procedure.RollbackToSnapshotProcedure; import com.facebook.presto.iceberg.procedure.UnregisterTableProcedure; +import com.facebook.presto.iceberg.statistics.StatisticsFileCache; +import com.facebook.presto.iceberg.statistics.StatisticsFileCacheKey; import com.facebook.presto.orc.CachingStripeMetadataSource; import com.facebook.presto.orc.DwrfAwareStripeMetadataSourceFactory; import com.facebook.presto.orc.EncryptionLibrary; @@ -71,6 +73,7 @@ import com.facebook.presto.spi.connector.ConnectorPlanOptimizerProvider; import com.facebook.presto.spi.connector.ConnectorSplitManager; import com.facebook.presto.spi.procedure.Procedure; +import com.facebook.presto.spi.statistics.ColumnStatistics; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import com.google.inject.Binder; @@ -170,6 +173,20 @@ public void setup(Binder binder) binder.bind(ConnectorPlanOptimizerProvider.class).to(IcebergPlanOptimizerProvider.class).in(Scopes.SINGLETON); } + @Singleton + @Provides + public StatisticsFileCache createStatisticsFileCache(IcebergConfig config, MBeanExporter exporter) + { + Cache delegate = CacheBuilder.newBuilder() + .maximumWeight(config.getMaxStatisticsFileCacheSize().toBytes()) + .weigher((key, entry) -> (int) entry.getEstimatedSize()) + .recordStats() + .build(); + CacheStatsMBean bean = new CacheStatsMBean(delegate); + exporter.export(generatedNameOf(StatisticsFileCache.class, connectorId), bean); + return new StatisticsFileCache(delegate); + } + @ForCachingHiveMetastore @Singleton @Provides diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java index 74464f938f516..fe11f4b2fefbe 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java @@ -19,6 +19,7 @@ import com.facebook.presto.spi.statistics.ColumnStatisticType; import com.google.common.base.Splitter; import com.google.common.collect.ImmutableList; +import io.airlift.units.DataSize; import org.apache.iceberg.hadoop.HadoopFileIO; import javax.validation.constraints.DecimalMax; @@ -33,6 +34,8 @@ import static com.facebook.presto.iceberg.CatalogType.HIVE; import static com.facebook.presto.iceberg.IcebergFileFormat.PARQUET; import static com.facebook.presto.iceberg.util.StatisticsUtil.decodeMergeFlags; +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static io.airlift.units.DataSize.succinctDataSize; import static org.apache.iceberg.CatalogProperties.IO_MANIFEST_CACHE_EXPIRATION_INTERVAL_MS_DEFAULT; import static org.apache.iceberg.CatalogProperties.IO_MANIFEST_CACHE_MAX_CONTENT_LENGTH_DEFAULT; import static org.apache.iceberg.CatalogProperties.IO_MANIFEST_CACHE_MAX_TOTAL_BYTES_DEFAULT; @@ -67,6 +70,7 @@ public class IcebergConfig private long manifestCacheExpireDuration = IO_MANIFEST_CACHE_EXPIRATION_INTERVAL_MS_DEFAULT; private long manifestCacheMaxContentLength = IO_MANIFEST_CACHE_MAX_CONTENT_LENGTH_DEFAULT; private int splitManagerThreads = Runtime.getRuntime().availableProcessors(); + private DataSize maxStatisticsFileCacheSize = succinctDataSize(256, MEGABYTE); @NotNull public FileFormat getFileFormat() @@ -395,4 +399,17 @@ public IcebergConfig setMetricsMaxInferredColumn(int metricsMaxInferredColumn) this.metricsMaxInferredColumn = metricsMaxInferredColumn; return this; } + + public DataSize getMaxStatisticsFileCacheSize() + { + return maxStatisticsFileCacheSize; + } + + @Config("iceberg.max-statistics-file-cache-size") + @ConfigDescription("The maximum size in bytes the statistics file cache should consume") + public IcebergConfig setMaxStatisticsFileCacheSize(DataSize maxStatisticsFileCacheSize) + { + this.maxStatisticsFileCacheSize = maxStatisticsFileCacheSize; + return this; + } } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergErrorCode.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergErrorCode.java index 1f1e5fdf5146a..3548ef24ba496 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergErrorCode.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergErrorCode.java @@ -38,7 +38,8 @@ public enum IcebergErrorCode ICEBERG_INVALID_TABLE_TIMESTAMP(12, USER_ERROR), ICEBERG_ROLLBACK_ERROR(13, EXTERNAL), ICEBERG_INVALID_FORMAT_VERSION(14, USER_ERROR), - ICEBERG_UNKNOWN_MANIFEST_TYPE(15, EXTERNAL); + ICEBERG_UNKNOWN_MANIFEST_TYPE(15, EXTERNAL), + ICEBERG_CACHE_WRITE_ERROR(16, INTERNAL_ERROR); private final ErrorCode errorCode; diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergHiveMetadata.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergHiveMetadata.java index f64337a1079db..ea610a5037083 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergHiveMetadata.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergHiveMetadata.java @@ -33,6 +33,7 @@ import com.facebook.presto.hive.metastore.PrestoTableType; import com.facebook.presto.hive.metastore.PrincipalPrivileges; import com.facebook.presto.hive.metastore.Table; +import com.facebook.presto.iceberg.statistics.StatisticsFileCache; import com.facebook.presto.spi.ColumnHandle; import com.facebook.presto.spi.ConnectorNewTableLayout; import com.facebook.presto.spi.ConnectorOutputTableHandle; @@ -160,9 +161,10 @@ public IcebergHiveMetadata( JsonCodec commitTaskCodec, NodeVersion nodeVersion, FilterStatsCalculatorService filterStatsCalculatorService, - IcebergHiveTableOperationsConfig hiveTableOeprationsConfig) + IcebergHiveTableOperationsConfig hiveTableOeprationsConfig, + StatisticsFileCache statisticsFileCache) { - super(typeManager, functionResolution, rowExpressionService, commitTaskCodec, nodeVersion, filterStatsCalculatorService); + super(typeManager, functionResolution, rowExpressionService, commitTaskCodec, nodeVersion, filterStatsCalculatorService, statisticsFileCache); this.metastore = requireNonNull(metastore, "metastore is null"); this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); this.hiveTableOeprationsConfig = requireNonNull(hiveTableOeprationsConfig, "hiveTableOperationsConfig is null"); @@ -448,7 +450,7 @@ public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTab { IcebergTableHandle handle = (IcebergTableHandle) tableHandle; org.apache.iceberg.Table icebergTable = getIcebergTable(session, handle.getSchemaTableName()); - TableStatistics icebergStatistics = calculateBaseTableStatistics(this, typeManager, session, (IcebergTableHandle) tableHandle, tableLayoutHandle, columnHandles, constraint); + TableStatistics icebergStatistics = calculateBaseTableStatistics(this, typeManager, session, statisticsFileCache, (IcebergTableHandle) tableHandle, tableLayoutHandle, columnHandles, constraint); EnumSet mergeFlags = getHiveStatisticsMergeStrategy(session); TableStatistics mergedStatistics = Optional.of(mergeFlags) .filter(set -> !set.isEmpty()) diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergHiveMetadataFactory.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergHiveMetadataFactory.java index f8175f1b0c856..5bc8c9e4a8b0c 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergHiveMetadataFactory.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergHiveMetadataFactory.java @@ -18,6 +18,7 @@ import com.facebook.presto.hive.HdfsEnvironment; import com.facebook.presto.hive.NodeVersion; import com.facebook.presto.hive.metastore.ExtendedHiveMetastore; +import com.facebook.presto.iceberg.statistics.StatisticsFileCache; import com.facebook.presto.spi.connector.ConnectorMetadata; import com.facebook.presto.spi.function.StandardFunctionResolution; import com.facebook.presto.spi.plan.FilterStatsCalculatorService; @@ -39,6 +40,7 @@ public class IcebergHiveMetadataFactory final NodeVersion nodeVersion; final FilterStatsCalculatorService filterStatsCalculatorService; final IcebergHiveTableOperationsConfig operationsConfig; + final StatisticsFileCache statisticsFileCache; @Inject public IcebergHiveMetadataFactory( @@ -50,7 +52,8 @@ public IcebergHiveMetadataFactory( JsonCodec commitTaskCodec, NodeVersion nodeVersion, FilterStatsCalculatorService filterStatsCalculatorService, - IcebergHiveTableOperationsConfig operationsConfig) + IcebergHiveTableOperationsConfig operationsConfig, + StatisticsFileCache statisticsFileCache) { this.metastore = requireNonNull(metastore, "metastore is null"); this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); @@ -61,6 +64,7 @@ public IcebergHiveMetadataFactory( this.nodeVersion = requireNonNull(nodeVersion, "nodeVersion is null"); this.filterStatsCalculatorService = requireNonNull(filterStatsCalculatorService, "filterStatsCalculatorService is null"); this.operationsConfig = requireNonNull(operationsConfig, "operationsConfig is null"); + this.statisticsFileCache = requireNonNull(statisticsFileCache, "statisticsFileCache is null"); } public ConnectorMetadata create() @@ -74,6 +78,7 @@ public ConnectorMetadata create() commitTaskCodec, nodeVersion, filterStatsCalculatorService, - operationsConfig); + operationsConfig, + statisticsFileCache); } } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergNativeMetadata.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergNativeMetadata.java index bb18a676b2450..44205752b159f 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergNativeMetadata.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergNativeMetadata.java @@ -18,6 +18,7 @@ import com.facebook.presto.common.type.TypeManager; import com.facebook.presto.hive.NodeVersion; import com.facebook.presto.hive.TableAlreadyExistsException; +import com.facebook.presto.iceberg.statistics.StatisticsFileCache; import com.facebook.presto.iceberg.util.IcebergPrestoModelConverters; import com.facebook.presto.spi.ConnectorNewTableLayout; import com.facebook.presto.spi.ConnectorOutputTableHandle; @@ -82,9 +83,10 @@ public IcebergNativeMetadata( JsonCodec commitTaskCodec, CatalogType catalogType, NodeVersion nodeVersion, - FilterStatsCalculatorService filterStatsCalculatorService) + FilterStatsCalculatorService filterStatsCalculatorService, + StatisticsFileCache statisticsFileCache) { - super(typeManager, functionResolution, rowExpressionService, commitTaskCodec, nodeVersion, filterStatsCalculatorService); + super(typeManager, functionResolution, rowExpressionService, commitTaskCodec, nodeVersion, filterStatsCalculatorService, statisticsFileCache); this.catalogFactory = requireNonNull(catalogFactory, "catalogFactory is null"); this.catalogType = requireNonNull(catalogType, "catalogType is null"); } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergNativeMetadataFactory.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergNativeMetadataFactory.java index 33a3caf34b9f8..98e0e1f546946 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergNativeMetadataFactory.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergNativeMetadataFactory.java @@ -16,6 +16,7 @@ import com.facebook.airlift.json.JsonCodec; import com.facebook.presto.common.type.TypeManager; import com.facebook.presto.hive.NodeVersion; +import com.facebook.presto.iceberg.statistics.StatisticsFileCache; import com.facebook.presto.spi.connector.ConnectorMetadata; import com.facebook.presto.spi.function.StandardFunctionResolution; import com.facebook.presto.spi.plan.FilterStatsCalculatorService; @@ -36,6 +37,7 @@ public class IcebergNativeMetadataFactory final RowExpressionService rowExpressionService; final NodeVersion nodeVersion; final FilterStatsCalculatorService filterStatsCalculatorService; + final StatisticsFileCache statisticsFileCache; @Inject public IcebergNativeMetadataFactory( @@ -46,7 +48,8 @@ public IcebergNativeMetadataFactory( RowExpressionService rowExpressionService, JsonCodec commitTaskCodec, NodeVersion nodeVersion, - FilterStatsCalculatorService filterStatsCalculatorService) + FilterStatsCalculatorService filterStatsCalculatorService, + StatisticsFileCache statisticsFileCache) { this.catalogFactory = requireNonNull(catalogFactory, "catalogFactory is null"); this.typeManager = requireNonNull(typeManager, "typeManager is null"); @@ -57,10 +60,11 @@ public IcebergNativeMetadataFactory( requireNonNull(config, "config is null"); this.catalogType = config.getCatalogType(); this.filterStatsCalculatorService = requireNonNull(filterStatsCalculatorService, "filterStatsCalculatorService is null"); + this.statisticsFileCache = requireNonNull(statisticsFileCache, "statisticsFileCache is null"); } public ConnectorMetadata create() { - return new IcebergNativeMetadata(catalogFactory, typeManager, functionResolution, rowExpressionService, commitTaskCodec, catalogType, nodeVersion, filterStatsCalculatorService); + return new IcebergNativeMetadata(catalogFactory, typeManager, functionResolution, rowExpressionService, commitTaskCodec, catalogType, nodeVersion, filterStatsCalculatorService, statisticsFileCache); } } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java index 9da3de696f620..b86a7e9c87cb0 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java @@ -14,11 +14,14 @@ package com.facebook.presto.iceberg; import com.facebook.airlift.log.Logger; +import com.facebook.presto.common.RuntimeUnit; import com.facebook.presto.common.block.Block; import com.facebook.presto.common.predicate.TupleDomain; import com.facebook.presto.common.type.FixedWidthType; import com.facebook.presto.common.type.TypeManager; import com.facebook.presto.hive.NodeVersion; +import com.facebook.presto.iceberg.statistics.StatisticsFileCache; +import com.facebook.presto.iceberg.statistics.StatisticsFileCacheKey; import com.facebook.presto.spi.ConnectorSession; import com.facebook.presto.spi.Constraint; import com.facebook.presto.spi.PrestoException; @@ -31,6 +34,7 @@ import com.facebook.presto.spi.statistics.TableStatistics; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.theta.CompactSketch; import org.apache.iceberg.ContentFile; @@ -93,6 +97,8 @@ import static com.facebook.presto.spi.statistics.ColumnStatisticType.TOTAL_SIZE_IN_BYTES; import static com.facebook.presto.spi.statistics.SourceInfo.ConfidenceLevel.HIGH; import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.ImmutableSet.toImmutableSet; import static com.google.common.collect.Iterables.getOnlyElement; import static java.lang.Long.parseLong; import static java.lang.Math.abs; @@ -112,6 +118,10 @@ public class TableStatisticsMaker private final ConnectorSession session; private final TypeManager typeManager; + private static final String STATISITCS_CACHE_METRIC_FILE_SIZE_FORMAT = "StatisticsFileCache/PuffinFileSize/%s/%s"; + private static final String STATISITCS_CACHE_METRIC_FILE_COLUMN_COUNT_FORMAT = "StatisticsFileCache/ColumnCount/%s/%s"; + private static final String STATISITCS_CACHE_METRIC_PARTIAL_MISS_FORMAT = "StatisticsFileCache/PartialMiss/%s/%s"; + private TableStatisticsMaker(Table icebergTable, ConnectorSession session, TypeManager typeManager) { this.icebergTable = icebergTable; @@ -129,12 +139,24 @@ private TableStatisticsMaker(Table icebergTable, ConnectorSession session, TypeM .put(ICEBERG_DATA_SIZE_BLOB_TYPE_ID, TableStatisticsMaker::readDataSizeBlob) .build(); - public static TableStatistics getTableStatistics(ConnectorSession session, TypeManager typeManager, Optional> currentPredicate, Constraint constraint, IcebergTableHandle tableHandle, Table icebergTable, List columns) + public static TableStatistics getTableStatistics( + ConnectorSession session, + TypeManager typeManager, + StatisticsFileCache statisticsFileCache, + Optional> currentPredicate, + Constraint constraint, + IcebergTableHandle tableHandle, + Table icebergTable, + List columns) { - return new TableStatisticsMaker(icebergTable, session, typeManager).makeTableStatistics(tableHandle, currentPredicate, constraint, columns); + return new TableStatisticsMaker(icebergTable, session, typeManager).makeTableStatistics(statisticsFileCache, tableHandle, currentPredicate, constraint, columns); } - private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Optional> currentPredicate, Constraint constraint, List selectedColumns) + private TableStatistics makeTableStatistics(StatisticsFileCache statisticsFileCache, + IcebergTableHandle tableHandle, + Optional> currentPredicate, + Constraint constraint, + List selectedColumns) { if (!tableHandle.getIcebergTableName().getSnapshotId().isPresent() || constraint.getSummary().isNone()) { return TableStatistics.builder() @@ -192,8 +214,14 @@ private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Opti TableStatistics.Builder result = TableStatistics.builder(); result.setRowCount(Estimate.of(recordCount)); - Map tableStats = getClosestStatisticsFileForSnapshot(tableHandle) - .map(this::loadStatisticsFile).orElseGet(Collections::emptyMap); + // transformValues returns a view. We need to make a copy of the map in order to update + Map tableStats = ImmutableMap.copyOf( + Maps.transformValues(getClosestStatisticsFileForSnapshot(tableHandle) + .map(file -> loadStatisticsFile(tableHandle, file, statisticsFileCache, selectedColumns.stream() + .map(IcebergColumnHandle::getId) + .collect(Collectors.toList()))) + .orElseGet(Collections::emptyMap), + ColumnStatistics::buildFrom)); // scale all NDV values loaded from puffin files based on row count totalRecordCount.ifPresent(fullTableRecordCount -> tableStats.forEach((id, stat) -> stat.setDistinctValuesCount(stat.getDistinctValuesCount().map(value -> value * recordCount / fullTableRecordCount)))); @@ -497,8 +525,42 @@ private Optional getClosestStatisticsFileForSnapshot(IcebergTabl /** * Builds a map of field ID to ColumnStatistics for a particular {@link StatisticsFile}. */ - private Map loadStatisticsFile(StatisticsFile file) + private Map loadStatisticsFile(IcebergTableHandle tableHandle, StatisticsFile file, StatisticsFileCache statisticsFileCache, List columnIds) { + // first, try to load all stats from the cache. If the map doesn't contain all keys, load the missing + // stats into the cache. + Map cachedStats = columnIds.stream() + .map(id -> Pair.of(id, statisticsFileCache.getIfPresent(new StatisticsFileCacheKey(file, id)))) + .filter(pair -> pair.second() != null) + .collect(toImmutableMap(Pair::first, Pair::second)); + Set missingStats = columnIds.stream().filter(id -> !cachedStats.containsKey(id)).collect(toImmutableSet()); + if (missingStats.isEmpty()) { + return cachedStats; + } + if (!cachedStats.isEmpty()) { + session.getRuntimeStats().addMetricValue( + String.format(STATISITCS_CACHE_METRIC_PARTIAL_MISS_FORMAT, + tableHandle.getSchemaTableName(), + file.path()), + RuntimeUnit.NONE, 1); + } + + String fileSizeMetricName = String.format(STATISITCS_CACHE_METRIC_FILE_SIZE_FORMAT, + tableHandle.getSchemaTableName(), + file.path()); + if (session.getRuntimeStats().getMetric(fileSizeMetricName) == null) { + long fileSize = file.fileFooterSizeInBytes(); + session.getRuntimeStats().addMetricValue(fileSizeMetricName, RuntimeUnit.NONE, fileSize); + statisticsFileCache.recordFileSize(fileSize); + } + String columnCountMetricName = String.format(STATISITCS_CACHE_METRIC_FILE_COLUMN_COUNT_FORMAT, + tableHandle.getSchemaTableName(), + file.path()); + if (session.getRuntimeStats().getMetric(columnCountMetricName) == null) { + long columnCount = file.blobMetadata().size(); + session.getRuntimeStats().addMetricValue(columnCountMetricName, RuntimeUnit.NONE, columnCount); + statisticsFileCache.recordColumnCount(columnCount); + } Map result = new HashMap<>(); try (FileIO io = icebergTable.io()) { InputFile inputFile = io.newInputFile(file.path()); @@ -507,6 +569,10 @@ private Map loadStatisticsFile(StatisticsFile BlobMetadata metadata = data.first(); ByteBuffer blob = data.second(); Integer field = getOnlyElement(metadata.inputFields()); + if (!missingStats.contains(field)) { + continue; + } + Optional.ofNullable(puffinStatReaders.get(metadata.type())) .ifPresent(statReader -> { result.compute(field, (key, value) -> { @@ -523,8 +589,19 @@ private Map loadStatisticsFile(StatisticsFile throw new PrestoException(ICEBERG_FILESYSTEM_ERROR, "failed to read statistics file at " + file.path(), e); } } - - return ImmutableMap.copyOf(result); + Map computedResults = new HashMap<>(Maps.transformValues(result, ColumnStatistics.Builder::build)); + missingStats.stream() + .filter(id -> !computedResults.containsKey(id)) + .forEach(id -> computedResults.put(id, ColumnStatistics.empty())); + ImmutableMap.Builder finalResult = ImmutableMap.builder(); + computedResults.forEach((key, value) -> { + // stats for a particular column may not appear in a file. Add a cache entry to + // denote that this file has been read for a particular column to avoid reading the + // file again + statisticsFileCache.put(new StatisticsFileCacheKey(file, key), value); + finalResult.put(key, value); + }); + return finalResult.build(); } public static List getSupportedColumnStatistics(String columnName, com.facebook.presto.common.type.Type type) diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/StatisticsFileCache.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/StatisticsFileCache.java new file mode 100644 index 0000000000000..faf61aa6427fa --- /dev/null +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/StatisticsFileCache.java @@ -0,0 +1,57 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.iceberg.statistics; + +import com.facebook.airlift.stats.DistributionStat; +import com.facebook.presto.spi.statistics.ColumnStatistics; +import com.google.common.cache.Cache; +import com.google.common.cache.ForwardingCache.SimpleForwardingCache; +import org.weakref.jmx.Managed; +import org.weakref.jmx.Nested; + +public class StatisticsFileCache + extends SimpleForwardingCache +{ + private final DistributionStat fileSizes = new DistributionStat(); + private final DistributionStat columnCounts = new DistributionStat(); + + public StatisticsFileCache(Cache delegate) + { + super(delegate); + } + + @Managed + @Nested + public DistributionStat getFileSizeDistribution() + { + return fileSizes; + } + + public void recordFileSize(long size) + { + fileSizes.add(size); + } + + @Managed + @Nested + public DistributionStat getColumnCountDistribution() + { + return columnCounts; + } + + public void recordColumnCount(long count) + { + columnCounts.add(count); + } +} diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/StatisticsFileCacheKey.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/StatisticsFileCacheKey.java new file mode 100644 index 0000000000000..65afd009dd0a4 --- /dev/null +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/StatisticsFileCacheKey.java @@ -0,0 +1,51 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.iceberg.statistics; + +import org.apache.iceberg.StatisticsFile; + +import java.util.Objects; + +import static java.util.Objects.requireNonNull; + +public class StatisticsFileCacheKey +{ + private final StatisticsFile file; + private final int columnId; + + public StatisticsFileCacheKey(StatisticsFile file, int columnId) + { + this.file = requireNonNull(file, "file is null"); + this.columnId = columnId; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (!(o instanceof StatisticsFileCacheKey)) { + return false; + } + StatisticsFileCacheKey that = (StatisticsFileCacheKey) o; + return columnId == that.columnId && Objects.equals(file, that.file); + } + + @Override + public int hashCode() + { + return Objects.hash(file, columnId); + } +} diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/util/StatisticsUtil.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/util/StatisticsUtil.java index 446296346c7f4..a0587ccd8b069 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/util/StatisticsUtil.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/util/StatisticsUtil.java @@ -22,6 +22,7 @@ import com.facebook.presto.iceberg.IcebergTableHandle; import com.facebook.presto.iceberg.IcebergTableLayoutHandle; import com.facebook.presto.iceberg.TableStatisticsMaker; +import com.facebook.presto.iceberg.statistics.StatisticsFileCache; import com.facebook.presto.spi.ColumnHandle; import com.facebook.presto.spi.ConnectorSession; import com.facebook.presto.spi.ConnectorTableLayoutHandle; @@ -214,6 +215,7 @@ public static TableStatistics calculateBaseTableStatistics( ConnectorMetadata metadata, TypeManager typeManager, ConnectorSession session, + StatisticsFileCache statisticsFileCache, IcebergTableHandle tableHandle, Optional tableLayoutHandle, List columnHandles, @@ -226,6 +228,7 @@ public static TableStatistics calculateBaseTableStatistics( .collect(toImmutableList()), tableLayoutHandle.map(IcebergTableLayoutHandle.class::cast)); return TableStatisticsMaker.getTableStatistics(session, typeManager, + statisticsFileCache, tableLayoutHandle .map(IcebergTableLayoutHandle.class::cast) .map(IcebergTableLayoutHandle::getValidPredicate), diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java index 7b2cb01e61749..fa140c98a8b94 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java @@ -30,12 +30,15 @@ import com.facebook.presto.hive.MetastoreClientConfig; import com.facebook.presto.hive.authentication.NoHdfsAuthentication; import com.facebook.presto.iceberg.delete.DeleteFile; +import com.facebook.presto.metadata.CatalogMetadata; import com.facebook.presto.metadata.Metadata; +import com.facebook.presto.metadata.MetadataUtil; import com.facebook.presto.spi.ColumnHandle; import com.facebook.presto.spi.Constraint; import com.facebook.presto.spi.PrestoException; import com.facebook.presto.spi.TableHandle; import com.facebook.presto.spi.analyzer.MetadataResolver; +import com.facebook.presto.spi.connector.classloader.ClassLoaderSafeConnectorMetadata; import com.facebook.presto.spi.security.AllowAllAccessControl; import com.facebook.presto.spi.statistics.ColumnStatistics; import com.facebook.presto.spi.statistics.Estimate; @@ -46,9 +49,11 @@ import com.facebook.presto.tests.AbstractTestQueryFramework; import com.google.common.base.Joiner; import com.google.common.base.Strings; +import com.google.common.cache.CacheStats; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Maps; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -81,6 +86,7 @@ import java.io.Closeable; import java.io.File; import java.io.IOException; +import java.lang.reflect.Field; import java.nio.file.Path; import java.time.LocalDateTime; import java.time.LocalTime; @@ -933,7 +939,7 @@ public void testStatsByDistance() Session weightedSession = Session.builder(getSession()) .setCatalogSessionProperty("iceberg", STATISTIC_SNAPSHOT_RECORD_DIFFERENCE_WEIGHT, "10000000") .build(); - Function ndvs = (x) -> columnStatsFor(getTableStats("test_stat_dist", Optional.of(snapshots.get(x)), weightedSession), "col0") + Function ndvs = (x) -> columnStatsFor(getTableStats("test_stat_dist", Optional.of(snapshots.get(x)), weightedSession, Optional.empty()), "col0") .getDistinctValuesCount(); assertEquals(ndvs.apply(0).getValue(), 1); assertEquals(ndvs.apply(1).getValue(), 1); @@ -991,10 +997,10 @@ private TableStatistics getTableStats(String name) private TableStatistics getTableStats(String name, Optional snapshot) { - return getTableStats(name, snapshot, getSession()); + return getTableStats(name, snapshot, getSession(), Optional.empty()); } - private TableStatistics getTableStats(String name, Optional snapshot, Session session) + private TableStatistics getTableStats(String name, Optional snapshot, Session session, Optional> columns) { TransactionId transactionId = getQueryRunner().getTransactionManager().beginTransaction(false); Session metadataSession = session.beginTransactionId( @@ -1008,7 +1014,9 @@ private TableStatistics getTableStats(String name, Optional snapshot, Sess TableHandle handle = resolver.getTableHandle(QualifiedObjectName.valueOf(qualifiedName)).get(); return metadata.getTableStatistics(metadataSession, handle, - new ArrayList<>(resolver.getColumnHandles(handle).values()), + new ArrayList<>(columns + .map(columnSet -> Maps.filterKeys(resolver.getColumnHandles(handle), columnSet::contains)) + .orElse(resolver.getColumnHandles(handle)).values()), Constraint.alwaysTrue()); } @@ -1829,6 +1837,36 @@ public void testFilterWithRemainingPredicate(boolean pushdownFilterEnabled) assertQuerySucceeds("DROP TABLE test_filterstats_remaining_predicate"); } + public void testStatisticsFileCache() + throws Exception + { + assertQuerySucceeds("CREATE TABLE test_statistics_file_cache(i int)"); + assertUpdate("INSERT INTO test_statistics_file_cache VALUES 1, 2, 3, 4, 5", 5); + assertQuerySucceeds("ANALYZE test_statistics_file_cache"); + Session session = Session.builder(getSession()) + .setTransactionId(getQueryRunner().getTransactionManager().beginTransaction(false)) + .build(); + Optional handle = MetadataUtil.getOptionalTableHandle(session, + getQueryRunner().getTransactionManager(), + QualifiedObjectName.valueOf(session.getCatalog().get(), session.getSchema().get(), "test_statistics_file_cache"), + Optional.empty()); + CatalogMetadata catalogMetadata = getQueryRunner().getTransactionManager() + .getCatalogMetadata(session.getTransactionId().get(), handle.get().getConnectorId()); + // There isn't an easy way to access the cache internally, so use some reflection to grab it + Field delegate = ClassLoaderSafeConnectorMetadata.class.getDeclaredField("delegate"); + delegate.setAccessible(true); + IcebergAbstractMetadata metadata = (IcebergAbstractMetadata) delegate.get(catalogMetadata.getMetadataFor(handle.get().getConnectorId())); + CacheStats initial = metadata.statisticsFileCache.stats(); + assertEquals(metadata.statisticsFileCache.stats().minus(initial).hitCount(), 0); + TableStatistics stats = getTableStats("test_statistics_file_cache", Optional.empty(), getSession(), Optional.of(ImmutableList.of("i"))); + assertEquals(stats.getRowCount().getValue(), 5); + assertEquals(metadata.statisticsFileCache.stats().minus(initial).missCount(), 1); + getTableStats("test_statistics_file_cache", Optional.empty(), getSession(), Optional.of(ImmutableList.of("i"))); + assertEquals(metadata.statisticsFileCache.stats().minus(initial).missCount(), 1); + assertEquals(metadata.statisticsFileCache.stats().minus(initial).hitCount(), 1); + getQueryRunner().execute("DROP TABLE test_statistics_file_cache"); + } + private void testCheckDeleteFiles(Table icebergTable, int expectedSize, List expectedFileContent) { // check delete file list diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java index cbfeebf72797e..d28503a27d316 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java @@ -30,6 +30,8 @@ import static com.facebook.presto.iceberg.IcebergFileFormat.PARQUET; import static com.facebook.presto.spi.statistics.ColumnStatisticType.NUMBER_OF_DISTINCT_VALUES; import static com.facebook.presto.spi.statistics.ColumnStatisticType.TOTAL_SIZE_IN_BYTES; +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static io.airlift.units.DataSize.succinctDataSize; import static org.apache.iceberg.CatalogProperties.IO_MANIFEST_CACHE_EXPIRATION_INTERVAL_MS_DEFAULT; import static org.apache.iceberg.CatalogProperties.IO_MANIFEST_CACHE_MAX_CONTENT_LENGTH_DEFAULT; import static org.apache.iceberg.CatalogProperties.IO_MANIFEST_CACHE_MAX_TOTAL_BYTES_DEFAULT; @@ -66,7 +68,8 @@ public void testDefaults() .setSplitManagerThreads(Runtime.getRuntime().availableProcessors()) .setMetadataPreviousVersionsMax(METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT) .setMetadataDeleteAfterCommit(METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT) - .setMetricsMaxInferredColumn(METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT)); + .setMetricsMaxInferredColumn(METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT) + .setMaxStatisticsFileCacheSize(succinctDataSize(256, MEGABYTE))); } @Test @@ -97,6 +100,7 @@ public void testExplicitPropertyMappings() .put("iceberg.metadata-previous-versions-max", "1") .put("iceberg.metadata-delete-after-commit", "true") .put("iceberg.metrics-max-inferred-column", "16") + .put("iceberg.max-statistics-file-cache-size", "512MB") .build(); IcebergConfig expected = new IcebergConfig() @@ -123,7 +127,8 @@ public void testExplicitPropertyMappings() .setSplitManagerThreads(42) .setMetadataPreviousVersionsMax(1) .setMetadataDeleteAfterCommit(true) - .setMetricsMaxInferredColumn(16); + .setMetricsMaxInferredColumn(16) + .setMaxStatisticsFileCacheSize(succinctDataSize(512, MEGABYTE)); assertFullMapping(properties, expected); } diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergDistributedHive.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergDistributedHive.java index 7bd7fb34bacee..3954524b959b2 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergDistributedHive.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergDistributedHive.java @@ -59,6 +59,14 @@ public void testPartShowStatsWithFilters() // Hive doesn't support returning statistics on partitioned tables } + @Override + public void testStatisticsFileCache() + throws Exception + { + // hive doesn't write Iceberg statistics files when metastore is in use, + // so this test won't complete successfully. + } + @Override protected Table loadTable(String tableName) { diff --git a/presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java b/presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java index 1d4b44140de41..553cf84d07bf9 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java @@ -24,6 +24,7 @@ import com.google.common.collect.Range; import com.google.common.collect.RangeSet; import com.google.common.collect.TreeRangeSet; +import org.openjdk.jol.info.ClassLayout; import java.util.Collection; import java.util.NoSuchElementException; @@ -68,6 +69,9 @@ public class DisjointRangeDomainHistogram implements ConnectorHistogram { + private static final long INSTANCE_SIZE = ClassLayout.parseClass(DisjointRangeDomainHistogram.class).instanceSize(); + private static final long RANGE_SIZE = ClassLayout.parseClass(Range.class).instanceSize(); + private final ConnectorHistogram source; // use RangeSet as the internal representation of the ranges, but the constructor arguments // use StatisticRange to support serialization and deserialization. @@ -356,4 +360,13 @@ public int hashCode() { return hash(source, getRanges()); } + + @Override + public long getEstimatedSize() + { + // don't count the source histogram as it's just a reference to + // another histogram. We don't want to count the retained memory. + return INSTANCE_SIZE + + (RANGE_SIZE * ranges.size()); + } } diff --git a/presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java b/presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java index d06232d1fb608..c9cb6bfd19523 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java @@ -18,6 +18,7 @@ import com.facebook.presto.spi.statistics.Estimate; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; +import org.openjdk.jol.info.ClassLayout; import static com.google.common.base.MoreObjects.toStringHelper; import static com.google.common.base.Preconditions.checkArgument; @@ -38,6 +39,7 @@ public class UniformDistributionHistogram implements ConnectorHistogram { + private static final long INSTANCE_SIZE = ClassLayout.parseClass(UniformDistributionHistogram.class).instanceSize(); private final double lowValue; private final double highValue; @@ -111,6 +113,12 @@ public Estimate inverseCumulativeProbability(double percentile) return Estimate.of(lowValue + (percentile * (highValue - lowValue))); } + @Override + public long getEstimatedSize() + { + return INSTANCE_SIZE; + } + @Override public String toString() { diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java b/presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java index 1cbddcc781c58..0c11e729b1fef 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java +++ b/presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java @@ -254,6 +254,12 @@ public Estimate inverseCumulativeProbability(double percentile) } return Estimate.of(distribution.inverseCumulativeProbability(percentile)); } + + @Override + public long getEstimatedSize() + { + return 0; + } } @Override diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java index bec6fb5c31299..97b1ac7fd4c1a 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java @@ -14,15 +14,21 @@ package com.facebook.presto.spi.statistics; import com.fasterxml.jackson.annotation.JsonProperty; +import org.openjdk.jol.info.ClassLayout; import java.util.Objects; import java.util.Optional; +import static com.facebook.presto.spi.statistics.DoubleRange.RANGE_SIZE; +import static com.facebook.presto.spi.statistics.Estimate.ESTIMATE_SIZE; import static java.lang.String.format; import static java.util.Objects.requireNonNull; public final class ColumnStatistics { + private static final long COLUMN_STATISTICS_SIZE = ClassLayout.parseClass(ColumnStatistics.class).instanceSize(); + private static final long OPTION_SIZE = ClassLayout.parseClass(Optional.class).instanceSize(); + private static final ColumnStatistics EMPTY = new ColumnStatistics(Estimate.unknown(), Estimate.unknown(), Estimate.unknown(), Optional.empty(), Optional.empty()); private final Estimate nullsFraction; @@ -142,6 +148,15 @@ public static Builder buildFrom(ColumnStatistics statistics) .setHistogram(statistics.getHistogram()); } + public long getEstimatedSize() + { + return COLUMN_STATISTICS_SIZE + + 3 * ESTIMATE_SIZE + + 2 * OPTION_SIZE + + histogram.map(ConnectorHistogram::getEstimatedSize).orElse(0L) + + range.map(unused -> RANGE_SIZE).orElse(0L); + } + /** * If one of the estimates below is unspecified, the default "unknown" estimate value * (represented by floating point NaN) may cause the resulting symbol statistics diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ConnectorHistogram.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ConnectorHistogram.java index 0febeb7f0d3fa..0b5187eb7d038 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ConnectorHistogram.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ConnectorHistogram.java @@ -61,4 +61,9 @@ public interface ConnectorHistogram * @return the value in the distribution corresponding to the percentile */ Estimate inverseCumulativeProbability(double percentile); + + /** + * @return an approximation of the memory utilization of this histogram. + */ + long getEstimatedSize(); } diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DoubleRange.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DoubleRange.java index eab5015013648..b30c856e1c112 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DoubleRange.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DoubleRange.java @@ -14,6 +14,7 @@ package com.facebook.presto.spi.statistics; import com.fasterxml.jackson.annotation.JsonProperty; +import org.openjdk.jol.info.ClassLayout; import java.util.Objects; @@ -25,6 +26,8 @@ public class DoubleRange { + static final long RANGE_SIZE = ClassLayout.parseClass(DoubleRange.class).instanceSize(); + private final double min; private final double max; diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/Estimate.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/Estimate.java index 02f8713d4da80..946d5f69d5b52 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/Estimate.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/Estimate.java @@ -19,6 +19,7 @@ import com.facebook.drift.annotations.ThriftStruct; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; +import org.openjdk.jol.info.ClassLayout; import java.util.Objects; import java.util.function.Function; @@ -31,6 +32,8 @@ @ThriftStruct public final class Estimate { + static final long ESTIMATE_SIZE = ClassLayout.parseClass(Estimate.class).instanceSize(); + // todo eventually add some notion of statistic reliability // Skipping for now as there hard to compute it properly and so far we do not have // usecase for that. diff --git a/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestColumnStatistics.java b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestColumnStatistics.java new file mode 100644 index 0000000000000..e33d3143b35cb --- /dev/null +++ b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestColumnStatistics.java @@ -0,0 +1,121 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.spi.statistics; + +import org.openjdk.jol.info.ClassLayout; +import org.openjdk.jol.info.GraphLayout; +import org.testng.annotations.Test; + +import java.util.Optional; + +import static io.airlift.slice.SizeOf.sizeOf; +import static org.testng.Assert.assertEquals; + +public class TestColumnStatistics +{ + /** + * Test for all non-histogram stats that memory is accounted accurately + */ + @Test + public void testColumnStatisticsEstimatedSizeAccuracySimple() + { + ColumnStatistics stats = ColumnStatistics.builder() + .setDataSize(Estimate.of(100)) + .setDistinctValuesCount(Estimate.of(1)) + .setRange(new DoubleRange(100, 100)) + .setNullsFraction(Estimate.of(0.1)) + .build(); + + // test without histogram + long actualSize = GraphLayout.parseInstance(stats).totalSize(); + assertEquals(actualSize, stats.getEstimatedSize()); + } + + /** + * Test for when histogram is included but has little-to-no memory usage. + */ + @Test + public void testColumnStatisticsEstimatedSizeAccuracyHistogramEmpty() + { + ColumnStatistics stats = ColumnStatistics.builder() + .setDataSize(Estimate.of(100)) + .setDistinctValuesCount(Estimate.of(1)) + .setRange(new DoubleRange(100, 100)) + .setNullsFraction(Estimate.of(0.1)) + .setHistogram(Optional.of(new ConnectorHistogram() + { + @Override + public Estimate cumulativeProbability(double value, boolean inclusive) + { + return null; + } + + @Override + public Estimate inverseCumulativeProbability(double percentile) + { + return null; + } + + @Override + public long getEstimatedSize() + { + return ClassLayout.parseClass(this.getClass()).instanceSize() * 2L; + } + })) + .build(); + long actualSize = GraphLayout.parseInstance(stats).totalSize(); + assertEquals(actualSize, stats.getEstimatedSize()); + } + + /** + * Test for when histogram is included and has a significant memory footprint + */ + @Test + public void testColumnStatisticsEstimatedSizeAccuracyHistogram() + { + ColumnStatistics stats = ColumnStatistics.builder() + .setDataSize(Estimate.of(100)) + .setDistinctValuesCount(Estimate.of(1)) + .setRange(new DoubleRange(100, 100)) + .setNullsFraction(Estimate.of(0.1)) + .setHistogram(Optional.of(new ConnectorHistogram() + { + final byte[] memory = new byte[4096]; + + @Override + public Estimate cumulativeProbability(double value, boolean inclusive) + { + return null; + } + + @Override + public Estimate inverseCumulativeProbability(double percentile) + { + return null; + } + + @Override + public long getEstimatedSize() + { + return sizeOf(memory) + (ClassLayout.parseClass(this.getClass()).instanceSize() * 2L); + } + })) + .build(); + + // test histogram with fields histogram + long actualSize = GraphLayout.parseInstance(stats).totalSize(); + double error = actualSize * 1E-2; + assertEquals(actualSize, stats.getEstimatedSize(), error); + } +}