diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java index 01e1b161ed4c7..e1b140588f023 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java @@ -103,7 +103,6 @@ import java.util.stream.Collectors; import static com.facebook.presto.expressions.LogicalRowExpressions.TRUE_CONSTANT; -import static com.facebook.presto.hive.MetadataUtils.createPredicate; import static com.facebook.presto.hive.MetadataUtils.getCombinedRemainingPredicate; import static com.facebook.presto.hive.MetadataUtils.getDiscretePredicates; import static com.facebook.presto.hive.MetadataUtils.getPredicate; @@ -216,13 +215,14 @@ public ConnectorTableLayoutResult getTableLayoutForConstraint( IcebergTableHandle handle = (IcebergTableHandle) table; Table icebergTable = getIcebergTable(session, handle.getSchemaTableName()); - TupleDomain partitionColumnPredicate = TupleDomain.withColumnDomains(Maps.filterKeys(constraint.getSummary().getDomains().get(), Predicates.in(getPartitionKeyColumnHandles(icebergTable, typeManager)))); + List partitionColumns = getPartitionKeyColumnHandles(icebergTable, typeManager); + TupleDomain partitionColumnPredicate = TupleDomain.withColumnDomains(Maps.filterKeys(constraint.getSummary().getDomains().get(), Predicates.in(partitionColumns))); Optional> requestedColumns = desiredColumns.map(columns -> columns.stream().map(column -> (IcebergColumnHandle) column).collect(toImmutableSet())); ConnectorTableLayout layout = getTableLayout( session, new IcebergTableLayoutHandle.Builder() - .setPartitionColumns(ImmutableList.copyOf(getPartitionKeyColumnHandles(icebergTable, typeManager))) + .setPartitionColumns(ImmutableList.copyOf(partitionColumns)) .setDataColumns(toHiveColumns(icebergTable.schema().columns())) .setDomainPredicate(constraint.getSummary().transform(IcebergAbstractMetadata::toSubfield)) .setRemainingPredicate(TRUE_CONSTANT) @@ -258,59 +258,52 @@ public ConnectorTableLayout getTableLayout(ConnectorSession session, ConnectorTa Table icebergTable = getIcebergTable(session, tableHandle.getSchemaTableName()); validateTableMode(session, icebergTable); - + List partitionColumns = ImmutableList.copyOf(icebergTableLayoutHandle.getPartitionColumns()); if (!isPushdownFilterEnabled(session)) { - return new ConnectorTableLayout(handle); - } - - if (!icebergTableLayoutHandle.getPartitions().isPresent()) { return new ConnectorTableLayout( icebergTableLayoutHandle, Optional.empty(), - TupleDomain.none(), + icebergTableLayoutHandle.getPartitionColumnPredicate(), Optional.empty(), Optional.empty(), Optional.empty(), ImmutableList.of(), Optional.empty()); } - List partitionColumns = ImmutableList.copyOf(icebergTableLayoutHandle.getPartitionColumns()); - List partitions = icebergTableLayoutHandle.getPartitions().get(); - - Optional discretePredicates = getDiscretePredicates(partitionColumns, partitions); - - TupleDomain predicate; - RowExpression subfieldPredicate; - if (isPushdownFilterEnabled(session)) { - Map predicateColumns = icebergTableLayoutHandle.getPredicateColumns().entrySet() - .stream().collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + Optional> partitions = icebergTableLayoutHandle.getPartitions(); + Optional discretePredicates = partitions.flatMap(parts -> getDiscretePredicates(partitionColumns, parts)); - predicate = getPredicate(icebergTableLayoutHandle, partitionColumns, partitions, predicateColumns); + Map predicateColumns = icebergTableLayoutHandle.getPredicateColumns().entrySet() + .stream().collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + Optional> predicate = partitions.map(parts -> getPredicate(icebergTableLayoutHandle, partitionColumns, parts, predicateColumns)); + // capture subfields from domainPredicate to add to remainingPredicate + // so those filters don't get lost + Map columnTypes = getColumns(icebergTable.schema(), icebergTable.spec(), typeManager).stream() + .collect(toImmutableMap(IcebergColumnHandle::getName, icebergColumnHandle -> getColumnMetadata(session, tableHandle, icebergColumnHandle).getType())); - // capture subfields from domainPredicate to add to remainingPredicate - // so those filters don't get lost - Map columnTypes = getColumns(icebergTable.schema(), icebergTable.spec(), typeManager).stream() - .collect(toImmutableMap(IcebergColumnHandle::getName, icebergColumnHandle -> getColumnMetadata(session, tableHandle, icebergColumnHandle).getType())); - - subfieldPredicate = getSubfieldPredicate(session, icebergTableLayoutHandle, columnTypes, functionResolution, rowExpressionService); - } - else { - predicate = createPredicate(partitionColumns, partitions); - subfieldPredicate = TRUE_CONSTANT; - } + RowExpression subfieldPredicate = getSubfieldPredicate(session, icebergTableLayoutHandle, columnTypes, functionResolution, rowExpressionService); // combine subfieldPredicate with remainingPredicate RowExpression combinedRemainingPredicate = getCombinedRemainingPredicate(icebergTableLayoutHandle, subfieldPredicate); - return new ConnectorTableLayout( - icebergTableLayoutHandle, - Optional.empty(), - predicate, - Optional.empty(), - Optional.empty(), - discretePredicates, - ImmutableList.of(), - Optional.of(combinedRemainingPredicate)); + return predicate.map(pred -> new ConnectorTableLayout( + icebergTableLayoutHandle, + Optional.empty(), + pred, + Optional.empty(), + Optional.empty(), + discretePredicates, + ImmutableList.of(), + Optional.of(combinedRemainingPredicate))) + .orElseGet(() -> new ConnectorTableLayout( + icebergTableLayoutHandle, + Optional.empty(), + TupleDomain.none(), + Optional.empty(), + Optional.empty(), + Optional.empty(), + ImmutableList.of(), + Optional.empty())); } protected Optional getIcebergSystemTable(SchemaTableName tableName, Table table) diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java index 9f0d64a6b2261..557addb1b4511 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java @@ -178,6 +178,10 @@ private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Cons .setRowCount(Estimate.of(0)) .build(); } + // the total record count for the whole table + Optional totalRecordCount = Optional.of(intersection) + .filter(domain -> !domain.isAll()) + .map(domain -> getDataTableSummary(tableHandle, ImmutableList.of(), TupleDomain.all(), idToTypeMapping, nonPartitionPrimitiveColumns, partitionFields).getRecordCount()); double recordCount = summary.getRecordCount(); TableStatistics.Builder result = TableStatistics.builder(); @@ -185,6 +189,10 @@ private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Cons Map tableStats = getClosestStatisticsFileForSnapshot(tableHandle) .map(this::loadStatisticsFile).orElseGet(Collections::emptyMap); + // scale all NDV values loaded from puffin files based on row count + totalRecordCount.ifPresent(fullTableRecordCount -> tableStats.forEach((id, stat) -> + stat.setDistinctValuesCount(stat.getDistinctValuesCount().map(value -> value * recordCount / fullTableRecordCount)))); + for (IcebergColumnHandle columnHandle : selectedColumns) { int fieldId = columnHandle.getId(); ColumnStatistics.Builder columnBuilder = tableStats.getOrDefault(fieldId, ColumnStatistics.builder()); diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java index ba1f412476a32..47a9af0243864 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java @@ -41,6 +41,7 @@ import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatistics; import com.facebook.presto.testing.MaterializedResult; +import com.facebook.presto.testing.MaterializedRow; import com.facebook.presto.testing.QueryRunner; import com.facebook.presto.tests.AbstractTestDistributedQueries; import com.google.common.collect.ImmutableList; @@ -117,7 +118,7 @@ import static org.testng.Assert.assertTrue; @Test(singleThreaded = true) -public class IcebergDistributedTestBase +public abstract class IcebergDistributedTestBase extends AbstractTestDistributedQueries { private final CatalogType catalogType; @@ -1245,6 +1246,63 @@ public void testEqualityDeletesWithHiddenPartitionsEvolution(String fileFormat, assertQuery(session, "SELECT * FROM " + tableName, "VALUES (1, '1001', NULL, NULL), (3, '1003', NULL, NULL), (6, '1004', 1, NULL), (6, '1006', 2, 'th002')"); } + @Test + public void testPartShowStatsWithFilters() + { + assertQuerySucceeds("CREATE TABLE showstatsfilters (i int) WITH (partitioning = ARRAY['i'])"); + assertQuerySucceeds("INSERT INTO showstatsfilters VALUES 1, 2, 3, 4, 5, 6, 7, 7, 7, 7"); + assertQuerySucceeds("ANALYZE showstatsfilters"); + + MaterializedResult statsTable = getQueryRunner().execute("SHOW STATS for showstatsfilters"); + MaterializedRow columnStats = statsTable.getMaterializedRows().get(0); + assertEquals(columnStats.getField(2), 7.0); // ndvs; + assertEquals(columnStats.getField(3), 0.0); // nulls + assertEquals(columnStats.getField(5), "1"); // min + assertEquals(columnStats.getField(6), "7"); // max + + // EQ + statsTable = getQueryRunner().execute("SHOW STATS for (SELECT * FROM showstatsfilters WHERE i = 7)"); + columnStats = statsTable.getMaterializedRows().get(0); + assertEquals(columnStats.getField(5), "7"); // min + assertEquals(columnStats.getField(6), "7"); // max + assertEquals(columnStats.getField(3), 0.0); // nulls + assertEquals((double) columnStats.getField(2), 7.0d * (4.0d / 10.0d), 1E-8); // ndvs; + + // LT + statsTable = getQueryRunner().execute("SHOW STATS for (SELECT * FROM showstatsfilters WHERE i < 7)"); + columnStats = statsTable.getMaterializedRows().get(0); + assertEquals(columnStats.getField(5), "1"); // min + assertEquals(columnStats.getField(6), "6"); // max + assertEquals(columnStats.getField(3), 0.0); // nulls + assertEquals((double) columnStats.getField(2), 7.0d * (6.0d / 10.0d), 1E-8); // ndvs; + + // LTE + statsTable = getQueryRunner().execute("SHOW STATS for (SELECT * FROM showstatsfilters WHERE i <= 7)"); + columnStats = statsTable.getMaterializedRows().get(0); + assertEquals(columnStats.getField(5), "1"); // min + assertEquals(columnStats.getField(6), "7"); // max + assertEquals(columnStats.getField(3), 0.0); // nulls + assertEquals(columnStats.getField(2), 7.0d); // ndvs; + + // GT + statsTable = getQueryRunner().execute("SHOW STATS for (SELECT * FROM showstatsfilters WHERE i > 7)"); + columnStats = statsTable.getMaterializedRows().get(0); + assertEquals(columnStats.getField(5), null); // min + assertEquals(columnStats.getField(6), null); // max + assertEquals(columnStats.getField(3), null); // nulls + assertEquals(columnStats.getField(2), null); // ndvs; + + // GTE + statsTable = getQueryRunner().execute("SHOW STATS for (SELECT * FROM showstatsfilters WHERE i >= 7)"); + columnStats = statsTable.getMaterializedRows().get(0); + assertEquals(columnStats.getField(5), "7"); // min + assertEquals(columnStats.getField(6), "7"); // max + assertEquals(columnStats.getField(3), 0.0); // nulls + assertEquals((double) columnStats.getField(2), 7.0d * (4.0d / 10.0d), 1E-8); // ndvs; + + assertQuerySucceeds("DROP TABLE showstatsfilters"); + } + private void testCheckDeleteFiles(Table icebergTable, int expectedSize, List expectedFileContent) { // check delete file list diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergDistributedHive.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergDistributedHive.java index c76babbd0cecc..039ef121ac4e2 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergDistributedHive.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergDistributedHive.java @@ -52,6 +52,12 @@ public void testStatsByDistance() // ignore because HMS doesn't support statistics versioning } + @Override + public void testPartShowStatsWithFilters() + { + // Hive doesn't support returning statistics on partitioned tables + } + @Override protected Table loadTable(String tableName) { diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java index c50abea3bec75..ec4ee420e46bd 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java @@ -154,18 +154,33 @@ public Builder setNullsFraction(Estimate nullsFraction) return this; } + public Estimate getNullsFraction() + { + return nullsFraction; + } + public Builder setDistinctValuesCount(Estimate distinctValuesCount) { this.distinctValuesCount = requireNonNull(distinctValuesCount, "distinctValuesCount is null"); return this; } + public Estimate getDistinctValuesCount() + { + return distinctValuesCount; + } + public Builder setDataSize(Estimate dataSize) { this.dataSize = requireNonNull(dataSize, "dataSize is null"); return this; } + public Estimate getDataSize() + { + return dataSize; + } + public Builder setRange(DoubleRange range) { this.range = Optional.of(requireNonNull(range, "range is null"));