From 04920e5412b1b4cd64d9485f966fb7a14ffffa4e Mon Sep 17 00:00:00 2001 From: Andrii Rosa Date: Wed, 19 Sep 2018 14:03:07 -0400 Subject: [PATCH 01/14] Remove RangeColumnStatistics --- .../MetastoreHiveStatisticsProvider.java | 16 +- .../presto/hive/AbstractTestHiveClient.java | 17 +- .../presto/cost/TableScanStatsRule.java | 9 +- .../presto/sql/rewrite/ShowStatsRewrite.java | 11 +- .../spi/statistics/ColumnStatistics.java | 115 ++++++++----- .../spi/statistics/RangeColumnStatistics.java | 151 ------------------ .../spi/statistics/TableStatistics.java | 4 + .../TpcdsTableStatisticsFactory.java | 16 +- .../tpcds/TestTpcdsMetadataStatistics.java | 83 ++++------ .../facebook/presto/tpch/TpchMetadata.java | 10 +- .../presto/tpch/TestTpchMetadata.java | 21 +-- 11 files changed, 145 insertions(+), 308 deletions(-) delete mode 100644 presto-spi/src/main/java/com/facebook/presto/spi/statistics/RangeColumnStatistics.java diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java index fe7565634a9c5..8f78bf996e839 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java @@ -29,7 +29,6 @@ import com.facebook.presto.spi.predicate.NullableValue; import com.facebook.presto.spi.statistics.ColumnStatistics; import com.facebook.presto.spi.statistics.Estimate; -import com.facebook.presto.spi.statistics.RangeColumnStatistics; import com.facebook.presto.spi.statistics.TableStatistics; import com.facebook.presto.spi.type.DecimalType; import com.facebook.presto.spi.type.Type; @@ -113,7 +112,6 @@ public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTab for (Map.Entry columnEntry : tableColumns.entrySet()) { String columnName = columnEntry.getKey(); 
HiveColumnHandle hiveColumnHandle = (HiveColumnHandle) columnEntry.getValue(); - RangeColumnStatistics.Builder rangeStatistics = RangeColumnStatistics.builder(); List lowValueCandidates = ImmutableList.of(); List highValueCandidates = ImmutableList.of(); @@ -121,8 +119,9 @@ public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTab Type prestoType = typeManager.getType(hiveColumnHandle.getTypeSignature()); Estimate nullsFraction; Estimate dataSize; + ColumnStatistics.Builder columnStatistics = ColumnStatistics.builder(); if (hiveColumnHandle.isPartitionKey()) { - rangeStatistics.setDistinctValuesCount(countDistinctPartitionKeys(hiveColumnHandle, queriedPartitions)); + columnStatistics.setDistinctValuesCount(countDistinctPartitionKeys(hiveColumnHandle, queriedPartitions)); nullsFraction = calculateNullsFractionForPartitioningKey(hiveColumnHandle, queriedPartitions, statisticsSample, rowCount, rowsPerPartition); if (isLowHighSupportedForType(prestoType)) { lowValueCandidates = queriedPartitions.stream() @@ -136,7 +135,7 @@ public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTab dataSize = calculateDataSizeForPartitioningKey(hiveColumnHandle, queriedPartitions, statisticsSample, rowCount, rowsPerPartition); } else { - rangeStatistics.setDistinctValuesCount(calculateDistinctValuesCount(statisticsSample, columnName)); + columnStatistics.setDistinctValuesCount(calculateDistinctValuesCount(statisticsSample, columnName)); nullsFraction = calculateNullsFraction(statisticsSample, queriedPartitionsCount, columnName, rowCount); if (isLowHighSupportedForType(prestoType)) { @@ -161,20 +160,17 @@ public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTab } dataSize = calculateDataSize(statisticsSample, columnName, rowCount); } - rangeStatistics.setFraction(nullsFraction.map(value -> 1.0 - value)); Comparator comparator = (leftValue, rightValue) -> { Block leftBlock = nativeValueToBlock(prestoType, leftValue); 
Block rightBlock = nativeValueToBlock(prestoType, rightValue); return prestoType.compareTo(leftBlock, 0, rightBlock, 0); }; - rangeStatistics.setLowValue(lowValueCandidates.stream().min(comparator)); - rangeStatistics.setHighValue(highValueCandidates.stream().max(comparator)); - rangeStatistics.setDataSize(dataSize); + columnStatistics.setLowValue(lowValueCandidates.stream().min(comparator)); + columnStatistics.setHighValue(highValueCandidates.stream().max(comparator)); + columnStatistics.setDataSize(dataSize); - ColumnStatistics.Builder columnStatistics = ColumnStatistics.builder(); columnStatistics.setNullsFraction(nullsFraction); - columnStatistics.addRange(rangeStatistics.build()); tableStatistics.setColumnStatistics(hiveColumnHandle, columnStatistics.build()); } return tableStatistics.build(); diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/AbstractTestHiveClient.java b/presto-hive/src/test/java/com/facebook/presto/hive/AbstractTestHiveClient.java index e81c906e54e51..7ed5bd34b1458 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/AbstractTestHiveClient.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/AbstractTestHiveClient.java @@ -76,7 +76,6 @@ import com.facebook.presto.spi.predicate.TupleDomain; import com.facebook.presto.spi.predicate.ValueSet; import com.facebook.presto.spi.statistics.ColumnStatistics; -import com.facebook.presto.spi.statistics.RangeColumnStatistics; import com.facebook.presto.spi.statistics.TableStatistics; import com.facebook.presto.spi.type.ArrayType; import com.facebook.presto.spi.type.MapType; @@ -1332,23 +1331,19 @@ private void assertTableStatsComputed( columnStatistics.getNullsFraction().isValueUnknown(), "unknown nulls fraction for " + columnName); - RangeColumnStatistics rangeColumnStatistics = columnStatistics.getOnlyRangeColumnStatistics(); assertFalse( - rangeColumnStatistics.getDistinctValuesCount().isValueUnknown(), - "unknown range distinct values count for " + columnName); - 
assertFalse( - rangeColumnStatistics.getFraction().isValueUnknown(), - "unknown range non-null fraction for " + columnName); + columnStatistics.getDistinctValuesCount().isValueUnknown(), + "unknown distinct values count for " + columnName); if (isVarcharType(columnType)) { assertFalse( - rangeColumnStatistics.getDataSize().isValueUnknown(), - "unknown range data size for " + columnName); + columnStatistics.getDataSize().isValueUnknown(), + "unknown data size for " + columnName); } else { assertTrue( - rangeColumnStatistics.getDataSize().isValueUnknown(), - "known range data size for" + columnName); + columnStatistics.getDataSize().isValueUnknown(), + "known data size for " + columnName); } }); } diff --git a/presto-main/src/main/java/com/facebook/presto/cost/TableScanStatsRule.java b/presto-main/src/main/java/com/facebook/presto/cost/TableScanStatsRule.java index 7460261b0a0cf..1fdf3fc4d923a 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/TableScanStatsRule.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/TableScanStatsRule.java @@ -83,12 +83,13 @@ private SymbolStatsEstimate toSymbolStatistics(TableStatistics tableStatistics, { double nullsFraction = columnStatistics.getNullsFraction().getValue(); double nonNullRowsCount = tableStatistics.getRowCount().getValue() * (1.0 - nullsFraction); + double averageRowSize = nonNullRowsCount == 0 ? 
0 : columnStatistics.getDataSize().getValue() / nonNullRowsCount; return SymbolStatsEstimate.builder() - .setLowValue(asDouble(session, type, columnStatistics.getOnlyRangeColumnStatistics().getLowValue()).orElse(NEGATIVE_INFINITY)) - .setHighValue(asDouble(session, type, columnStatistics.getOnlyRangeColumnStatistics().getHighValue()).orElse(POSITIVE_INFINITY)) + .setLowValue(asDouble(session, type, columnStatistics.getLowValue()).orElse(NEGATIVE_INFINITY)) + .setHighValue(asDouble(session, type, columnStatistics.getHighValue()).orElse(POSITIVE_INFINITY)) .setNullsFraction(nullsFraction) - .setDistinctValuesCount(columnStatistics.getOnlyRangeColumnStatistics().getDistinctValuesCount().getValue()) - .setAverageRowSize(columnStatistics.getOnlyRangeColumnStatistics().getDataSize().getValue() / nonNullRowsCount) + .setDistinctValuesCount(columnStatistics.getDistinctValuesCount().getValue()) + .setAverageRowSize(averageRowSize) .build(); } diff --git a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java index 2a7480507d131..20d83564996c5 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java @@ -24,7 +24,6 @@ import com.facebook.presto.spi.Constraint; import com.facebook.presto.spi.statistics.ColumnStatistics; import com.facebook.presto.spi.statistics.Estimate; -import com.facebook.presto.spi.statistics.RangeColumnStatistics; import com.facebook.presto.spi.statistics.TableStatistics; import com.facebook.presto.spi.type.Type; import com.facebook.presto.spi.type.VarcharType; @@ -306,16 +305,14 @@ private List buildStatisticsRows(TableStatistics tableStatistics, Ma private Row createColumnStatsRow(String columnName, Type type, ColumnStatistics columnStatistics) { - RangeColumnStatistics onlyRangeColumnStatistics = 
columnStatistics.getOnlyRangeColumnStatistics(); - ImmutableList.Builder rowValues = ImmutableList.builder(); rowValues.add(new StringLiteral(columnName)); - rowValues.add(createStatisticValueOrNull(onlyRangeColumnStatistics.getDataSize())); - rowValues.add(createStatisticValueOrNull(onlyRangeColumnStatistics.getDistinctValuesCount())); + rowValues.add(createStatisticValueOrNull(columnStatistics.getDataSize())); + rowValues.add(createStatisticValueOrNull(columnStatistics.getDistinctValuesCount())); rowValues.add(createStatisticValueOrNull(columnStatistics.getNullsFraction())); rowValues.add(NULL_DOUBLE); - rowValues.add(lowHighAsLiteral(type, onlyRangeColumnStatistics.getLowValue())); - rowValues.add(lowHighAsLiteral(type, onlyRangeColumnStatistics.getHighValue())); + rowValues.add(lowHighAsLiteral(type, columnStatistics.getLowValue())); + rowValues.add(lowHighAsLiteral(type, columnStatistics.getHighValue())); return new Row(rowValues.build()); } diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java index b7249a65a072b..59ab5989a8713 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java @@ -13,42 +13,43 @@ */ package com.facebook.presto.spi.statistics; -import java.util.ArrayList; -import java.util.List; import java.util.Objects; -import java.util.function.Consumer; +import java.util.Optional; -import static com.facebook.presto.spi.statistics.Estimate.unknownValue; -import static java.util.Collections.singletonList; -import static java.util.Collections.unmodifiableList; +import static java.lang.String.format; import static java.util.Objects.requireNonNull; public final class ColumnStatistics { - private static final List SINGLE_UNKNOWN_RANGE_STATISTICS = singletonList(RangeColumnStatistics.builder().build()); - 
private final Estimate nullsFraction; - private final List rangeColumnStatistics; - - private ColumnStatistics(Estimate nullsFraction, List rangeColumnStatistics) + private final Estimate distinctValuesCount; + private final Estimate dataSize; + private final Optional lowValue; + private final Optional highValue; + + public ColumnStatistics( + Estimate nullsFraction, + Estimate distinctValuesCount, + Estimate dataSize, + Optional lowValue, + Optional highValue) { - this.nullsFraction = requireNonNull(nullsFraction, "nullsFraction can not be null"); - requireNonNull(rangeColumnStatistics, "rangeColumnStatistics can not be null"); - if (!rangeColumnStatistics.stream().allMatch(Objects::nonNull)) { - throw new NullPointerException("elements of rangeColumnStatistics can not be null"); - } - if (rangeColumnStatistics.size() > 1) { - // todo add support for multiple ranges. - throw new IllegalArgumentException("Statistics for multiple ranges are not supported"); + this.nullsFraction = requireNonNull(nullsFraction, "nullsFraction is null"); + if (!nullsFraction.isValueUnknown()) { + if (nullsFraction.getValue() < 0 || nullsFraction.getValue() > 1) { + throw new IllegalArgumentException(format("nullsFraction must be between 0 and 1: %s", nullsFraction.getValue())); + } } - if (rangeColumnStatistics.isEmpty()) { - rangeColumnStatistics = SINGLE_UNKNOWN_RANGE_STATISTICS; + this.distinctValuesCount = requireNonNull(distinctValuesCount, "distinctValuesCount is null"); + if (!distinctValuesCount.isValueUnknown() && distinctValuesCount.getValue() < 0) { + throw new IllegalArgumentException(format("distinctValuesCount must be greater than or equal to 0: %s", distinctValuesCount.getValue())); } - if (nullsFraction.isValueUnknown() != rangeColumnStatistics.get(0).getFraction().isValueUnknown()) { - throw new IllegalArgumentException("All or none fraction/nullsFraction must be set"); + this.dataSize = requireNonNull(dataSize, "dataSize is null"); + if (!dataSize.isValueUnknown() && 
dataSize.getValue() < 0) { + throw new IllegalArgumentException(format("dataSize must be greater than or equal to 0: %s", dataSize.getValue())); } - - this.rangeColumnStatistics = unmodifiableList(new ArrayList<>(rangeColumnStatistics)); + this.lowValue = requireNonNull(lowValue, "lowValue is null"); + this.highValue = requireNonNull(highValue, "highValue is null"); } public Estimate getNullsFraction() @@ -56,9 +57,24 @@ public Estimate getNullsFraction() return nullsFraction; } - public RangeColumnStatistics getOnlyRangeColumnStatistics() + public Estimate getDistinctValuesCount() { - return rangeColumnStatistics.get(0); + return distinctValuesCount; + } + + public Estimate getDataSize() + { + return dataSize; + } + + public Optional getLowValue() + { + return lowValue; + } + + public Optional getHighValue() + { + return highValue; } @Override @@ -72,13 +88,16 @@ public boolean equals(Object o) } ColumnStatistics that = (ColumnStatistics) o; return Objects.equals(nullsFraction, that.nullsFraction) && - Objects.equals(rangeColumnStatistics, that.rangeColumnStatistics); + Objects.equals(distinctValuesCount, that.distinctValuesCount) && + Objects.equals(dataSize, that.dataSize) && + Objects.equals(lowValue, that.lowValue) && + Objects.equals(highValue, that.highValue); } @Override public int hashCode() { - return Objects.hash(nullsFraction, rangeColumnStatistics); + return Objects.hash(nullsFraction, distinctValuesCount, dataSize, lowValue, highValue); } @Override @@ -86,7 +105,10 @@ public String toString() { return "ColumnStatistics{" + "nullsFraction=" + nullsFraction + - ", rangeColumnStatistics=" + rangeColumnStatistics + + ", distinctValuesCount=" + distinctValuesCount + + ", dataSize=" + dataSize + + ", lowValue=" + lowValue + + ", highValue=" + highValue + '}'; } @@ -97,32 +119,45 @@ public static Builder builder() public static final class Builder { - private Estimate nullsFraction = unknownValue(); - private List rangeColumnStatistics = new ArrayList<>(); + 
private Estimate nullsFraction = Estimate.unknownValue(); + private Estimate distinctValuesCount = Estimate.unknownValue(); + private Estimate dataSize = Estimate.unknownValue(); + private Optional lowValue = Optional.empty(); + private Optional highValue = Optional.empty(); public Builder setNullsFraction(Estimate nullsFraction) { - this.nullsFraction = nullsFraction; + this.nullsFraction = requireNonNull(nullsFraction, "nullsFraction is null"); + return this; + } + + public Builder setDistinctValuesCount(Estimate distinctValuesCount) + { + this.distinctValuesCount = requireNonNull(distinctValuesCount, "distinctValuesCount is null"); + return this; + } + + public Builder setDataSize(Estimate dataSize) + { + this.dataSize = requireNonNull(dataSize, "dataSize is null"); return this; } - public Builder addRange(Consumer rangeBuilderConsumer) + public Builder setLowValue(Optional lowValue) { - RangeColumnStatistics.Builder rangeBuilder = RangeColumnStatistics.builder(); - rangeBuilderConsumer.accept(rangeBuilder); - addRange(rangeBuilder.build()); + this.lowValue = requireNonNull(lowValue, "lowValue is null"); return this; } - public Builder addRange(RangeColumnStatistics rangeColumnStatistics) + public Builder setHighValue(Optional highValue) { - this.rangeColumnStatistics.add(rangeColumnStatistics); + this.highValue = requireNonNull(highValue, "highValue is null"); return this; } public ColumnStatistics build() { - return new ColumnStatistics(nullsFraction, rangeColumnStatistics); + return new ColumnStatistics(nullsFraction, distinctValuesCount, dataSize, lowValue, highValue); } } } diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/RangeColumnStatistics.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/RangeColumnStatistics.java deleted file mode 100644 index 91749ce0bce1e..0000000000000 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/RangeColumnStatistics.java +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Licensed 
under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.facebook.presto.spi.statistics; - -import java.util.Objects; -import java.util.Optional; - -import static com.facebook.presto.spi.statistics.Estimate.unknownValue; -import static java.util.Objects.requireNonNull; - -public final class RangeColumnStatistics -{ - private final Optional lowValue; - private final Optional highValue; - private final Estimate fraction; - private final Estimate dataSize; - private final Estimate distinctValuesCount; - - private RangeColumnStatistics( - Optional lowValue, - Optional highValue, - Estimate fraction, - Estimate dataSize, - Estimate distinctValuesCount) - { - this.lowValue = requireNonNull(lowValue, "lowValue can not be null"); - this.highValue = requireNonNull(highValue, "highValue can not be null"); - this.fraction = requireNonNull(fraction, "fraction can not be null"); - this.dataSize = requireNonNull(dataSize, "dataSize can not be null"); - this.distinctValuesCount = requireNonNull(distinctValuesCount, "distinctValuesCount can not be null"); - } - - public Optional getLowValue() - { - return lowValue; - } - - public Optional getHighValue() - { - return highValue; - } - - public Estimate getDataSize() - { - return dataSize; - } - - public Estimate getFraction() - { - return fraction; - } - - public Estimate getDistinctValuesCount() - { - return distinctValuesCount; - } - - @Override - public boolean equals(Object o) - { - if (this == o) { - return true; - } - if (o == 
null || getClass() != o.getClass()) { - return false; - } - RangeColumnStatistics that = (RangeColumnStatistics) o; - return Objects.equals(lowValue, that.lowValue) && - Objects.equals(highValue, that.highValue) && - Objects.equals(fraction, that.fraction) && - Objects.equals(dataSize, that.dataSize) && - Objects.equals(distinctValuesCount, that.distinctValuesCount); - } - - @Override - public int hashCode() - { - return Objects.hash(lowValue, highValue, fraction, dataSize, distinctValuesCount); - } - - @Override - public String toString() - { - return "RangeColumnStatistics{" + "lowValue=" + lowValue + - ", highValue=" + highValue + - ", fraction=" + fraction + - ", dataSize=" + dataSize + - ", distinctValuesCount=" + distinctValuesCount + - '}'; - } - - public static Builder builder() - { - return new Builder(); - } - - public static final class Builder - { - private Optional lowValue = Optional.empty(); - private Optional highValue = Optional.empty(); - private Estimate dataSize = unknownValue(); - private Estimate fraction = unknownValue(); - private Estimate distinctValuesCount = unknownValue(); - - public Builder setLowValue(Optional lowValue) - { - this.lowValue = lowValue; - return this; - } - - public Builder setHighValue(Optional highValue) - { - this.highValue = highValue; - return this; - } - - public Builder setFraction(Estimate fraction) - { - this.fraction = fraction; - return this; - } - - public Builder setDataSize(Estimate dataSize) - { - this.dataSize = dataSize; - return this; - } - - public Builder setDistinctValuesCount(Estimate distinctValuesCount) - { - this.distinctValuesCount = distinctValuesCount; - return this; - } - - public RangeColumnStatistics build() - { - return new RangeColumnStatistics(lowValue, highValue, fraction, dataSize, distinctValuesCount); - } - } -} diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java 
b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java index 9f87eb3c8c20c..a56dc8a17d135 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java @@ -21,6 +21,7 @@ import java.util.Objects; import static com.facebook.presto.spi.statistics.Estimate.unknownValue; +import static java.lang.String.format; import static java.util.Collections.unmodifiableMap; import static java.util.Objects.requireNonNull; @@ -34,6 +35,9 @@ public final class TableStatistics public TableStatistics(Estimate rowCount, Map columnStatistics) { this.rowCount = requireNonNull(rowCount, "rowCount can not be null"); + if (!rowCount.isValueUnknown() && rowCount.getValue() < 0) { + throw new IllegalArgumentException(format("rowCount must be greater than or equal to 0: %s", rowCount.getValue())); + } this.columnStatistics = unmodifiableMap(requireNonNull(columnStatistics, "columnStatistics can not be null")); } diff --git a/presto-tpcds/src/main/java/com/facebook/presto/tpcds/statistics/TpcdsTableStatisticsFactory.java b/presto-tpcds/src/main/java/com/facebook/presto/tpcds/statistics/TpcdsTableStatisticsFactory.java index f22c1a9c49658..a800ab8ae23cc 100644 --- a/presto-tpcds/src/main/java/com/facebook/presto/tpcds/statistics/TpcdsTableStatisticsFactory.java +++ b/presto-tpcds/src/main/java/com/facebook/presto/tpcds/statistics/TpcdsTableStatisticsFactory.java @@ -71,18 +71,10 @@ private ColumnStatistics toColumnStatistics(ColumnStatisticsData columnStatistic ColumnStatistics.Builder columnStatistics = ColumnStatistics.builder(); long nullCount = columnStatisticsData.getNullsCount(); columnStatistics.setNullsFraction(new Estimate((double) nullCount / rowCount)); - columnStatistics.addRange(builder -> builder - .setLowValue( - columnStatisticsData.getMin() - .map(value -> toPrestoValue(value, type))) - .setHighValue( - columnStatisticsData.getMax() - 
.map(value -> toPrestoValue(value, type))) - .setDistinctValuesCount(new Estimate(columnStatisticsData.getDistinctValuesCount())) - .setDataSize(columnStatisticsData.getDataSize().map(Estimate::new).orElse(Estimate.unknownValue())) - .setFraction(new Estimate(((double) rowCount - nullCount) / rowCount)) - .build()); - + columnStatistics.setLowValue(columnStatisticsData.getMin().map(value -> toPrestoValue(value, type))); + columnStatistics.setHighValue(columnStatisticsData.getMax().map(value -> toPrestoValue(value, type))); + columnStatistics.setDistinctValuesCount(new Estimate(columnStatisticsData.getDistinctValuesCount())); + columnStatistics.setDataSize(columnStatisticsData.getDataSize().map(Estimate::new).orElse(Estimate.unknownValue())); return columnStatistics.build(); } diff --git a/presto-tpcds/src/test/java/com/facebook/presto/tpcds/TestTpcdsMetadataStatistics.java b/presto-tpcds/src/test/java/com/facebook/presto/tpcds/TestTpcdsMetadataStatistics.java index 38c50c7c234eb..e3e2f06f6d620 100644 --- a/presto-tpcds/src/test/java/com/facebook/presto/tpcds/TestTpcdsMetadataStatistics.java +++ b/presto-tpcds/src/test/java/com/facebook/presto/tpcds/TestTpcdsMetadataStatistics.java @@ -20,7 +20,6 @@ import com.facebook.presto.spi.SchemaTableName; import com.facebook.presto.spi.statistics.ColumnStatistics; import com.facebook.presto.spi.statistics.Estimate; -import com.facebook.presto.spi.statistics.RangeColumnStatistics; import com.facebook.presto.spi.statistics.TableStatistics; import com.google.common.primitives.Primitives; import com.teradata.tpcds.Table; @@ -74,11 +73,11 @@ public void testTableStatsExistenceSupportedSchema() assertNotNull(tableStatistics.getColumnStatistics().get(column)); TpcdsColumnHandle tpcdsColumn = (TpcdsColumnHandle) column; - Optional low = tableStatistics.getColumnStatistics().get(column).getOnlyRangeColumnStatistics().getLowValue(); + Optional low = tableStatistics.getColumnStatistics().get(column).getLowValue(); if (low.isPresent()) 
{ assertEquals(low.get().getClass(), Primitives.wrap(tpcdsColumn.getType().getJavaType())); } - Optional high = tableStatistics.getColumnStatistics().get(column).getOnlyRangeColumnStatistics().getLowValue(); + Optional high = tableStatistics.getColumnStatistics().get(column).getHighValue(); if (high.isPresent()) { assertEquals(high.get().getClass(), Primitives.wrap(tpcdsColumn.getType().getJavaType())); } @@ -107,12 +106,9 @@ public void testTableStatsDetails() tableStatistics.getColumnStatistics().get(columnHandles.get(CallCenterColumn.CC_CALL_CENTER_SK.getName())), ColumnStatistics.builder() .setNullsFraction(new Estimate(0)) - .addRange(range -> range - .setFraction(new Estimate(1.0)) - .setDistinctValuesCount(new Estimate(6)) - .setLowValue(Optional.of(1L)) - .setHighValue(Optional.of(6L)) - .build()) + .setDistinctValuesCount(new Estimate(6)) + .setLowValue(Optional.of(1L)) + .setHighValue(Optional.of(6L)) .build()); // varchar @@ -120,13 +116,10 @@ public void testTableStatsDetails() tableStatistics.getColumnStatistics().get(columnHandles.get(CallCenterColumn.CC_CALL_CENTER_ID.getName())), ColumnStatistics.builder() .setNullsFraction(new Estimate(0)) - .addRange(range -> range - .setFraction(new Estimate(1.0)) - .setDistinctValuesCount(new Estimate(3)) - .setLowValue(Optional.of(Slices.utf8Slice("AAAAAAAABAAAAAAA"))) - .setHighValue(Optional.of(Slices.utf8Slice("AAAAAAAAEAAAAAAA"))) - .setDataSize(new Estimate(48.0)) - .build()) + .setDistinctValuesCount(new Estimate(3)) + .setLowValue(Optional.of(Slices.utf8Slice("AAAAAAAABAAAAAAA"))) + .setHighValue(Optional.of(Slices.utf8Slice("AAAAAAAAEAAAAAAA"))) + .setDataSize(new Estimate(48.0)) .build()); // char @@ -134,13 +127,10 @@ public void testTableStatsDetails() tableStatistics.getColumnStatistics().get(columnHandles.get(CallCenterColumn.CC_ZIP.getName())), ColumnStatistics.builder() .setNullsFraction(new Estimate(0)) - .addRange(range -> range - .setFraction(new Estimate(1.0)) - .setDistinctValuesCount(new 
Estimate(1)) - .setLowValue(Optional.of(Slices.utf8Slice("31904"))) - .setHighValue(Optional.of(Slices.utf8Slice("31904"))) - .setDataSize(new Estimate(5.0)) - .build()) + .setDistinctValuesCount(new Estimate(1)) + .setLowValue(Optional.of(Slices.utf8Slice("31904"))) + .setHighValue(Optional.of(Slices.utf8Slice("31904"))) + .setDataSize(new Estimate(5.0)) .build()); // decimal @@ -148,12 +138,9 @@ public void testTableStatsDetails() tableStatistics.getColumnStatistics().get(columnHandles.get(CallCenterColumn.CC_GMT_OFFSET.getName())), ColumnStatistics.builder() .setNullsFraction(new Estimate(0)) - .addRange(range -> range - .setFraction(new Estimate(1.0)) - .setDistinctValuesCount(new Estimate(1)) - .setLowValue(Optional.of(-500L)) - .setHighValue(Optional.of(-500L)) - .build()) + .setDistinctValuesCount(new Estimate(1)) + .setLowValue(Optional.of(-500L)) + .setHighValue(Optional.of(-500L)) .build()); // date @@ -161,12 +148,9 @@ public void testTableStatsDetails() tableStatistics.getColumnStatistics().get(columnHandles.get(CallCenterColumn.CC_REC_START_DATE.getName())), ColumnStatistics.builder() .setNullsFraction(new Estimate(0)) - .addRange(range -> range - .setFraction(new Estimate(1)) - .setDistinctValuesCount(new Estimate(4)) - .setLowValue(Optional.of(10227L)) - .setHighValue(Optional.of(11688L)) - .build()) + .setDistinctValuesCount(new Estimate(4)) + .setLowValue(Optional.of(10227L)) + .setHighValue(Optional.of(11688L)) .build()); // only null values @@ -174,12 +158,9 @@ public void testTableStatsDetails() tableStatistics.getColumnStatistics().get(columnHandles.get(CallCenterColumn.CC_CLOSED_DATE_SK.getName())), ColumnStatistics.builder() .setNullsFraction(new Estimate(1)) - .addRange(range -> range - .setFraction(new Estimate(0)) - .setDistinctValuesCount(new Estimate(0)) - .setLowValue(Optional.empty()) - .setHighValue(Optional.empty()) - .build()) + .setDistinctValuesCount(new Estimate(0)) + .setLowValue(Optional.empty()) + 
.setHighValue(Optional.empty()) .build()); } @@ -197,24 +178,18 @@ public void testNullFraction() tableStatistics.getColumnStatistics().get(columnHandles.get(WebSiteColumn.WEB_REC_END_DATE.getName())), ColumnStatistics.builder() .setNullsFraction(new Estimate(0.5)) - .addRange(range -> range - .setFraction(new Estimate(0.5)) - .setDistinctValuesCount(new Estimate(3)) - .setLowValue(Optional.of(10819L)) - .setHighValue(Optional.of(11549L)) - .build()) + .setDistinctValuesCount(new Estimate(3)) + .setLowValue(Optional.of(10819L)) + .setHighValue(Optional.of(11549L)) .build()); } private void assertColumnStatistics(ColumnStatistics actual, ColumnStatistics expected) { estimateAssertion.assertClose(actual.getNullsFraction(), expected.getNullsFraction(), "Nulls fraction"); - RangeColumnStatistics actualRange = actual.getOnlyRangeColumnStatistics(); - RangeColumnStatistics expectedRange = expected.getOnlyRangeColumnStatistics(); - estimateAssertion.assertClose(actualRange.getFraction(), expectedRange.getFraction(), "Fraction"); - estimateAssertion.assertClose(actualRange.getDataSize(), expectedRange.getDataSize(), "Data size"); - estimateAssertion.assertClose(actualRange.getDistinctValuesCount(), expectedRange.getDistinctValuesCount(), "Distinct values count"); - assertEquals(actualRange.getLowValue(), expectedRange.getLowValue()); - assertEquals(actualRange.getHighValue(), expectedRange.getHighValue()); + estimateAssertion.assertClose(actual.getDataSize(), expected.getDataSize(), "Data size"); + estimateAssertion.assertClose(actual.getDistinctValuesCount(), expected.getDistinctValuesCount(), "Distinct values count"); + assertEquals(actual.getLowValue(), expected.getLowValue()); + assertEquals(actual.getHighValue(), expected.getHighValue()); } } diff --git a/presto-tpch/src/main/java/com/facebook/presto/tpch/TpchMetadata.java b/presto-tpch/src/main/java/com/facebook/presto/tpch/TpchMetadata.java index 37077cf4f142e..ed70093825245 100644 --- 
a/presto-tpch/src/main/java/com/facebook/presto/tpch/TpchMetadata.java +++ b/presto-tpch/src/main/java/com/facebook/presto/tpch/TpchMetadata.java @@ -379,13 +379,11 @@ private ColumnHandle getColumnHandle(TpchTableHandle tpchTableHandle, Map rangeBuilder - .setDistinctValuesCount(stats.getDistinctValuesCount().map(Estimate::new).orElse(Estimate.unknownValue())) - .setDataSize(stats.getDataSize().map(Estimate::new).orElse(Estimate.unknownValue())) - .setLowValue(stats.getMin().map(value -> toPrestoValue(value, columnType))) - .setHighValue(stats.getMax().map(value -> toPrestoValue(value, columnType))) - .setFraction(new Estimate((1)))) .setNullsFraction(Estimate.zeroValue()) + .setDistinctValuesCount(stats.getDistinctValuesCount().map(Estimate::new).orElse(Estimate.unknownValue())) + .setDataSize(stats.getDataSize().map(Estimate::new).orElse(Estimate.unknownValue())) + .setLowValue(stats.getMin().map(value -> toPrestoValue(value, columnType))) + .setHighValue(stats.getMax().map(value -> toPrestoValue(value, columnType))) .build(); } diff --git a/presto-tpch/src/test/java/com/facebook/presto/tpch/TestTpchMetadata.java b/presto-tpch/src/test/java/com/facebook/presto/tpch/TestTpchMetadata.java index e7feaf2aa92da..e2b186fee3fe7 100644 --- a/presto-tpch/src/test/java/com/facebook/presto/tpch/TestTpchMetadata.java +++ b/presto-tpch/src/test/java/com/facebook/presto/tpch/TestTpchMetadata.java @@ -23,7 +23,6 @@ import com.facebook.presto.spi.predicate.TupleDomain; import com.facebook.presto.spi.statistics.ColumnStatistics; import com.facebook.presto.spi.statistics.Estimate; -import com.facebook.presto.spi.statistics.RangeColumnStatistics; import com.facebook.presto.spi.statistics.TableStatistics; import com.facebook.presto.tpch.util.PredicateUtils; import com.google.common.base.Preconditions; @@ -245,14 +244,12 @@ private void testColumnStats(String schema, TpchTable table, TpchColumn co ColumnStatistics actual = tableStatistics.getColumnStatistics().get(columnHandle); 
EstimateAssertion estimateAssertion = new EstimateAssertion(TOLERANCE); - RangeColumnStatistics actualRange = actual.getOnlyRangeColumnStatistics(); - RangeColumnStatistics expectedRange = expected.getOnlyRangeColumnStatistics(); - estimateAssertion.assertClose(actualRange.getDistinctValuesCount(), expectedRange.getDistinctValuesCount(), "distinctValuesCount"); - estimateAssertion.assertClose(actualRange.getDataSize(), expectedRange.getDataSize(), "dataSize"); + estimateAssertion.assertClose(actual.getDistinctValuesCount(), expected.getDistinctValuesCount(), "distinctValuesCount"); + estimateAssertion.assertClose(actual.getDataSize(), expected.getDataSize(), "dataSize"); estimateAssertion.assertClose(actual.getNullsFraction(), expected.getNullsFraction(), "nullsFraction"); - estimateAssertion.assertClose(actualRange.getLowValue(), expectedRange.getLowValue(), "lowValue"); - estimateAssertion.assertClose(actualRange.getHighValue(), expectedRange.getHighValue(), "highValue"); + estimateAssertion.assertClose(actual.getLowValue(), expected.getLowValue(), "lowValue"); + estimateAssertion.assertClose(actual.getHighValue(), expected.getHighValue(), "highValue"); } @Test @@ -410,13 +407,11 @@ private ColumnStatistics rangeStatistics(double min, double max) private static ColumnStatistics createColumnStatistics(Optional distinctValuesCount, Optional min, Optional max, Optional dataSize) { return ColumnStatistics.builder() - .addRange(rb -> rb - .setDistinctValuesCount(toEstimate(distinctValuesCount)) - .setLowValue(min) - .setHighValue(max) - .setDataSize(toEstimate(dataSize)) - .setFraction(new Estimate(1.0))) .setNullsFraction(zeroValue()) + .setDistinctValuesCount(toEstimate(distinctValuesCount)) + .setLowValue(min) + .setHighValue(max) + .setDataSize(toEstimate(dataSize)) .build(); } From 351a61fae2a8f0b63d570ecbdd0579e40ec4a2fa Mon Sep 17 00:00:00 2001 From: Andrii Rosa Date: Wed, 19 Sep 2018 14:03:09 -0400 Subject: [PATCH 02/14] Refactor Estimate - Rename factory 
methods: unknownValue -> unknown, zeroValue -> zero - Add factory method create(value). This factory method checks if value is finite - Validate estimates in ColumnStatistics --- .../MetastoreHiveStatisticsProvider.java | 52 +++++++++---------- .../presto/hive/AbstractTestHiveClient.java | 10 ++-- .../presto/sql/rewrite/ShowStatsRewrite.java | 2 +- .../spi/statistics/ColumnStatistics.java | 12 ++--- .../presto/spi/statistics/Estimate.java | 34 ++++++------ .../spi/statistics/TableStatistics.java | 5 +- .../TpcdsTableStatisticsFactory.java | 8 +-- .../presto/tpcds/EstimateAssertion.java | 2 +- .../tpcds/TestTpcdsMetadataStatistics.java | 38 +++++++------- .../facebook/presto/tpch/TpchMetadata.java | 8 +-- .../presto/tpch/EstimateAssertion.java | 2 +- .../presto/tpch/TestTpchMetadata.java | 11 ++-- 12 files changed, 92 insertions(+), 92 deletions(-) diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java index 8f78bf996e839..e46cba25f0972 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java @@ -207,9 +207,9 @@ private OptionalDouble calculateRowsPerPartition(Map statisticsSample, String column) @@ -235,11 +235,11 @@ private Estimate calculateDistinctValuesCount(Map s private Estimate calculateNullsFraction(Map statisticsSample, int totalPartitionsCount, String column, Estimate rowCount) { - if (rowCount.isValueUnknown()) { - return Estimate.unknownValue(); + if (rowCount.isUnknown()) { + return Estimate.unknown(); } if (rowCount.getValue() == 0.0) { - return Estimate.zeroValue(); + return Estimate.zero(); } Estimate totalNullsCount = summarizePartitionStatistics( @@ -265,16 +265,16 @@ private Estimate calculateNullsFraction(Map statist return 
OptionalDouble.of(totalPartitionsCount / partitionsWithStatisticsCount * nullsCount); }); - if (totalNullsCount.isValueUnknown()) { - return Estimate.unknownValue(); + if (totalNullsCount.isUnknown()) { + return Estimate.unknown(); } - return new Estimate(totalNullsCount.getValue() / rowCount.getValue()); + return Estimate.of(totalNullsCount.getValue() / rowCount.getValue()); } private Estimate calculateDataSize(Map statisticsSample, String columnName, Estimate rowCount) { - if (rowCount.isValueUnknown()) { - return Estimate.unknownValue(); + if (rowCount.isUnknown()) { + return Estimate.unknown(); } int knownPartitionCount = 0; @@ -299,20 +299,20 @@ private Estimate calculateDataSize(Map statisticsSa } if (knownPartitionCount == 0) { - return Estimate.unknownValue(); + return Estimate.unknown(); } if (knownDataSize == 0) { - return Estimate.zeroValue(); + return Estimate.zero(); } verify(knownRowCount > 0); - return new Estimate(knownDataSize / knownRowCount * rowCount.getValue()); + return Estimate.of(knownDataSize / knownRowCount * rowCount.getValue()); } private Estimate countDistinctPartitionKeys(HiveColumnHandle partitionColumn, List partitions) { - return new Estimate(partitions.stream() + return Estimate.of(partitions.stream() .map(HivePartition::getKeys) .map(keys -> keys.get(partitionColumn)) .distinct() @@ -326,14 +326,14 @@ private Estimate calculateNullsFractionForPartitioningKey( Estimate rowCount, OptionalDouble rowsPerPartition) { - if (rowCount.isValueUnknown()) { - return Estimate.unknownValue(); + if (rowCount.isUnknown()) { + return Estimate.unknown(); } if (rowCount.getValue() == 0.0) { - return Estimate.zeroValue(); + return Estimate.zero(); } if (!rowsPerPartition.isPresent()) { - return Estimate.unknownValue(); + return Estimate.unknown(); } double estimatedNullsCount = queriedPartitions.stream() @@ -341,7 +341,7 @@ private Estimate calculateNullsFractionForPartitioningKey( .map(HivePartition::getPartitionId) .mapToDouble(partitionId -> 
orElse(statisticsSample.get(partitionId).getBasicStatistics().getRowCount(), rowsPerPartition.getAsDouble())) .sum(); - return new Estimate(estimatedNullsCount / rowCount.getValue()); + return Estimate.of(estimatedNullsCount / rowCount.getValue()); } private Estimate calculateDataSizeForPartitioningKey( @@ -351,14 +351,14 @@ private Estimate calculateDataSizeForPartitioningKey( Estimate rowCount, OptionalDouble rowsPerPartition) { - if (rowCount.isValueUnknown() || !rowsPerPartition.isPresent()) { - return Estimate.unknownValue(); + if (rowCount.isUnknown() || !rowsPerPartition.isPresent()) { + return Estimate.unknown(); } String baseType = partitionColumn.getTypeSignature().getBase(); if (!VARCHAR.equals(baseType) && !CHAR.equalsIgnoreCase(baseType)) { // TODO support VARBINARY - return Estimate.unknownValue(); + return Estimate.unknown(); } double knownRowCount = 0; @@ -379,10 +379,10 @@ private Estimate calculateDataSizeForPartitioningKey( } if (knownRowCount == 0) { - return Estimate.unknownValue(); + return Estimate.unknown(); } - return new Estimate(knownDataSize / knownRowCount * rowCount.getValue()); + return Estimate.of(knownDataSize / knownRowCount * rowCount.getValue()); } private Estimate summarizePartitionStatistics( @@ -402,9 +402,9 @@ private Estimate summarizePartitionStatistics( OptionalDouble statisticsValue = valueAggregateFunction.apply(intermediateStream); if (!statisticsValue.isPresent()) { - return Estimate.unknownValue(); + return Estimate.unknown(); } - return new Estimate(statisticsValue.getAsDouble()); + return Estimate.of(statisticsValue.getAsDouble()); } private Map getPartitionsStatistics(HiveTableHandle tableHandle, List hivePartitions) diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/AbstractTestHiveClient.java b/presto-hive/src/test/java/com/facebook/presto/hive/AbstractTestHiveClient.java index 7ed5bd34b1458..9aed55b089e57 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/AbstractTestHiveClient.java +++ 
b/presto-hive/src/test/java/com/facebook/presto/hive/AbstractTestHiveClient.java @@ -1309,7 +1309,7 @@ private void assertTableStatsComputed( ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); TableStatistics tableStatistics = metadata.getTableStatistics(session, tableHandle, Constraint.alwaysTrue()); - assertFalse(tableStatistics.getRowCount().isValueUnknown(), "row count is unknown"); + assertFalse(tableStatistics.getRowCount().isUnknown(), "row count is unknown"); Map columnsStatistics = tableStatistics .getColumnStatistics() @@ -1328,21 +1328,21 @@ private void assertTableStatsComputed( Type columnType = metadata.getColumnMetadata(session, tableHandle, columnHandle).getType(); assertFalse( - columnStatistics.getNullsFraction().isValueUnknown(), + columnStatistics.getNullsFraction().isUnknown(), "unknown nulls fraction for " + columnName); assertFalse( - columnStatistics.getDistinctValuesCount().isValueUnknown(), + columnStatistics.getDistinctValuesCount().isUnknown(), "unknown distinct values count for " + columnName); if (isVarcharType(columnType)) { assertFalse( - columnStatistics.getDataSize().isValueUnknown(), + columnStatistics.getDataSize().isUnknown(), "unknown data size for " + columnName); } else { assertTrue( - columnStatistics.getDataSize().isValueUnknown(), + columnStatistics.getDataSize().isUnknown(), "unknown data size for" + columnName); } }); diff --git a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java index 20d83564996c5..b08f0ac4173c1 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java @@ -347,7 +347,7 @@ private Expression lowHighAsLiteral(Type valueType, Optional value) private static Expression createStatisticValueOrNull(Estimate estimate) { - if (estimate.isValueUnknown()) { + if 
(estimate.isUnknown()) { return NULL_DOUBLE; } return new DoubleLiteral(Double.toString(estimate.getValue())); diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java index 59ab5989a8713..0ce862828c53f 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java @@ -35,17 +35,17 @@ public ColumnStatistics( Optional highValue) { this.nullsFraction = requireNonNull(nullsFraction, "nullsFraction is null"); - if (!nullsFraction.isValueUnknown()) { + if (!nullsFraction.isUnknown()) { if (nullsFraction.getValue() < 0 || nullsFraction.getValue() > 1) { throw new IllegalArgumentException(format("nullsFraction must be between 0 and 1: %s", nullsFraction.getValue())); } } this.distinctValuesCount = requireNonNull(distinctValuesCount, "distinctValuesCount is null"); - if (!distinctValuesCount.isValueUnknown() && distinctValuesCount.getValue() < 0) { + if (!distinctValuesCount.isUnknown() && distinctValuesCount.getValue() < 0) { throw new IllegalArgumentException(format("distinctValuesCount must be greater than or equal to 0: %s", distinctValuesCount.getValue())); } this.dataSize = requireNonNull(dataSize, "dataSize is null"); - if (!dataSize.isValueUnknown() && dataSize.getValue() < 0) { + if (!dataSize.isUnknown() && dataSize.getValue() < 0) { throw new IllegalArgumentException(format("dataSize must be greater than or equal to 0: %s", dataSize.getValue())); } this.lowValue = requireNonNull(lowValue, "lowValue is null"); @@ -119,9 +119,9 @@ public static Builder builder() public static final class Builder { - private Estimate nullsFraction = Estimate.unknownValue(); - private Estimate distinctValuesCount = Estimate.unknownValue(); - private Estimate dataSize = Estimate.unknownValue(); + private Estimate nullsFraction = 
Estimate.unknown(); + private Estimate distinctValuesCount = Estimate.unknown(); + private Estimate dataSize = Estimate.unknown(); private Optional lowValue = Optional.empty(); private Optional highValue = Optional.empty(); diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/Estimate.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/Estimate.java index 1b04992473ed4..de101724abc2e 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/Estimate.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/Estimate.java @@ -15,8 +15,9 @@ package com.facebook.presto.spi.statistics; import java.util.Objects; -import java.util.function.Function; +import static java.lang.Double.NaN; +import static java.lang.Double.isInfinite; import static java.lang.Double.isNaN; public final class Estimate @@ -25,27 +26,38 @@ public final class Estimate // Skipping for now as there hard to compute it properly and so far we do not have // usecase for that. 
- private static final Estimate UNKNOWN = new Estimate(Double.NaN); + private static final Estimate UNKNOWN = new Estimate(NaN); private static final Estimate ZERO = new Estimate(0); private final double value; - public static Estimate unknownValue() + public static Estimate unknown() { return UNKNOWN; } - public static Estimate zeroValue() + public static Estimate zero() { return ZERO; } - public Estimate(double value) + public static Estimate of(double value) + { + if (isNaN(value)) { + throw new IllegalArgumentException("value is NaN"); + } + if (isInfinite(value)) { + throw new IllegalArgumentException("value is infinite"); + } + return new Estimate(value); + } + + private Estimate(double value) { this.value = value; } - public boolean isValueUnknown() + public boolean isUnknown() { return isNaN(value); } @@ -55,16 +67,6 @@ public double getValue() return value; } - public Estimate map(Function mappingFunction) - { - if (isValueUnknown()) { - return this; - } - else { - return new Estimate(mappingFunction.apply(value)); - } - } - @Override public boolean equals(Object o) { diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java index a56dc8a17d135..7fee02ac4fc11 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java @@ -20,7 +20,6 @@ import java.util.Map; import java.util.Objects; -import static com.facebook.presto.spi.statistics.Estimate.unknownValue; import static java.lang.String.format; import static java.util.Collections.unmodifiableMap; import static java.util.Objects.requireNonNull; @@ -35,7 +34,7 @@ public final class TableStatistics public TableStatistics(Estimate rowCount, Map columnStatistics) { this.rowCount = requireNonNull(rowCount, "rowCount can not be null"); - if (!rowCount.isValueUnknown() && 
rowCount.getValue() < 0) { + if (!rowCount.isUnknown() && rowCount.getValue() < 0) { throw new IllegalArgumentException(format("rowCount must be greater than or equal to 0: %s", rowCount.getValue())); } this.columnStatistics = unmodifiableMap(requireNonNull(columnStatistics, "columnStatistics can not be null")); @@ -78,7 +77,7 @@ public static Builder builder() public static final class Builder { - private Estimate rowCount = unknownValue(); + private Estimate rowCount = Estimate.unknown(); private Map columnStatisticsMap = new HashMap<>(); public Builder setRowCount(Estimate rowCount) diff --git a/presto-tpcds/src/main/java/com/facebook/presto/tpcds/statistics/TpcdsTableStatisticsFactory.java b/presto-tpcds/src/main/java/com/facebook/presto/tpcds/statistics/TpcdsTableStatisticsFactory.java index a800ab8ae23cc..1809ce0c479f3 100644 --- a/presto-tpcds/src/main/java/com/facebook/presto/tpcds/statistics/TpcdsTableStatisticsFactory.java +++ b/presto-tpcds/src/main/java/com/facebook/presto/tpcds/statistics/TpcdsTableStatisticsFactory.java @@ -53,7 +53,7 @@ private TableStatistics toTableStatistics(Map columnHandle { long rowCount = statisticsData.getRowCount(); TableStatistics.Builder tableStatistics = TableStatistics.builder() - .setRowCount(new Estimate(rowCount)); + .setRowCount(Estimate.of(rowCount)); if (rowCount > 0) { Map columnsData = statisticsData.getColumns(); @@ -70,11 +70,11 @@ private ColumnStatistics toColumnStatistics(ColumnStatisticsData columnStatistic { ColumnStatistics.Builder columnStatistics = ColumnStatistics.builder(); long nullCount = columnStatisticsData.getNullsCount(); - columnStatistics.setNullsFraction(new Estimate((double) nullCount / rowCount)); + columnStatistics.setNullsFraction(Estimate.of((double) nullCount / rowCount)); columnStatistics.setLowValue(columnStatisticsData.getMin().map(value -> toPrestoValue(value, type))); columnStatistics.setHighValue(columnStatisticsData.getMax().map(value -> toPrestoValue(value, type))); - 
columnStatistics.setDistinctValuesCount(new Estimate(columnStatisticsData.getDistinctValuesCount())); - columnStatistics.setDataSize(columnStatisticsData.getDataSize().map(Estimate::new).orElse(Estimate.unknownValue())); + columnStatistics.setDistinctValuesCount(Estimate.of(columnStatisticsData.getDistinctValuesCount())); + columnStatistics.setDataSize(columnStatisticsData.getDataSize().map(Estimate::of).orElse(Estimate.unknown())); return columnStatistics.build(); } diff --git a/presto-tpcds/src/test/java/com/facebook/presto/tpcds/EstimateAssertion.java b/presto-tpcds/src/test/java/com/facebook/presto/tpcds/EstimateAssertion.java index 7374ad1634692..2710d9a533a4f 100644 --- a/presto-tpcds/src/test/java/com/facebook/presto/tpcds/EstimateAssertion.java +++ b/presto-tpcds/src/test/java/com/facebook/presto/tpcds/EstimateAssertion.java @@ -39,7 +39,7 @@ public void assertClose(Estimate actual, Estimate expected, String comparedValue private Optional toOptional(Estimate estimate) { - return estimate.isValueUnknown() ? empty() : Optional.of(estimate.getValue()); + return estimate.isUnknown() ? 
empty() : Optional.of(estimate.getValue()); } public void assertClose(Optional actual, Optional expected, String comparedValue) diff --git a/presto-tpcds/src/test/java/com/facebook/presto/tpcds/TestTpcdsMetadataStatistics.java b/presto-tpcds/src/test/java/com/facebook/presto/tpcds/TestTpcdsMetadataStatistics.java index e3e2f06f6d620..4051a3e8e1553 100644 --- a/presto-tpcds/src/test/java/com/facebook/presto/tpcds/TestTpcdsMetadataStatistics.java +++ b/presto-tpcds/src/test/java/com/facebook/presto/tpcds/TestTpcdsMetadataStatistics.java @@ -53,7 +53,7 @@ public void testNoTableStatsForNotSupportedSchema() SchemaTableName schemaTableName = new SchemaTableName(schemaName, table.getName()); ConnectorTableHandle tableHandle = metadata.getTableHandle(session, schemaTableName); TableStatistics tableStatistics = metadata.getTableStatistics(session, tableHandle, alwaysTrue()); - assertTrue(tableStatistics.getRowCount().isValueUnknown()); + assertTrue(tableStatistics.getRowCount().isUnknown()); assertTrue(tableStatistics.getColumnStatistics().isEmpty()); })); } @@ -67,7 +67,7 @@ public void testTableStatsExistenceSupportedSchema() SchemaTableName schemaTableName = new SchemaTableName(schemaName, table.getName()); ConnectorTableHandle tableHandle = metadata.getTableHandle(session, schemaTableName); TableStatistics tableStatistics = metadata.getTableStatistics(session, tableHandle, alwaysTrue()); - assertFalse(tableStatistics.getRowCount().isValueUnknown()); + assertFalse(tableStatistics.getRowCount().isUnknown()); for (ColumnHandle column : metadata.getColumnHandles(session, tableHandle).values()) { assertTrue(tableStatistics.getColumnStatistics().containsKey(column)); assertNotNull(tableStatistics.getColumnStatistics().get(column)); @@ -92,7 +92,7 @@ public void testTableStatsDetails() ConnectorTableHandle tableHandle = metadata.getTableHandle(session, schemaTableName); TableStatistics tableStatistics = metadata.getTableStatistics(session, tableHandle, alwaysTrue()); - 
estimateAssertion.assertClose(tableStatistics.getRowCount(), new Estimate(6), "Row count does not match"); + estimateAssertion.assertClose(tableStatistics.getRowCount(), Estimate.of(6), "Row count does not match"); // all columns have stats Map columnHandles = metadata.getColumnHandles(session, tableHandle); @@ -105,8 +105,8 @@ public void testTableStatsDetails() assertColumnStatistics( tableStatistics.getColumnStatistics().get(columnHandles.get(CallCenterColumn.CC_CALL_CENTER_SK.getName())), ColumnStatistics.builder() - .setNullsFraction(new Estimate(0)) - .setDistinctValuesCount(new Estimate(6)) + .setNullsFraction(Estimate.of(0)) + .setDistinctValuesCount(Estimate.of(6)) .setLowValue(Optional.of(1L)) .setHighValue(Optional.of(6L)) .build()); @@ -115,30 +115,30 @@ public void testTableStatsDetails() assertColumnStatistics( tableStatistics.getColumnStatistics().get(columnHandles.get(CallCenterColumn.CC_CALL_CENTER_ID.getName())), ColumnStatistics.builder() - .setNullsFraction(new Estimate(0)) - .setDistinctValuesCount(new Estimate(3)) + .setNullsFraction(Estimate.of(0)) + .setDistinctValuesCount(Estimate.of(3)) .setLowValue(Optional.of(Slices.utf8Slice("AAAAAAAABAAAAAAA"))) .setHighValue(Optional.of(Slices.utf8Slice("AAAAAAAAEAAAAAAA"))) - .setDataSize(new Estimate(48.0)) + .setDataSize(Estimate.of(48.0)) .build()); // char assertColumnStatistics( tableStatistics.getColumnStatistics().get(columnHandles.get(CallCenterColumn.CC_ZIP.getName())), ColumnStatistics.builder() - .setNullsFraction(new Estimate(0)) - .setDistinctValuesCount(new Estimate(1)) + .setNullsFraction(Estimate.of(0)) + .setDistinctValuesCount(Estimate.of(1)) .setLowValue(Optional.of(Slices.utf8Slice("31904"))) .setHighValue(Optional.of(Slices.utf8Slice("31904"))) - .setDataSize(new Estimate(5.0)) + .setDataSize(Estimate.of(5.0)) .build()); // decimal assertColumnStatistics( tableStatistics.getColumnStatistics().get(columnHandles.get(CallCenterColumn.CC_GMT_OFFSET.getName())), 
ColumnStatistics.builder() - .setNullsFraction(new Estimate(0)) - .setDistinctValuesCount(new Estimate(1)) + .setNullsFraction(Estimate.of(0)) + .setDistinctValuesCount(Estimate.of(1)) .setLowValue(Optional.of(-500L)) .setHighValue(Optional.of(-500L)) .build()); @@ -147,8 +147,8 @@ public void testTableStatsDetails() assertColumnStatistics( tableStatistics.getColumnStatistics().get(columnHandles.get(CallCenterColumn.CC_REC_START_DATE.getName())), ColumnStatistics.builder() - .setNullsFraction(new Estimate(0)) - .setDistinctValuesCount(new Estimate(4)) + .setNullsFraction(Estimate.of(0)) + .setDistinctValuesCount(Estimate.of(4)) .setLowValue(Optional.of(10227L)) .setHighValue(Optional.of(11688L)) .build()); @@ -157,8 +157,8 @@ public void testTableStatsDetails() assertColumnStatistics( tableStatistics.getColumnStatistics().get(columnHandles.get(CallCenterColumn.CC_CLOSED_DATE_SK.getName())), ColumnStatistics.builder() - .setNullsFraction(new Estimate(1)) - .setDistinctValuesCount(new Estimate(0)) + .setNullsFraction(Estimate.of(1)) + .setDistinctValuesCount(Estimate.of(0)) .setLowValue(Optional.empty()) .setHighValue(Optional.empty()) .build()); @@ -177,8 +177,8 @@ public void testNullFraction() assertColumnStatistics( tableStatistics.getColumnStatistics().get(columnHandles.get(WebSiteColumn.WEB_REC_END_DATE.getName())), ColumnStatistics.builder() - .setNullsFraction(new Estimate(0.5)) - .setDistinctValuesCount(new Estimate(3)) + .setNullsFraction(Estimate.of(0.5)) + .setDistinctValuesCount(Estimate.of(3)) .setLowValue(Optional.of(10819L)) .setHighValue(Optional.of(11549L)) .build()); diff --git a/presto-tpch/src/main/java/com/facebook/presto/tpch/TpchMetadata.java b/presto-tpch/src/main/java/com/facebook/presto/tpch/TpchMetadata.java index ed70093825245..0ecbead3d8169 100644 --- a/presto-tpch/src/main/java/com/facebook/presto/tpch/TpchMetadata.java +++ b/presto-tpch/src/main/java/com/facebook/presto/tpch/TpchMetadata.java @@ -362,7 +362,7 @@ private Map, List> 
avoidTrivialOrderStatusRestriction(List private TableStatistics toTableStatistics(TableStatisticsData tableStatisticsData, TpchTableHandle tpchTableHandle, Map columnHandles) { TableStatistics.Builder builder = TableStatistics.builder() - .setRowCount(new Estimate(tableStatisticsData.getRowCount())); + .setRowCount(Estimate.of(tableStatisticsData.getRowCount())); tableStatisticsData.getColumns().forEach((columnName, stats) -> { TpchColumnHandle columnHandle = (TpchColumnHandle) getColumnHandle(tpchTableHandle, columnHandles, columnName); builder.setColumnStatistics(columnHandle, toColumnStatistics(stats, columnHandle.getType())); @@ -379,9 +379,9 @@ private ColumnHandle getColumnHandle(TpchTableHandle tpchTableHandle, Map toPrestoValue(value, columnType))) .setHighValue(stats.getMax().map(value -> toPrestoValue(value, columnType))) .build(); diff --git a/presto-tpch/src/test/java/com/facebook/presto/tpch/EstimateAssertion.java b/presto-tpch/src/test/java/com/facebook/presto/tpch/EstimateAssertion.java index 35905ee29f6ae..f03c91bbef8c3 100644 --- a/presto-tpch/src/test/java/com/facebook/presto/tpch/EstimateAssertion.java +++ b/presto-tpch/src/test/java/com/facebook/presto/tpch/EstimateAssertion.java @@ -38,7 +38,7 @@ public void assertClose(Estimate actual, Estimate expected, String comparedValue private Optional toOptional(Estimate estimate) { - return estimate.isValueUnknown() ? empty() : Optional.of(estimate.getValue()); + return estimate.isUnknown() ? 
empty() : Optional.of(estimate.getValue()); } public void assertClose(Optional actual, Optional expected, String comparedValue) diff --git a/presto-tpch/src/test/java/com/facebook/presto/tpch/TestTpchMetadata.java b/presto-tpch/src/test/java/com/facebook/presto/tpch/TestTpchMetadata.java index e2b186fee3fe7..78a4b75f44ec5 100644 --- a/presto-tpch/src/test/java/com/facebook/presto/tpch/TestTpchMetadata.java +++ b/presto-tpch/src/test/java/com/facebook/presto/tpch/TestTpchMetadata.java @@ -42,7 +42,6 @@ import static com.facebook.presto.spi.Constraint.alwaysFalse; import static com.facebook.presto.spi.Constraint.alwaysTrue; -import static com.facebook.presto.spi.statistics.Estimate.zeroValue; import static com.facebook.presto.tpch.TpchMetadata.getPrestoType; import static com.facebook.presto.tpch.util.PredicateUtils.filterOutColumnFromPredicate; import static com.google.common.collect.Iterables.getOnlyElement; @@ -140,7 +139,7 @@ private void testTableStats(String schema, TpchTable table, Constraint table) { TpchTableHandle tableHandle = tpchMetadata.getTableHandle(session, new SchemaTableName(schema, table.getTableName())); TableStatistics tableStatistics = tpchMetadata.getTableStatistics(session, tableHandle, alwaysTrue()); - assertTrue(tableStatistics.getRowCount().isValueUnknown()); + assertTrue(tableStatistics.getRowCount().isUnknown()); } @Test @@ -407,7 +406,7 @@ private ColumnStatistics rangeStatistics(double min, double max) private static ColumnStatistics createColumnStatistics(Optional distinctValuesCount, Optional min, Optional max, Optional dataSize) { return ColumnStatistics.builder() - .setNullsFraction(zeroValue()) + .setNullsFraction(Estimate.zero()) .setDistinctValuesCount(toEstimate(distinctValuesCount)) .setLowValue(min) .setHighValue(max) @@ -418,7 +417,7 @@ private static ColumnStatistics createColumnStatistics(Optional distinct private static Estimate toEstimate(Optional value) { return value - .map(Estimate::new) - 
.orElse(Estimate.unknownValue()); + .map(Estimate::of) + .orElse(Estimate.unknown()); } } From 31ce36963e537947f0b785f0cd884d3c7805d6bf Mon Sep 17 00:00:00 2001 From: Andrii Rosa Date: Wed, 19 Sep 2018 14:03:10 -0400 Subject: [PATCH 03/14] Add TableStatistics#empty() method Instead of TableStatistics#EMPTY_STATISTICS static field --- .../main/java/com/facebook/presto/hive/HiveMetadata.java | 3 +-- .../hive/statistics/MetastoreHiveStatisticsProvider.java | 2 +- .../facebook/presto/spi/connector/ConnectorMetadata.java | 3 +-- .../facebook/presto/spi/statistics/TableStatistics.java | 7 ++++++- .../tpcds/statistics/TpcdsTableStatisticsFactory.java | 2 +- .../main/java/com/facebook/presto/tpch/TpchMetadata.java | 2 +- 6 files changed, 11 insertions(+), 8 deletions(-) diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java index 8a332be43bfb8..91b226dffef04 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java @@ -180,7 +180,6 @@ import static com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED; import static com.facebook.presto.spi.StandardErrorCode.SCHEMA_NOT_EMPTY; import static com.facebook.presto.spi.predicate.TupleDomain.withColumnDomains; -import static com.facebook.presto.spi.statistics.TableStatistics.EMPTY_STATISTICS; import static com.google.common.base.MoreObjects.firstNonNull; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Verify.verify; @@ -525,7 +524,7 @@ public Map> listTableColumns(ConnectorSess public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTableHandle tableHandle, Constraint constraint) { if (!isStatisticsEnabled(session)) { - return EMPTY_STATISTICS; + return TableStatistics.empty(); } List hivePartitions = getPartitionsAsList(tableHandle, constraint); Map 
tableColumns = getColumnHandles(session, tableHandle) diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java index e46cba25f0972..95aee594a1376 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java @@ -96,7 +96,7 @@ public MetastoreHiveStatisticsProvider(TypeManager typeManager, SemiTransactiona public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTableHandle tableHandle, List queriedPartitions, Map tableColumns) { if (!isStatisticsEnabled(session)) { - return TableStatistics.EMPTY_STATISTICS; + return TableStatistics.empty(); } int queriedPartitionsCount = queriedPartitions.size(); diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/connector/ConnectorMetadata.java b/presto-spi/src/main/java/com/facebook/presto/spi/connector/ConnectorMetadata.java index 04e1079b4c43f..e4669cab77d29 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/connector/ConnectorMetadata.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/connector/ConnectorMetadata.java @@ -49,7 +49,6 @@ import static com.facebook.presto.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR; import static com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED; -import static com.facebook.presto.spi.statistics.TableStatistics.EMPTY_STATISTICS; import static java.util.Collections.emptyList; import static java.util.Collections.emptyMap; import static java.util.stream.Collectors.toList; @@ -158,7 +157,7 @@ default List listTables(ConnectorSession session, Optional constraint) { - return EMPTY_STATISTICS; + return TableStatistics.empty(); } /** diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java 
b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java index 7fee02ac4fc11..f4c3c22323457 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java @@ -26,11 +26,16 @@ public final class TableStatistics { - public static final TableStatistics EMPTY_STATISTICS = TableStatistics.builder().build(); + private static final TableStatistics EMPTY = TableStatistics.builder().build(); private final Estimate rowCount; private final Map columnStatistics; + public static TableStatistics empty() + { + return EMPTY; + } + public TableStatistics(Estimate rowCount, Map columnStatistics) { this.rowCount = requireNonNull(rowCount, "rowCount can not be null"); diff --git a/presto-tpcds/src/main/java/com/facebook/presto/tpcds/statistics/TpcdsTableStatisticsFactory.java b/presto-tpcds/src/main/java/com/facebook/presto/tpcds/statistics/TpcdsTableStatisticsFactory.java index 1809ce0c479f3..ef6f503a798f1 100644 --- a/presto-tpcds/src/main/java/com/facebook/presto/tpcds/statistics/TpcdsTableStatisticsFactory.java +++ b/presto-tpcds/src/main/java/com/facebook/presto/tpcds/statistics/TpcdsTableStatisticsFactory.java @@ -46,7 +46,7 @@ public TableStatistics create(String schemaName, Table table, Map statisticsDataOptional = statisticsDataRepository.load(schemaName, table); return statisticsDataOptional.map(statisticsData -> toTableStatistics(columnHandles, statisticsData)) - .orElse(TableStatistics.EMPTY_STATISTICS); + .orElse(TableStatistics.empty()); } private TableStatistics toTableStatistics(Map columnHandles, TableStatisticsData statisticsData) diff --git a/presto-tpch/src/main/java/com/facebook/presto/tpch/TpchMetadata.java b/presto-tpch/src/main/java/com/facebook/presto/tpch/TpchMetadata.java index 0ecbead3d8169..53dcba6be6ac1 100644 --- a/presto-tpch/src/main/java/com/facebook/presto/tpch/TpchMetadata.java +++ 
b/presto-tpch/src/main/java/com/facebook/presto/tpch/TpchMetadata.java @@ -323,7 +323,7 @@ public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTab Map columnHandles = getColumnHandles(session, tpchTableHandle); return optionalTableStatisticsData .map(tableStatisticsData -> toTableStatistics(optionalTableStatisticsData.get(), tpchTableHandle, columnHandles)) - .orElse(TableStatistics.EMPTY_STATISTICS); + .orElse(TableStatistics.empty()); } private Map, List> getColumnValuesRestrictions(TpchTable tpchTable, Constraint constraint) From aed92f8e1724cadd56d6f1cdf1234076349e09e5 Mon Sep 17 00:00:00 2001 From: Andrii Rosa Date: Wed, 19 Sep 2018 14:03:11 -0400 Subject: [PATCH 04/14] Refactor TableStatistics --- .../presto/spi/statistics/TableStatistics.java | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java index f4c3c22323457..af75c0fd46311 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java @@ -75,6 +75,15 @@ public int hashCode() return Objects.hash(rowCount, columnStatistics); } + @Override + public String toString() + { + return "TableStatistics{" + + "rowCount=" + rowCount + + ", columnStatistics=" + columnStatistics + + '}'; + } + public static Builder builder() { return new Builder(); @@ -91,11 +100,11 @@ public Builder setRowCount(Estimate rowCount) return this; } - public Builder setColumnStatistics(ColumnHandle columnName, ColumnStatistics columnStatistics) + public Builder setColumnStatistics(ColumnHandle columnHandle, ColumnStatistics columnStatistics) { - requireNonNull(columnName, "columnName can not be null"); + requireNonNull(columnHandle, "columnHandle can not be null"); requireNonNull(columnStatistics, 
"columnStatistics can not be null"); - this.columnStatisticsMap.put(columnName, columnStatistics); + this.columnStatisticsMap.put(columnHandle, columnStatistics); return this; } From 1ff1e28f2a1492369923d84954befeed923c8461 Mon Sep 17 00:00:00 2001 From: Andrii Rosa Date: Wed, 19 Sep 2018 14:03:12 -0400 Subject: [PATCH 05/14] Extract parsePartition in PartitionManager --- .../facebook/presto/hive/HiveMetadata.java | 4 +- .../presto/hive/HivePartitionManager.java | 37 ++++++++++----- .../metastore/file/FileHiveMetastore.java | 6 +-- .../TestMetastoreHiveStatisticsProvider.java | 46 +++++++++++-------- 4 files changed, 58 insertions(+), 35 deletions(-) diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java index 91b226dffef04..da7b438317206 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java @@ -123,7 +123,7 @@ import static com.facebook.presto.hive.HiveErrorCode.HIVE_UNKNOWN_ERROR; import static com.facebook.presto.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT; import static com.facebook.presto.hive.HiveErrorCode.HIVE_WRITER_CLOSE_ERROR; -import static com.facebook.presto.hive.HivePartitionManager.extractPartitionKeyValues; +import static com.facebook.presto.hive.HivePartitionManager.extractPartitionValues; import static com.facebook.presto.hive.HiveSessionProperties.getHiveStorageFormat; import static com.facebook.presto.hive.HiveSessionProperties.isBucketExecutionEnabled; import static com.facebook.presto.hive.HiveSessionProperties.isCollectColumnStatisticsOnWrite; @@ -1271,7 +1271,7 @@ private Partition buildPartitionObject(ConnectorSession session, Table table, Pa .setDatabaseName(table.getDatabaseName()) .setTableName(table.getTableName()) .setColumns(table.getDataColumns()) - .setValues(extractPartitionKeyValues(partitionUpdate.getName())) + 
.setValues(extractPartitionValues(partitionUpdate.getName())) .setParameters(ImmutableMap.builder() .put(PRESTO_VERSION_NAME, prestoVersion) .put(PRESTO_QUERY_ID_NAME, session.getQueryId()) diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HivePartitionManager.java b/presto-hive/src/main/java/com/facebook/presto/hive/HivePartitionManager.java index 976a4586b0d87..de0c970bd861d 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/HivePartitionManager.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/HivePartitionManager.java @@ -181,27 +181,22 @@ private Optional parseValuesAndFilterPartition( List partitionColumnTypes, Constraint constraint) { - List keys = extractPartitionKeyValues(partitionId); + HivePartition partition = parsePartition(tableName, partitionId, partitionColumns, partitionColumnTypes, timeZone); Map domains = constraint.getSummary().getDomains().get(); - ImmutableMap.Builder builder = ImmutableMap.builder(); - for (int i = 0; i < partitionColumns.size(); i++) { - HiveColumnHandle column = partitionColumns.get(i); - NullableValue parsedValue = parsePartitionValue(partitionId, keys.get(i), partitionColumnTypes.get(i), timeZone); - + for (HiveColumnHandle column : partitionColumns) { + NullableValue value = partition.getKeys().get(column); Domain allowedDomain = domains.get(column); - if (allowedDomain != null && !allowedDomain.includesNullableValue(parsedValue.getValue())) { + if (allowedDomain != null && !allowedDomain.includesNullableValue(value.getValue())) { return Optional.empty(); } - builder.put(column, parsedValue); } - Map values = builder.build(); - if (constraint.predicate().isPresent() && !constraint.predicate().get().test(values)) { + if (constraint.predicate().isPresent() && !constraint.predicate().get().test(partition.getKeys())) { return Optional.empty(); } - return Optional.of(new HivePartition(tableName, partitionId, values)); + return Optional.of(partition); } private Table 
getTable(SemiTransactionalHiveMetastore metastore, SchemaTableName tableName) @@ -282,7 +277,25 @@ else if (type instanceof TinyintType .orElseThrow(() -> new TableNotFoundException(tableName)); } - public static List extractPartitionKeyValues(String partitionName) + public static HivePartition parsePartition( + SchemaTableName tableName, + String partitionName, + List partitionColumns, + List partitionColumnTypes, + DateTimeZone timeZone) + { + List partitionValues = extractPartitionValues(partitionName); + ImmutableMap.Builder builder = ImmutableMap.builder(); + for (int i = 0; i < partitionColumns.size(); i++) { + HiveColumnHandle column = partitionColumns.get(i); + NullableValue parsedValue = parsePartitionValue(partitionName, partitionValues.get(i), partitionColumnTypes.get(i), timeZone); + builder.put(column, parsedValue); + } + Map values = builder.build(); + return new HivePartition(tableName, partitionName, values); + } + + public static List extractPartitionValues(String partitionName) { ImmutableList.Builder values = ImmutableList.builder(); diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/metastore/file/FileHiveMetastore.java b/presto-hive/src/main/java/com/facebook/presto/hive/metastore/file/FileHiveMetastore.java index ae95f709068a6..c30e39e444747 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/metastore/file/FileHiveMetastore.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/metastore/file/FileHiveMetastore.java @@ -79,7 +79,7 @@ import static com.facebook.presto.hive.HiveErrorCode.HIVE_METASTORE_ERROR; import static com.facebook.presto.hive.HiveErrorCode.HIVE_PARTITION_DROPPED_DURING_QUERY; -import static com.facebook.presto.hive.HivePartitionManager.extractPartitionKeyValues; +import static com.facebook.presto.hive.HivePartitionManager.extractPartitionValues; import static com.facebook.presto.hive.HiveUtil.toPartitionValues; import static com.facebook.presto.hive.metastore.Database.DEFAULT_DATABASE_NAME; 
import static com.facebook.presto.hive.metastore.HivePrivilegeInfo.HivePrivilege.OWNERSHIP; @@ -302,7 +302,7 @@ public synchronized Map getPartitionStatistics(Stri Table table = getRequiredTable(databaseName, tableName); ImmutableMap.Builder statistics = ImmutableMap.builder(); for (String partitionName : partitionNames) { - List partitionValues = extractPartitionKeyValues(partitionName); + List partitionValues = extractPartitionValues(partitionName); Path partitionDirectory = getPartitionMetadataDirectory(table, ImmutableList.copyOf(partitionValues)); PartitionMetadata partitionMetadata = readSchemaFile("partition", partitionDirectory, partitionCodec) .orElseThrow(() -> new PartitionNotFoundException(new SchemaTableName(databaseName, tableName), partitionValues)); @@ -352,7 +352,7 @@ public synchronized void updatePartitionStatistics(String databaseName, String t PartitionStatistics updatedStatistics = update.apply(originalStatistics); Table table = getRequiredTable(databaseName, tableName); - List partitionValues = extractPartitionKeyValues(partitionName); + List partitionValues = extractPartitionValues(partitionName); Path partitionDirectory = getPartitionMetadataDirectory(table, partitionValues); PartitionMetadata partitionMetadata = readSchemaFile("partition", partitionDirectory, partitionCodec) .orElseThrow(() -> new PartitionNotFoundException(new SchemaTableName(databaseName, tableName), partitionValues)); diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java b/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java index 5120fbf0c57e2..f900a2db9b5a2 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java @@ -13,41 +13,51 @@ */ package com.facebook.presto.hive.statistics; +import 
com.facebook.presto.hive.HiveColumnHandle; import com.facebook.presto.hive.HivePartition; import com.facebook.presto.spi.SchemaTableName; import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; +import org.joda.time.DateTimeZone; import org.testng.annotations.Test; +import java.util.Optional; + +import static com.facebook.presto.hive.HiveColumnHandle.ColumnType.PARTITION_KEY; +import static com.facebook.presto.hive.HivePartitionManager.parsePartition; +import static com.facebook.presto.hive.HiveType.HIVE_LONG; +import static com.facebook.presto.hive.HiveType.HIVE_STRING; import static com.facebook.presto.hive.statistics.MetastoreHiveStatisticsProvider.getPartitionsSample; +import static com.facebook.presto.spi.type.BigintType.BIGINT; +import static com.facebook.presto.spi.type.VarcharType.VARCHAR; import static org.assertj.core.api.Assertions.assertThat; import static org.testng.Assert.assertEquals; public class TestMetastoreHiveStatisticsProvider { + private static final SchemaTableName TABLE = new SchemaTableName("schema", "table"); + + private static final HiveColumnHandle PARTITION_COLUMN_1 = new HiveColumnHandle("p1", HIVE_STRING, VARCHAR.getTypeSignature(), 0, PARTITION_KEY, Optional.empty()); + private static final HiveColumnHandle PARTITION_COLUMN_2 = new HiveColumnHandle("p2", HIVE_LONG, BIGINT.getTypeSignature(), 1, PARTITION_KEY, Optional.empty()); + @Test public void testGetPartitionsSample() { - assertEquals(getPartitionsSample(ImmutableList.of(partition("p1")), 1), ImmutableList.of(partition("p1"))); - assertEquals(getPartitionsSample(ImmutableList.of(partition("p1")), 2), ImmutableList.of(partition("p1"))); - assertEquals( - getPartitionsSample(ImmutableList.of(partition("p1"), partition("p2")), 2), - ImmutableList.of(partition("p1"), partition("p2"))); - assertEquals( - getPartitionsSample(ImmutableList.of(partition("p1"), partition("p2"), partition("p3")), 2), - ImmutableList.of(partition("p1"), 
partition("p3"))); - assertEquals( - getPartitionsSample(ImmutableList.of(partition("p1"), partition("p2"), partition("p3"), partition("p4")), 1), - getPartitionsSample(ImmutableList.of(partition("p1"), partition("p2"), partition("p3"), partition("p4")), 1)); - assertEquals( - getPartitionsSample(ImmutableList.of(partition("p1"), partition("p2"), partition("p3"), partition("p4")), 3), - getPartitionsSample(ImmutableList.of(partition("p1"), partition("p2"), partition("p3"), partition("p4")), 3)); - assertThat(getPartitionsSample(ImmutableList.of(partition("p1"), partition("p2"), partition("p3"), partition("p4")), 3)) - .contains(partition("p1"), partition("p3")); + HivePartition p1 = partition("p1=string1/p2=1234"); + HivePartition p2 = partition("p1=string2/p2=2345"); + HivePartition p3 = partition("p1=string3/p2=3456"); + HivePartition p4 = partition("p1=string4/p2=4567"); + + assertEquals(getPartitionsSample(ImmutableList.of(p1), 1), ImmutableList.of(p1)); + assertEquals(getPartitionsSample(ImmutableList.of(p1), 2), ImmutableList.of(p1)); + assertEquals(getPartitionsSample(ImmutableList.of(p1, p2), 2), ImmutableList.of(p1, p2)); + assertEquals(getPartitionsSample(ImmutableList.of(p1, p2, p3), 2), ImmutableList.of(p1, p3)); + assertEquals(getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 1), getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 1)); + assertEquals(getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 3), getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 3)); + assertThat(getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 3)).contains(p1, p4); } private static HivePartition partition(String name) { - return new HivePartition(new SchemaTableName("schema", "table"), name, ImmutableMap.of()); + return parsePartition(TABLE, name, ImmutableList.of(PARTITION_COLUMN_1, PARTITION_COLUMN_2), ImmutableList.of(VARCHAR, BIGINT), DateTimeZone.getDefault()); } } From b3827bfb7fdd3965a37c37a62b1272947f3ad457 Mon Sep 17 00:00:00 2001 From: Andrii Rosa 
Date: Wed, 19 Sep 2018 14:03:13 -0400 Subject: [PATCH 06/14] Fix incorrect partitions sample size --- .../presto/hive/statistics/MetastoreHiveStatisticsProvider.java | 2 +- .../hive/statistics/TestMetastoreHiveStatisticsProvider.java | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java index 95aee594a1376..805db28bb86ae 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java @@ -471,7 +471,7 @@ else if (partition.getPartitionId().compareTo(max.getPartitionId()) > 0) { .filter(partition -> !result.contains(partition)) .map(partition -> immutableEntry(partition, hashFunction.hashUnencodedChars(partition.getPartitionId()).asInt())) .sorted(hashComparator) - .limit(sampleSize) + .limit(samplesLeft) .forEach(entry -> result.add(entry.getKey())); } diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java b/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java index f900a2db9b5a2..79430787aedb2 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java @@ -46,6 +46,7 @@ public void testGetPartitionsSample() HivePartition p2 = partition("p1=string2/p2=2345"); HivePartition p3 = partition("p1=string3/p2=3456"); HivePartition p4 = partition("p1=string4/p2=4567"); + HivePartition p5 = partition("p1=string5/p2=5678"); assertEquals(getPartitionsSample(ImmutableList.of(p1), 1), ImmutableList.of(p1)); 
assertEquals(getPartitionsSample(ImmutableList.of(p1), 2), ImmutableList.of(p1)); @@ -54,6 +55,7 @@ public void testGetPartitionsSample() assertEquals(getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 1), getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 1)); assertEquals(getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 3), getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 3)); assertThat(getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 3)).contains(p1, p4); + assertEquals(getPartitionsSample(ImmutableList.of(p1, p2, p3, p4, p5), 3).size(), 3); } private static HivePartition partition(String name) From f847336c88fa2578e19b4af6db6ceb20474e44d9 Mon Sep 17 00:00:00 2001 From: Andrii Rosa Date: Wed, 19 Sep 2018 14:03:14 -0400 Subject: [PATCH 07/14] Make getPartitionsSample deterministic Use deterministic murmur3 hash. goodFastHash changes its seed on every restart. --- .../statistics/MetastoreHiveStatisticsProvider.java | 13 ++++++------- .../TestMetastoreHiveStatisticsProvider.java | 4 +--- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java index 805db28bb86ae..23fd090743c05 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java @@ -72,7 +72,7 @@ import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.ImmutableSet.toImmutableSet; import static com.google.common.collect.Maps.immutableEntry; -import static com.google.common.hash.Hashing.goodFastHash; +import static com.google.common.hash.Hashing.murmur3_128; import static java.util.Collections.unmodifiableList; import static java.util.Objects.requireNonNull; @@ -454,7 +454,6 @@ 
else if (partition.getPartitionId().compareTo(max.getPartitionId()) > 0) { } } - verify(samplesLeft > 0); result.add(min); samplesLeft--; if (samplesLeft > 0) { @@ -463,16 +462,16 @@ else if (partition.getPartitionId().compareTo(max.getPartitionId()) > 0) { } if (samplesLeft > 0) { - HashFunction hashFunction = goodFastHash(32); - Comparator> hashComparator = Comparator - ., Integer>comparing(Map.Entry::getValue) + HashFunction hashFunction = murmur3_128(); + Comparator> hashComparator = Comparator + ., Long>comparing(Map.Entry::getValue) .thenComparing(entry -> entry.getKey().getPartitionId()); partitions.stream() .filter(partition -> !result.contains(partition)) - .map(partition -> immutableEntry(partition, hashFunction.hashUnencodedChars(partition.getPartitionId()).asInt())) + .map(partition -> immutableEntry(partition, hashFunction.hashUnencodedChars(partition.getPartitionId()).asLong())) .sorted(hashComparator) .limit(samplesLeft) - .forEach(entry -> result.add(entry.getKey())); + .forEachOrdered(entry -> result.add(entry.getKey())); } return unmodifiableList(result); diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java b/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java index 79430787aedb2..6f13e49d698c8 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java @@ -29,7 +29,6 @@ import static com.facebook.presto.hive.statistics.MetastoreHiveStatisticsProvider.getPartitionsSample; import static com.facebook.presto.spi.type.BigintType.BIGINT; import static com.facebook.presto.spi.type.VarcharType.VARCHAR; -import static org.assertj.core.api.Assertions.assertThat; import static org.testng.Assert.assertEquals; public class TestMetastoreHiveStatisticsProvider @@ -54,8 +53,7 @@ public 
void testGetPartitionsSample() assertEquals(getPartitionsSample(ImmutableList.of(p1, p2, p3), 2), ImmutableList.of(p1, p3)); assertEquals(getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 1), getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 1)); assertEquals(getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 3), getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 3)); - assertThat(getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 3)).contains(p1, p4); - assertEquals(getPartitionsSample(ImmutableList.of(p1, p2, p3, p4, p5), 3).size(), 3); + assertEquals(getPartitionsSample(ImmutableList.of(p1, p2, p3, p4, p5), 3), ImmutableList.of(p1, p5, p4)); } private static HivePartition partition(String name) From 845a0d8a365626621a3165f9bb2c801b88ece839 Mon Sep 17 00:00:00 2001 From: Andrii Rosa Date: Wed, 19 Sep 2018 14:03:15 -0400 Subject: [PATCH 08/14] Print column statistics in deterministic manner Make SHOW STATS to print column statistics in the same order as they appear in the table. Also print rows with all nulls for columns with missing statistics. 
--- .../presto/sql/rewrite/ShowStatsRewrite.java | 60 ++++++++++--------- .../presto/tests/TestLocalQueries.java | 4 +- 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java index b08f0ac4173c1..bd94bbb9e037b 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java @@ -19,8 +19,10 @@ import com.facebook.presto.metadata.QualifiedObjectName; import com.facebook.presto.metadata.Signature; import com.facebook.presto.metadata.TableHandle; +import com.facebook.presto.metadata.TableMetadata; import com.facebook.presto.security.AccessControl; import com.facebook.presto.spi.ColumnHandle; +import com.facebook.presto.spi.ColumnMetadata; import com.facebook.presto.spi.Constraint; import com.facebook.presto.spi.statistics.ColumnStatistics; import com.facebook.presto.spi.statistics.Estimate; @@ -80,8 +82,6 @@ import static com.google.common.collect.ImmutableList.toImmutableList; import static java.util.Collections.singletonList; import static java.util.Objects.requireNonNull; -import static java.util.function.Function.identity; -import static java.util.stream.Collectors.toMap; public class ShowStatsRewrite implements StatementRewrite.Rewrite @@ -204,9 +204,9 @@ private Node rewriteShowStats(ShowStats node, Table table, Constraint statsColumnNames = buildColumnsNames(); List selectItems = buildSelectItems(statsColumnNames); - Map tableColumnNames = getStatisticsColumnNames(tableStatistics, tableHandle); - Map tableColumnTypes = getStatisticsColumnTypes(tableStatistics, tableHandle); - List resultRows = buildStatisticsRows(tableStatistics, tableColumnNames, tableColumnTypes); + TableMetadata tableMetadata = metadata.getTableMetadata(session, tableHandle); + Map columnHandles = 
metadata.getColumnHandles(session, tableHandle); + List resultRows = buildStatisticsRows(tableMetadata, columnHandles, tableStatistics); return simpleQuery(selectAll(selectItems), aliased(new Values(resultRows), @@ -246,20 +246,6 @@ private Constraint getConstraint(QuerySpecification specification) return new Constraint<>(scanNode.get().getCurrentConstraint()); } - private Map getStatisticsColumnNames(TableStatistics statistics, TableHandle tableHandle) - { - return statistics.getColumnStatistics() - .keySet().stream() - .collect(toMap(identity(), column -> metadata.getColumnMetadata(session, tableHandle, column).getName())); - } - - private Map getStatisticsColumnTypes(TableStatistics statistics, TableHandle tableHandle) - { - return statistics.getColumnStatistics() - .keySet().stream() - .collect(toMap(identity(), column -> metadata.getColumnMetadata(session, tableHandle, column).getType())); - } - private TableHandle getTableHandle(ShowStats node, QualifiedName table) { QualifiedObjectName qualifiedTableName = createQualifiedObjectName(session, node, table); @@ -287,19 +273,26 @@ private static List buildSelectItems(List columnNames) .collect(toImmutableList()); } - private List buildStatisticsRows(TableStatistics tableStatistics, Map columnNames, Map columnTypes) + private List buildStatisticsRows(TableMetadata tableMetadata, Map columnHandles, TableStatistics tableStatistics) { ImmutableList.Builder rowsBuilder = ImmutableList.builder(); - - // Stats for columns - for (Map.Entry columnStats : tableStatistics.getColumnStatistics().entrySet()) { - ColumnHandle columnHandle = columnStats.getKey(); - rowsBuilder.add(createColumnStatsRow(columnNames.get(columnHandle), columnTypes.get(columnHandle), columnStats.getValue())); + for (ColumnMetadata columnMetadata : tableMetadata.getColumns()) { + if (columnMetadata.isHidden()) { + continue; + } + String columnName = columnMetadata.getName(); + Type columnType = columnMetadata.getType(); + ColumnHandle columnHandle = 
columnHandles.get(columnName); + ColumnStatistics columnStatistics = tableStatistics.getColumnStatistics().get(columnHandle); + if (columnStatistics != null) { + rowsBuilder.add(createColumnStatsRow(columnName, columnType, columnStatistics)); + } + else { + rowsBuilder.add(createEmptyColumnStatsRow(columnName)); + } } - // Stats for whole table rowsBuilder.add(createTableStatsRow(tableStatistics)); - return rowsBuilder.build(); } @@ -316,6 +309,19 @@ private Row createColumnStatsRow(String columnName, Type type, ColumnStatistics return new Row(rowValues.build()); } + private Expression createEmptyColumnStatsRow(String columnName) + { + ImmutableList.Builder rowValues = ImmutableList.builder(); + rowValues.add(new StringLiteral(columnName)); + rowValues.add(NULL_DOUBLE); + rowValues.add(NULL_DOUBLE); + rowValues.add(NULL_DOUBLE); + rowValues.add(NULL_DOUBLE); + rowValues.add(NULL_VARCHAR); + rowValues.add(NULL_VARCHAR); + return new Row(rowValues.build()); + } + private static Row createTableStatsRow(TableStatistics tableStatistics) { ImmutableList.Builder rowValues = ImmutableList.builder(); diff --git a/presto-tests/src/test/java/com/facebook/presto/tests/TestLocalQueries.java b/presto-tests/src/test/java/com/facebook/presto/tests/TestLocalQueries.java index 5189f79fd9bc1..4f8f031a0d29a 100644 --- a/presto-tests/src/test/java/com/facebook/presto/tests/TestLocalQueries.java +++ b/presto-tests/src/test/java/com/facebook/presto/tests/TestLocalQueries.java @@ -87,10 +87,10 @@ public void testShowColumnStats() MaterializedResult expectedStatistics = resultBuilder(getSession(), VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) - .row("regionkey", null, 5.0, 0.0, null, "0", "4") + .row("nationkey", null, 25.0, 0.0, null, "0", "24") .row("name", 177.0, 25.0, 0.0, null, "ALGERIA", "VIETNAM") + .row("regionkey", null, 5.0, 0.0, null, "0", "4") .row("comment", 1857.0, 25.0, 0.0, null, " haggle. carefully final deposit...", "y final packages. 
slow foxes caj...") - .row("nationkey", null, 25.0, 0.0, null, "0", "24") .row(null, null, null, null, 25.0, null, null) .build(); From 9229cda96459b7bae57940117ce939ba37321663 Mon Sep 17 00:00:00 2001 From: Andrii Rosa Date: Wed, 19 Sep 2018 14:03:17 -0400 Subject: [PATCH 09/14] Remove validation from HiveBasicStatistics This class must be able to store exactly what is stored in metastore. Sanity checks must be applied explicitly in MetastoreHiveStatisticsProvider. --- .../java/com/facebook/presto/hive/HiveBasicStatistics.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveBasicStatistics.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveBasicStatistics.java index 941b2ca271bc6..82192c40ce6bc 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveBasicStatistics.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveBasicStatistics.java @@ -17,7 +17,6 @@ import java.util.OptionalLong; import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkArgument; import static java.util.Objects.requireNonNull; public class HiveBasicStatistics @@ -49,13 +48,9 @@ public HiveBasicStatistics( OptionalLong onDiskDataSizeInBytes) { this.fileCount = requireNonNull(fileCount, "fileCount is null"); - fileCount.ifPresent(count -> checkArgument(count >= 0, "fileCount is negative: %d", count)); this.rowCount = requireNonNull(rowCount, "rowCount is null"); - rowCount.ifPresent(count -> checkArgument(count >= 0, "rowCount is negative: %d", count)); this.inMemoryDataSizeInBytes = requireNonNull(inMemoryDataSizeInBytes, "inMemoryDataSizeInBytes is null"); - inMemoryDataSizeInBytes.ifPresent(size -> checkArgument(size >= 0, "inMemoryDataSizeInBytes is negative: %d", size)); this.onDiskDataSizeInBytes = requireNonNull(onDiskDataSizeInBytes, "onDiskDataSizeInBytes is null"); - onDiskDataSizeInBytes.ifPresent(size -> checkArgument(size 
>= 0, "onDiskDataSizeInBytes is negative: %d", size)); } public OptionalLong getFileCount() From 5b9a58fc62d41804baa100d8a244f0a2846cb537 Mon Sep 17 00:00:00 2001 From: Andrii Rosa Date: Wed, 19 Sep 2018 14:03:18 -0400 Subject: [PATCH 10/14] Remove MIN/MAX statistics support for TIMESTAMP in Hive Connector MIN and MAX for TIMESTAMP column is not used by the optimizer. Having it in the Hive connector is misleading. --- .../MetastoreHiveStatisticsProvider.java | 4 +--- .../hive/TestHiveIntegrationSmokeTest.java | 8 ++++---- .../tests/hive/TestHiveTableStatistics.java | 20 +++++++++---------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java index 23fd090743c05..a8c7c8feb7953 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java @@ -65,7 +65,6 @@ import static com.facebook.presto.spi.type.SmallintType.SMALLINT; import static com.facebook.presto.spi.type.StandardTypes.CHAR; import static com.facebook.presto.spi.type.StandardTypes.VARCHAR; -import static com.facebook.presto.spi.type.TimestampType.TIMESTAMP; import static com.facebook.presto.spi.type.TinyintType.TINYINT; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Verify.verify; @@ -187,8 +186,7 @@ private boolean isLowHighSupportedForType(Type type) || type.equals(BIGINT) || type.equals(REAL) || type.equals(DOUBLE) - || type.equals(DATE) - || type.equals(TIMESTAMP)) { + || type.equals(DATE)) { return true; } return false; diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveIntegrationSmokeTest.java 
b/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveIntegrationSmokeTest.java index 404eeaaa0ee8b..ce1b17166025d 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveIntegrationSmokeTest.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveIntegrationSmokeTest.java @@ -2702,7 +2702,7 @@ public void testCollectColumnStatisticsOnCreateTable() "('c_boolean', null, 2.0E0, 0.5E0, null, null, null), " + "('c_bigint', null, 2.0E0, 0.5E0, null, '0', '1'), " + "('c_double', null, 2.0E0, 0.5E0, null, '1.2', '2.2'), " + - "('c_timestamp', null, 2.0E0, 0.5E0, null, '2012-08-08 00:00:00.000', '2012-08-08 01:00:00.000'), " + + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null), " + "('c_varchar', 8.0E0, 2.0E0, 0.5E0, null, null, null), " + "('c_varbinary', 8.0E0, null, 0.5E0, null, null, null), " + "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null), " + @@ -2712,7 +2712,7 @@ public void testCollectColumnStatisticsOnCreateTable() "('c_boolean', null, 2.0E0, 0.5E0, null, null, null), " + "('c_bigint', null, 2.0E0, 0.5E0, null, '1', '2'), " + "('c_double', null, 2.0E0, 0.5E0, null, '2.3', '3.3'), " + - "('c_timestamp', null, 2.0E0, 0.5E0, null, '2012-09-09 00:00:00.000', '2012-09-09 01:00:00.000'), " + + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null), " + "('c_varchar', 8.0E0, 2.0E0, 0.5E0, null, null, null), " + "('c_varbinary', 8.0E0, null, 0.5E0, null, null, null), " + "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null), " + @@ -2759,7 +2759,7 @@ public void testCollectColumnStatisticsOnInsert() "('c_boolean', null, 2.0E0, 0.5E0, null, null, null), " + "('c_bigint', null, 2.0E0, 0.5E0, null, '0', '1'), " + "('c_double', null, 2.0E0, 0.5E0, null, '1.2', '2.2'), " + - "('c_timestamp', null, 2.0E0, 0.5E0, null, '2012-08-08 00:00:00.000', '2012-08-08 01:00:00.000'), " + + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null), " + "('c_varchar', 8.0E0, 2.0E0, 0.5E0, null, null, null), " + "('c_varbinary', 8.0E0, null, 
0.5E0, null, null, null), " + "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null), " + @@ -2769,7 +2769,7 @@ public void testCollectColumnStatisticsOnInsert() "('c_boolean', null, 2.0E0, 0.5E0, null, null, null), " + "('c_bigint', null, 2.0E0, 0.5E0, null, '1', '2'), " + "('c_double', null, 2.0E0, 0.5E0, null, '2.3', '3.3'), " + - "('c_timestamp', null, 2.0E0, 0.5E0, null, '2012-09-09 00:00:00.000', '2012-09-09 01:00:00.000'), " + + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null), " + "('c_varchar', 8.0E0, 2.0E0, 0.5E0, null, null, null), " + "('c_varbinary', 8.0E0, null, 0.5E0, null, null, null), " + "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null), " + diff --git a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java index 856db802dcfb7..c7dd8f47c9a99 100644 --- a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java +++ b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java @@ -114,7 +114,7 @@ public Requirement getRequirements(Configuration configuration) row("c_double", null, 2.0, 0.0, null, "234.561", "235.567"), row("c_decimal", null, 2.0, 0.0, null, "345", "346"), row("c_decimal_w_params", null, 2.0, 0.0, null, "345.67100", "345.67800"), - row("c_timestamp", null, 2.0, 0.0, null, "2015-05-10 12:15:31.000", "2015-05-10 12:15:35.000"), + row("c_timestamp", null, 2.0, 0.0, null, null, null), row("c_date", null, 2.0, 0.0, null, "2015-05-09", "2015-06-10"), row("c_string", 22.0, 2.0, 0.0, null, null, null), row("c_varchar", 20.0, 2.0, 0.0, null, null, null), @@ -505,7 +505,7 @@ public void testStatisticsForAllDataTypes() row("c_double", null, 2.0, 0.0, null, "234.561", "235.567"), row("c_decimal", null, 2.0, 0.0, null, "345", "346"), row("c_decimal_w_params", null, 2.0, 0.0, null, "345.67100", "345.67800"), - row("c_timestamp", 
null, 2.0, 0.0, null, "2015-05-10 06:30:31.000", "2015-05-10 06:30:35.000"), // timestamp is shifted by hive.time-zone on read + row("c_timestamp", null, 2.0, 0.0, null, null, null), row("c_date", null, 2.0, 0.0, null, "2015-05-09", "2015-06-10"), row("c_string", 22.0, 2.0, 0.0, null, null, null), row("c_varchar", 20.0, 2.0, 0.0, null, null, null), @@ -651,7 +651,7 @@ public void testComputeTableStatisticsOnInsert() row("c_double", null, 2.0, 0.5, null, "234.561", "235.567"), row("c_decimal", null, 2.0, 0.5, null, "345", "346"), row("c_decimal_w_params", null, 2.0, 0.5, null, "345.67100", "345.67800"), - row("c_timestamp", null, 2.0, 0.5, null, "2015-05-10 12:15:31.000", "2015-05-10 12:15:35.000"), + row("c_timestamp", null, 2.0, 0.5, null, null, null), row("c_date", null, 2.0, 0.5, null, "2015-05-09", "2015-06-10"), row("c_string", 22.0, 2.0, 0.5, null, null, null), row("c_varchar", 20.0, 2.0, 0.5, null, null, null), @@ -686,7 +686,7 @@ public void testComputeTableStatisticsOnInsert() row("c_double", null, 2.0, 0.4, null, "234.56", "235.567"), row("c_decimal", null, 2.0, 0.4, null, "343", "346"), row("c_decimal_w_params", null, 2.0, 0.4, null, "345.67000", "345.67800"), - row("c_timestamp", null, 2.0, 0.4, null, "2015-05-10 12:15:30.000", "2015-05-10 12:15:35.000"), + row("c_timestamp", null, 2.0, 0.4, null, null, null), row("c_date", null, 2.0, 0.4, null, "2015-05-08", "2015-06-10"), row("c_string", 32.0, 2.0, 0.4, null, null, null), row("c_varchar", 29.0, 2.0, 0.4, null, null, null), @@ -762,7 +762,7 @@ public void testComputePartitionStatisticsOnCreateTable() row("c_double", null, 1.0, 0.5, null, "234.56", "234.56"), row("c_decimal", null, 1.0, 0.5, null, "343", "343"), row("c_decimal_w_params", null, 1.0, 0.5, null, "345.67000", "345.67000"), - row("c_timestamp", null, 1.0, 0.5, null, "2015-05-10 12:15:30.000", "2015-05-10 12:15:30.000"), + row("c_timestamp", null, 1.0, 0.5, null, null, null), row("c_date", null, 1.0, 0.5, null, "2015-05-08", "2015-05-08"), 
row("c_string", 10.0, 1.0, 0.5, null, null, null), row("c_varchar", 10.0, 1.0, 0.5, null, null, null), @@ -782,7 +782,7 @@ public void testComputePartitionStatisticsOnCreateTable() row("c_double", null, 1.0, 0.5, null, "777.56", "777.56"), row("c_decimal", null, 1.0, 0.5, null, "888", "888"), row("c_decimal_w_params", null, 1.0, 0.5, null, "999.67000", "999.67000"), - row("c_timestamp", null, 1.0, 0.5, null, "2015-05-10 12:45:30.000", "2015-05-10 12:45:30.000"), + row("c_timestamp", null, 1.0, 0.5, null, null, null), row("c_date", null, 1.0, 0.5, null, "2015-05-09", "2015-05-09"), row("c_string", 10.0, 1.0, 0.5, null, null, null), row("c_varchar", 10.0, 1.0, 0.5, null, null, null), @@ -848,7 +848,7 @@ public void testComputePartitionStatisticsOnInsert() row("c_double", null, 1.0, 0.5, null, "234.56", "234.56"), row("c_decimal", null, 1.0, 0.5, null, "343", "343"), row("c_decimal_w_params", null, 1.0, 0.5, null, "345.67000", "345.67000"), - row("c_timestamp", null, 1.0, 0.5, null, "2015-05-10 12:15:30.000", "2015-05-10 12:15:30.000"), + row("c_timestamp", null, 1.0, 0.5, null, null, null), row("c_date", null, 1.0, 0.5, null, "2015-05-08", "2015-05-08"), row("c_string", 10.0, 1.0, 0.5, null, null, null), row("c_varchar", 10.0, 1.0, 0.5, null, null, null), @@ -868,7 +868,7 @@ public void testComputePartitionStatisticsOnInsert() row("c_double", null, 1.0, 0.5, null, "777.56", "777.56"), row("c_decimal", null, 1.0, 0.5, null, "888", "888"), row("c_decimal_w_params", null, 1.0, 0.5, null, "999.67000", "999.67000"), - row("c_timestamp", null, 1.0, 0.5, null, "2015-05-10 12:45:30.000", "2015-05-10 12:45:30.000"), + row("c_timestamp", null, 1.0, 0.5, null, null, null), row("c_date", null, 1.0, 0.5, null, "2015-05-09", "2015-05-09"), row("c_string", 10.0, 1.0, 0.5, null, null, null), row("c_varchar", 10.0, 1.0, 0.5, null, null, null), @@ -891,7 +891,7 @@ public void testComputePartitionStatisticsOnInsert() row("c_double", null, 1.0, 0.5, null, "233.56", "234.56"), 
row("c_decimal", null, 1.0, 0.5, null, "342", "343"), row("c_decimal_w_params", null, 1.0, 0.5, null, "344.67000", "345.67000"), - row("c_timestamp", null, 1.0, 0.5, null, "2015-05-10 12:15:29.000", "2015-05-10 12:15:30.000"), + row("c_timestamp", null, 1.0, 0.5, null, null, null), row("c_date", null, 1.0, 0.5, null, "2015-05-07", "2015-05-08"), row("c_string", 20.0, 1.0, 0.5, null, null, null), row("c_varchar", 20.0, 1.0, 0.5, null, null, null), @@ -914,7 +914,7 @@ public void testComputePartitionStatisticsOnInsert() row("c_double", null, 1.0, 0.5, null, "777.56", "778.56"), row("c_decimal", null, 1.0, 0.5, null, "888", "889"), row("c_decimal_w_params", null, 1.0, 0.5, null, "999.67000", "1000.67000"), - row("c_timestamp", null, 1.0, 0.5, null, "2015-05-10 12:45:30.000", "2015-05-10 12:45:31.000"), + row("c_timestamp", null, 1.0, 0.5, null, null, null), row("c_date", null, 1.0, 0.5, null, "2015-05-09", "2015-05-10"), row("c_string", 20.0, 1.0, 0.5, null, null, null), row("c_varchar", 20.0, 1.0, 0.5, null, null, null), From ea3a81417b6d2ba07e6a276564bdc197f03293d5 Mon Sep 17 00:00:00 2001 From: Andrii Rosa Date: Wed, 19 Sep 2018 14:03:19 -0400 Subject: [PATCH 11/14] Represent min and max statistics in SPI as double Min and max for types other than numeric were simply ignored by the optimizer, although SHOW STATS used to print min and max statistics for strings. Since the min and max are now represented as double, the SHOW STATS command will no longer print these statistics, better representing the statistics that the optimizer actually takes into account.
--- .../MetastoreHiveStatisticsProvider.java | 45 ++++++++++- .../hive/TestHiveIntegrationSmokeTest.java | 8 +- .../presto/cost/TableScanStatsRule.java | 33 +++----- .../presto/sql/rewrite/ShowStatsRewrite.java | 68 +++++++++------- .../tests/hive/TestHiveTableStatistics.java | 56 ++++++------- presto-spi/pom.xml | 6 ++ .../spi/statistics/ColumnStatistics.java | 39 ++++----- .../presto/spi/statistics/DoubleRange.java | 79 +++++++++++++++++++ .../spi/statistics/TestDoubleRange.java | 66 ++++++++++++++++ .../presto/tests/TestLocalQueries.java | 4 +- .../TpcdsTableStatisticsFactory.java | 50 +++++++----- .../tpcds/TestTpcdsMetadataStatistics.java | 35 ++------ .../facebook/presto/tpch/TpchMetadata.java | 24 ++++-- .../presto/tpch/EstimateAssertion.java | 7 ++ .../presto/tpch/TestTpchMetadata.java | 48 +++++------ 15 files changed, 374 insertions(+), 194 deletions(-) create mode 100644 presto-spi/src/main/java/com/facebook/presto/spi/statistics/DoubleRange.java create mode 100644 presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDoubleRange.java diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java index a8c7c8feb7953..7318b53f3d2e7 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java @@ -28,9 +28,11 @@ import com.facebook.presto.spi.block.Block; import com.facebook.presto.spi.predicate.NullableValue; import com.facebook.presto.spi.statistics.ColumnStatistics; +import com.facebook.presto.spi.statistics.DoubleRange; import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatistics; import com.facebook.presto.spi.type.DecimalType; +import com.facebook.presto.spi.type.Decimals; import 
com.facebook.presto.spi.type.Type; import com.facebook.presto.spi.type.TypeManager; import com.google.common.annotations.VisibleForTesting; @@ -59,6 +61,8 @@ import static com.facebook.presto.spi.predicate.Utils.nativeValueToBlock; import static com.facebook.presto.spi.type.BigintType.BIGINT; import static com.facebook.presto.spi.type.DateType.DATE; +import static com.facebook.presto.spi.type.Decimals.isLongDecimal; +import static com.facebook.presto.spi.type.Decimals.isShortDecimal; import static com.facebook.presto.spi.type.DoubleType.DOUBLE; import static com.facebook.presto.spi.type.IntegerType.INTEGER; import static com.facebook.presto.spi.type.RealType.REAL; @@ -72,6 +76,8 @@ import static com.google.common.collect.ImmutableSet.toImmutableSet; import static com.google.common.collect.Maps.immutableEntry; import static com.google.common.hash.Hashing.murmur3_128; +import static java.lang.Double.parseDouble; +import static java.lang.Float.intBitsToFloat; import static java.util.Collections.unmodifiableList; import static java.util.Objects.requireNonNull; @@ -165,8 +171,11 @@ public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTab Block rightBlock = nativeValueToBlock(prestoType, rightValue); return prestoType.compareTo(leftBlock, 0, rightBlock, 0); }; - columnStatistics.setLowValue(lowValueCandidates.stream().min(comparator)); - columnStatistics.setHighValue(highValueCandidates.stream().max(comparator)); + Optional min = lowValueCandidates.stream().min(comparator); + Optional max = highValueCandidates.stream().max(comparator); + if (min.isPresent() && max.isPresent()) { + columnStatistics.setRange(createPrestoRange(prestoType, min.get(), max.get())); + } columnStatistics.setDataSize(dataSize); columnStatistics.setNullsFraction(nullsFraction); @@ -192,6 +201,38 @@ private boolean isLowHighSupportedForType(Type type) return false; } + public static DoubleRange createPrestoRange(Type type, Object min, Object max) + { + return new 
DoubleRange(convertPrestoValueToStatsRepresentation(type, min), convertPrestoValueToStatsRepresentation(type, max)); + } + + private static double convertPrestoValueToStatsRepresentation(Type type, Object value) + { + if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT)) { + return (Long) value; + } + if (type.equals(DOUBLE)) { + return (Double) value; + } + if (type.equals(REAL)) { + return intBitsToFloat(((Long) value).intValue()); + } + if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType) type; + if (isShortDecimal(decimalType)) { + return parseDouble(Decimals.toString((Long) value, decimalType.getScale())); + } + if (isLongDecimal(decimalType)) { + return parseDouble(Decimals.toString((Slice) value, decimalType.getScale())); + } + throw new IllegalArgumentException("Unexpected decimal type: " + decimalType); + } + if (type.equals(DATE)) { + return (Long) value; + } + throw new IllegalArgumentException("Unsupported type: " + type); + } + private OptionalDouble calculateRowsPerPartition(Map statisticsSample) { return statisticsSample.values().stream() diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveIntegrationSmokeTest.java b/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveIntegrationSmokeTest.java index ce1b17166025d..0e71362113d90 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveIntegrationSmokeTest.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveIntegrationSmokeTest.java @@ -2799,8 +2799,8 @@ public void testInsertMultipleColumnsFromSameChannel() assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar_1 = '2' AND p_varchar_2 = '2')", tableName), "SELECT * FROM VALUES " + - "('c_bigint_1', null, 1.0E0, 0.0E0, null, 1, 1), " + - "('c_bigint_2', null, 1.0E0, 0.0E0, null, 1, 1), " + + "('c_bigint_1', null, 1.0E0, 0.0E0, null, '1', '1'), " + + "('c_bigint_2', null, 1.0E0, 0.0E0, null, '1', '1'), " + 
"('p_varchar_1', 1.0E0, 1.0E0, 0.0E0, null, null, null), " + "('p_varchar_2', 1.0E0, 1.0E0, 0.0E0, null, null, null), " + "(null, null, null, null, 1.0E0, null, null)"); @@ -2813,8 +2813,8 @@ public void testInsertMultipleColumnsFromSameChannel() assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar_1 = 'O' AND p_varchar_2 = 'O')", tableName), "SELECT * FROM VALUES " + - "('c_bigint_1', null, 1.0E0, 0.0E0, null, 15008, 15008), " + - "('c_bigint_2', null, 1.0E0, 0.0E0, null, 15008, 15008), " + + "('c_bigint_1', null, 1.0E0, 0.0E0, null, '15008', '15008'), " + + "('c_bigint_2', null, 1.0E0, 0.0E0, null, '15008', '15008'), " + "('p_varchar_1', 1.0E0, 1.0E0, 0.0E0, null, null, null), " + "('p_varchar_2', 1.0E0, 1.0E0, 0.0E0, null, null, null), " + "(null, null, null, null, 1.0E0, null, null)"); diff --git a/presto-main/src/main/java/com/facebook/presto/cost/TableScanStatsRule.java b/presto-main/src/main/java/com/facebook/presto/cost/TableScanStatsRule.java index 1fdf3fc4d923a..19c7faf74af90 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/TableScanStatsRule.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/TableScanStatsRule.java @@ -20,7 +20,6 @@ import com.facebook.presto.spi.Constraint; import com.facebook.presto.spi.statistics.ColumnStatistics; import com.facebook.presto.spi.statistics.TableStatistics; -import com.facebook.presto.spi.type.Type; import com.facebook.presto.sql.planner.Symbol; import com.facebook.presto.sql.planner.TypeProvider; import com.facebook.presto.sql.planner.iterative.Lookup; @@ -29,13 +28,9 @@ import java.util.HashMap; import java.util.Map; import java.util.Optional; -import java.util.OptionalDouble; -import static com.facebook.presto.cost.StatsUtil.toStatsRepresentation; import static com.facebook.presto.cost.SymbolStatsEstimate.UNKNOWN_STATS; import static com.facebook.presto.sql.planner.plan.Patterns.tableScan; -import static java.lang.Double.NEGATIVE_INFINITY; -import static 
java.lang.Double.POSITIVE_INFINITY; import static java.util.Objects.requireNonNull; public class TableScanStatsRule @@ -68,9 +63,8 @@ protected Optional doCalculate(TableScanNode node, StatsP for (Map.Entry entry : node.getAssignments().entrySet()) { Symbol symbol = entry.getKey(); - Type symbolType = types.get(symbol); Optional columnStatistics = Optional.ofNullable(tableStatistics.getColumnStatistics().get(entry.getValue())); - outputSymbolStats.put(symbol, columnStatistics.map(statistics -> toSymbolStatistics(tableStatistics, statistics, session, symbolType)).orElse(UNKNOWN_STATS)); + outputSymbolStats.put(symbol, columnStatistics.map(statistics -> toSymbolStatistics(tableStatistics, statistics)).orElse(UNKNOWN_STATS)); } return Optional.of(PlanNodeStatsEstimate.builder() @@ -79,24 +73,19 @@ protected Optional doCalculate(TableScanNode node, StatsP .build()); } - private SymbolStatsEstimate toSymbolStatistics(TableStatistics tableStatistics, ColumnStatistics columnStatistics, Session session, Type type) + private SymbolStatsEstimate toSymbolStatistics(TableStatistics tableStatistics, ColumnStatistics columnStatistics) { double nullsFraction = columnStatistics.getNullsFraction().getValue(); double nonNullRowsCount = tableStatistics.getRowCount().getValue() * (1.0 - nullsFraction); double averageRowSize = nonNullRowsCount == 0 ? 
0 : columnStatistics.getDataSize().getValue() / nonNullRowsCount; - return SymbolStatsEstimate.builder() - .setLowValue(asDouble(session, type, columnStatistics.getLowValue()).orElse(NEGATIVE_INFINITY)) - .setHighValue(asDouble(session, type, columnStatistics.getHighValue()).orElse(POSITIVE_INFINITY)) - .setNullsFraction(nullsFraction) - .setDistinctValuesCount(columnStatistics.getDistinctValuesCount().getValue()) - .setAverageRowSize(averageRowSize) - .build(); - } - - private OptionalDouble asDouble(Session session, Type type, Optional optionalValue) - { - return optionalValue - .map(value -> toStatsRepresentation(metadata, session, type, value)) - .orElseGet(OptionalDouble::empty); + SymbolStatsEstimate.Builder result = SymbolStatsEstimate.builder(); + result.setNullsFraction(nullsFraction); + result.setDistinctValuesCount(columnStatistics.getDistinctValuesCount().getValue()); + result.setAverageRowSize(averageRowSize); + columnStatistics.getRange().ifPresent(range -> { + result.setLowValue(range.getMin()); + result.setHighValue(range.getMax()); + }); + return result.build(); } } diff --git a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java index bd94bbb9e037b..7da3d875a2468 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java @@ -14,10 +14,8 @@ package com.facebook.presto.sql.rewrite; import com.facebook.presto.Session; -import com.facebook.presto.metadata.FunctionRegistry; import com.facebook.presto.metadata.Metadata; import com.facebook.presto.metadata.QualifiedObjectName; -import com.facebook.presto.metadata.Signature; import com.facebook.presto.metadata.TableHandle; import com.facebook.presto.metadata.TableMetadata; import com.facebook.presto.security.AccessControl; @@ -25,11 +23,17 @@ import 
com.facebook.presto.spi.ColumnMetadata; import com.facebook.presto.spi.Constraint; import com.facebook.presto.spi.statistics.ColumnStatistics; +import com.facebook.presto.spi.statistics.DoubleRange; import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatistics; +import com.facebook.presto.spi.type.BigintType; +import com.facebook.presto.spi.type.DecimalType; +import com.facebook.presto.spi.type.DoubleType; +import com.facebook.presto.spi.type.IntegerType; +import com.facebook.presto.spi.type.RealType; +import com.facebook.presto.spi.type.SmallintType; +import com.facebook.presto.spi.type.TinyintType; import com.facebook.presto.spi.type.Type; -import com.facebook.presto.spi.type.VarcharType; -import com.facebook.presto.sql.InterpretedFunctionInvoker; import com.facebook.presto.sql.QueryUtil; import com.facebook.presto.sql.analyzer.QueryExplainer; import com.facebook.presto.sql.analyzer.SemanticException; @@ -63,13 +67,14 @@ import com.facebook.presto.sql.tree.TableSubquery; import com.facebook.presto.sql.tree.Values; import com.google.common.collect.ImmutableList; -import io.airlift.slice.Slice; +import java.time.LocalDate; import java.util.List; import java.util.Map; import java.util.Optional; import static com.facebook.presto.metadata.MetadataUtil.createQualifiedObjectName; +import static com.facebook.presto.spi.type.DateType.DATE; import static com.facebook.presto.spi.type.StandardTypes.DOUBLE; import static com.facebook.presto.spi.type.StandardTypes.VARCHAR; import static com.facebook.presto.sql.QueryUtil.aliased; @@ -80,7 +85,7 @@ import static com.facebook.presto.sql.planner.optimizations.PlanNodeSearcher.searchFrom; import static com.google.common.base.Preconditions.checkState; import static com.google.common.collect.ImmutableList.toImmutableList; -import static java.util.Collections.singletonList; +import static java.lang.Math.round; import static java.util.Objects.requireNonNull; public class ShowStatsRewrite 
@@ -91,7 +96,6 @@ public class ShowStatsRewrite private static final Expression NULL_DOUBLE = new Cast(new NullLiteral(), DOUBLE); private static final Expression NULL_VARCHAR = new Cast(new NullLiteral(), VARCHAR); - private static final int MAX_LOW_HIGH_LENGTH = 32; @Override public Statement rewrite(Session session, Metadata metadata, SqlParser parser, Optional queryExplainer, Statement node, List parameters, AccessControl accessControl) @@ -300,12 +304,12 @@ private Row createColumnStatsRow(String columnName, Type type, ColumnStatistics { ImmutableList.Builder rowValues = ImmutableList.builder(); rowValues.add(new StringLiteral(columnName)); - rowValues.add(createStatisticValueOrNull(columnStatistics.getDataSize())); - rowValues.add(createStatisticValueOrNull(columnStatistics.getDistinctValuesCount())); - rowValues.add(createStatisticValueOrNull(columnStatistics.getNullsFraction())); + rowValues.add(createEstimateRepresentation(columnStatistics.getDataSize())); + rowValues.add(createEstimateRepresentation(columnStatistics.getDistinctValuesCount())); + rowValues.add(createEstimateRepresentation(columnStatistics.getNullsFraction())); rowValues.add(NULL_DOUBLE); - rowValues.add(lowHighAsLiteral(type, columnStatistics.getLowValue())); - rowValues.add(lowHighAsLiteral(type, columnStatistics.getHighValue())); + rowValues.add(toStringLiteral(type, columnStatistics.getRange().map(DoubleRange::getMin))); + rowValues.add(toStringLiteral(type, columnStatistics.getRange().map(DoubleRange::getMax))); return new Row(rowValues.build()); } @@ -329,34 +333,40 @@ private static Row createTableStatsRow(TableStatistics tableStatistics) rowValues.add(NULL_DOUBLE); rowValues.add(NULL_DOUBLE); rowValues.add(NULL_DOUBLE); - rowValues.add(createStatisticValueOrNull(tableStatistics.getRowCount())); + rowValues.add(createEstimateRepresentation(tableStatistics.getRowCount())); rowValues.add(NULL_VARCHAR); rowValues.add(NULL_VARCHAR); return new Row(rowValues.build()); } - private 
Expression lowHighAsLiteral(Type valueType, Optional value) + private static Expression createEstimateRepresentation(Estimate estimate) { - if (!value.isPresent()) { - return new Cast(new NullLiteral(), VARCHAR); - } - FunctionRegistry functionRegistry = metadata.getFunctionRegistry(); - InterpretedFunctionInvoker functionInvoker = new InterpretedFunctionInvoker(functionRegistry); - Signature castSignature = functionRegistry.getCoercion(valueType, VarcharType.createUnboundedVarcharType()); - Slice varcharValue = (Slice) functionInvoker.invoke(castSignature, session.toConnectorSession(), singletonList(value.get())); - String stringValue = varcharValue.toStringUtf8(); - if (stringValue.length() > MAX_LOW_HIGH_LENGTH) { - stringValue = stringValue.substring(0, MAX_LOW_HIGH_LENGTH) + "..."; + if (estimate.isUnknown()) { + return NULL_DOUBLE; } - return new StringLiteral(stringValue); + return new DoubleLiteral(Double.toString(estimate.getValue())); } - private static Expression createStatisticValueOrNull(Estimate estimate) + private static Expression toStringLiteral(Type type, Optional optionalValue) { - if (estimate.isUnknown()) { - return NULL_DOUBLE; + return optionalValue.map(value -> toStringLiteral(type, value)).orElse(NULL_VARCHAR); + } + + private static Expression toStringLiteral(Type type, double value) + { + if (type.equals(BigintType.BIGINT) || type.equals(IntegerType.INTEGER) || type.equals(SmallintType.SMALLINT) || type.equals(TinyintType.TINYINT)) { + return new StringLiteral(Long.toString(round(value))); } - return new DoubleLiteral(Double.toString(estimate.getValue())); + if (type.equals(DoubleType.DOUBLE) || type instanceof DecimalType) { + return new StringLiteral(Double.toString(value)); + } + if (type.equals(RealType.REAL)) { + return new StringLiteral(Float.toString((float) value)); + } + if (type.equals(DATE)) { + return new StringLiteral(LocalDate.ofEpochDay(round(value)).toString()); + } + throw new IllegalArgumentException("Unexpected type: " 
+ type); } } } diff --git a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java index c7dd8f47c9a99..6e1e4702d850b 100644 --- a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java +++ b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java @@ -109,11 +109,11 @@ public Requirement getRequirements(Configuration configuration) row("c_tinyint", null, 2.0, 0.0, null, "121", "127"), row("c_smallint", null, 2.0, 0.0, null, "32761", "32767"), row("c_int", null, 2.0, 0.0, null, "2147483641", "2147483647"), - row("c_bigint", null, 2.0, 0.0, null, "9223372036854775801", "9223372036854775807"), + row("c_bigint", null, 2.0, 0.0, null, "9223372036854775807", "9223372036854775807"), row("c_float", null, 2.0, 0.0, null, "123.341", "123.345"), row("c_double", null, 2.0, 0.0, null, "234.561", "235.567"), - row("c_decimal", null, 2.0, 0.0, null, "345", "346"), - row("c_decimal_w_params", null, 2.0, 0.0, null, "345.67100", "345.67800"), + row("c_decimal", null, 2.0, 0.0, null, "345.0", "346.0"), + row("c_decimal_w_params", null, 2.0, 0.0, null, "345.671", "345.678"), row("c_timestamp", null, 2.0, 0.0, null, null, null), row("c_date", null, 2.0, 0.0, null, "2015-05-09", "2015-06-10"), row("c_string", 22.0, 2.0, 0.0, null, null, null), @@ -500,12 +500,12 @@ public void testStatisticsForAllDataTypes() row("c_tinyint", null, 2.0, 0.0, null, "121", "127"), row("c_smallint", null, 2.0, 0.0, null, "32761", "32767"), row("c_int", null, 2.0, 0.0, null, "2147483641", "2147483647"), - row("c_bigint", null, 2.0, 0.0, null, "9223372036854775801", "9223372036854775807"), + row("c_bigint", null, 2.0, 0.0, null, "9223372036854775807", "9223372036854775807"), row("c_float", null, 2.0, 0.0, null, "123.341", "123.345"), row("c_double", null, 2.0, 0.0, null, "234.561", "235.567"), - 
row("c_decimal", null, 2.0, 0.0, null, "345", "346"), - row("c_decimal_w_params", null, 2.0, 0.0, null, "345.67100", "345.67800"), - row("c_timestamp", null, 2.0, 0.0, null, null, null), + row("c_decimal", null, 2.0, 0.0, null, "345.0", "346.0"), + row("c_decimal_w_params", null, 2.0, 0.0, null, "345.671", "345.678"), + row("c_timestamp", null, 2.0, 0.0, null, null, null), // timestamp is shifted by hive.time-zone on read row("c_date", null, 2.0, 0.0, null, "2015-05-09", "2015-06-10"), row("c_string", 22.0, 2.0, 0.0, null, null, null), row("c_varchar", 20.0, 2.0, 0.0, null, null, null), @@ -646,11 +646,11 @@ public void testComputeTableStatisticsOnInsert() row("c_tinyint", null, 2.0, 0.5, null, "121", "127"), row("c_smallint", null, 2.0, 0.5, null, "32761", "32767"), row("c_int", null, 2.0, 0.5, null, "2147483641", "2147483647"), - row("c_bigint", null, 2.0, 0.5, null, "9223372036854775801", "9223372036854775807"), + row("c_bigint", null, 2.0, 0.5, null, "9223372036854775807", "9223372036854775807"), row("c_float", null, 2.0, 0.5, null, "123.341", "123.345"), row("c_double", null, 2.0, 0.5, null, "234.561", "235.567"), - row("c_decimal", null, 2.0, 0.5, null, "345", "346"), - row("c_decimal_w_params", null, 2.0, 0.5, null, "345.67100", "345.67800"), + row("c_decimal", null, 2.0, 0.5, null, "345.0", "346.0"), + row("c_decimal_w_params", null, 2.0, 0.5, null, "345.671", "345.678"), row("c_timestamp", null, 2.0, 0.5, null, null, null), row("c_date", null, 2.0, 0.5, null, "2015-05-09", "2015-06-10"), row("c_string", 22.0, 2.0, 0.5, null, null, null), @@ -681,11 +681,11 @@ public void testComputeTableStatisticsOnInsert() row("c_tinyint", null, 2.0, 0.4, null, "120", "127"), row("c_smallint", null, 2.0, 0.4, null, "32760", "32767"), row("c_int", null, 2.0, 0.4, null, "2147483640", "2147483647"), - row("c_bigint", null, 2.0, 0.4, null, "9223372036854775800", "9223372036854775807"), + row("c_bigint", null, 2.0, 0.4, null, "9223372036854775807", "9223372036854775807"), 
row("c_float", null, 2.0, 0.4, null, "123.34", "123.345"), row("c_double", null, 2.0, 0.4, null, "234.56", "235.567"), - row("c_decimal", null, 2.0, 0.4, null, "343", "346"), - row("c_decimal_w_params", null, 2.0, 0.4, null, "345.67000", "345.67800"), + row("c_decimal", null, 2.0, 0.4, null, "343.0", "346.0"), + row("c_decimal_w_params", null, 2.0, 0.4, null, "345.67", "345.678"), row("c_timestamp", null, 2.0, 0.4, null, null, null), row("c_date", null, 2.0, 0.4, null, "2015-05-08", "2015-06-10"), row("c_string", 32.0, 2.0, 0.4, null, null, null), @@ -757,11 +757,11 @@ public void testComputePartitionStatisticsOnCreateTable() row("c_tinyint", null, 1.0, 0.5, null, "120", "120"), row("c_smallint", null, 1.0, 0.5, null, "32760", "32760"), row("c_int", null, 1.0, 0.5, null, "2147483640", "2147483640"), - row("c_bigint", null, 1.0, 0.5, null, "9223372036854775800", "9223372036854775800"), + row("c_bigint", null, 1.0, 0.5, null, "9223372036854775807", "9223372036854775807"), row("c_float", null, 1.0, 0.5, null, "123.34", "123.34"), row("c_double", null, 1.0, 0.5, null, "234.56", "234.56"), - row("c_decimal", null, 1.0, 0.5, null, "343", "343"), - row("c_decimal_w_params", null, 1.0, 0.5, null, "345.67000", "345.67000"), + row("c_decimal", null, 1.0, 0.5, null, "343.0", "343.0"), + row("c_decimal_w_params", null, 1.0, 0.5, null, "345.67", "345.67"), row("c_timestamp", null, 1.0, 0.5, null, null, null), row("c_date", null, 1.0, 0.5, null, "2015-05-08", "2015-05-08"), row("c_string", 10.0, 1.0, 0.5, null, null, null), @@ -780,8 +780,8 @@ public void testComputePartitionStatisticsOnCreateTable() row("c_bigint", null, 1.0, 0.5, null, "555", "555"), row("c_float", null, 1.0, 0.5, null, "666.34", "666.34"), row("c_double", null, 1.0, 0.5, null, "777.56", "777.56"), - row("c_decimal", null, 1.0, 0.5, null, "888", "888"), - row("c_decimal_w_params", null, 1.0, 0.5, null, "999.67000", "999.67000"), + row("c_decimal", null, 1.0, 0.5, null, "888.0", "888.0"), + 
row("c_decimal_w_params", null, 1.0, 0.5, null, "999.67", "999.67"), row("c_timestamp", null, 1.0, 0.5, null, null, null), row("c_date", null, 1.0, 0.5, null, "2015-05-09", "2015-05-09"), row("c_string", 10.0, 1.0, 0.5, null, null, null), @@ -843,11 +843,11 @@ public void testComputePartitionStatisticsOnInsert() row("c_tinyint", null, 1.0, 0.5, null, "120", "120"), row("c_smallint", null, 1.0, 0.5, null, "32760", "32760"), row("c_int", null, 1.0, 0.5, null, "2147483640", "2147483640"), - row("c_bigint", null, 1.0, 0.5, null, "9223372036854775800", "9223372036854775800"), + row("c_bigint", null, 1.0, 0.5, null, "9223372036854775807", "9223372036854775807"), row("c_float", null, 1.0, 0.5, null, "123.34", "123.34"), row("c_double", null, 1.0, 0.5, null, "234.56", "234.56"), - row("c_decimal", null, 1.0, 0.5, null, "343", "343"), - row("c_decimal_w_params", null, 1.0, 0.5, null, "345.67000", "345.67000"), + row("c_decimal", null, 1.0, 0.5, null, "343.0", "343.0"), + row("c_decimal_w_params", null, 1.0, 0.5, null, "345.67", "345.67"), row("c_timestamp", null, 1.0, 0.5, null, null, null), row("c_date", null, 1.0, 0.5, null, "2015-05-08", "2015-05-08"), row("c_string", 10.0, 1.0, 0.5, null, null, null), @@ -866,8 +866,8 @@ public void testComputePartitionStatisticsOnInsert() row("c_bigint", null, 1.0, 0.5, null, "555", "555"), row("c_float", null, 1.0, 0.5, null, "666.34", "666.34"), row("c_double", null, 1.0, 0.5, null, "777.56", "777.56"), - row("c_decimal", null, 1.0, 0.5, null, "888", "888"), - row("c_decimal_w_params", null, 1.0, 0.5, null, "999.67000", "999.67000"), + row("c_decimal", null, 1.0, 0.5, null, "888.0", "888.0"), + row("c_decimal_w_params", null, 1.0, 0.5, null, "999.67", "999.67"), row("c_timestamp", null, 1.0, 0.5, null, null, null), row("c_date", null, 1.0, 0.5, null, "2015-05-09", "2015-05-09"), row("c_string", 10.0, 1.0, 0.5, null, null, null), @@ -886,11 +886,11 @@ public void testComputePartitionStatisticsOnInsert() row("c_tinyint", null, 1.0, 
0.5, null, "119", "120"), row("c_smallint", null, 1.0, 0.5, null, "32759", "32760"), row("c_int", null, 1.0, 0.5, null, "2147483639", "2147483640"), - row("c_bigint", null, 1.0, 0.5, null, "9223372036854775799", "9223372036854775800"), + row("c_bigint", null, 1.0, 0.5, null, "9223372036854775807", "9223372036854775807"), row("c_float", null, 1.0, 0.5, null, "122.34", "123.34"), row("c_double", null, 1.0, 0.5, null, "233.56", "234.56"), - row("c_decimal", null, 1.0, 0.5, null, "342", "343"), - row("c_decimal_w_params", null, 1.0, 0.5, null, "344.67000", "345.67000"), + row("c_decimal", null, 1.0, 0.5, null, "342.0", "343.0"), + row("c_decimal_w_params", null, 1.0, 0.5, null, "344.67", "345.67"), row("c_timestamp", null, 1.0, 0.5, null, null, null), row("c_date", null, 1.0, 0.5, null, "2015-05-07", "2015-05-08"), row("c_string", 20.0, 1.0, 0.5, null, null, null), @@ -912,8 +912,8 @@ public void testComputePartitionStatisticsOnInsert() row("c_bigint", null, 1.0, 0.5, null, "555", "556"), row("c_float", null, 1.0, 0.5, null, "666.34", "667.34"), row("c_double", null, 1.0, 0.5, null, "777.56", "778.56"), - row("c_decimal", null, 1.0, 0.5, null, "888", "889"), - row("c_decimal_w_params", null, 1.0, 0.5, null, "999.67000", "1000.67000"), + row("c_decimal", null, 1.0, 0.5, null, "888.0", "889.0"), + row("c_decimal_w_params", null, 1.0, 0.5, null, "999.67", "1000.67"), row("c_timestamp", null, 1.0, 0.5, null, null, null), row("c_date", null, 1.0, 0.5, null, "2015-05-09", "2015-05-10"), row("c_string", 20.0, 1.0, 0.5, null, null, null), diff --git a/presto-spi/pom.xml b/presto-spi/pom.xml index ab0f3b900aeca..af28725666a45 100644 --- a/presto-spi/pom.xml +++ b/presto-spi/pom.xml @@ -97,5 +97,11 @@ json test + + + org.assertj + assertj-core + test + diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java index 0ce862828c53f..6863f8ec36cd6 100644 --- 
a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java @@ -24,15 +24,13 @@ public final class ColumnStatistics private final Estimate nullsFraction; private final Estimate distinctValuesCount; private final Estimate dataSize; - private final Optional lowValue; - private final Optional highValue; + private final Optional range; public ColumnStatistics( Estimate nullsFraction, Estimate distinctValuesCount, Estimate dataSize, - Optional lowValue, - Optional highValue) + Optional range) { this.nullsFraction = requireNonNull(nullsFraction, "nullsFraction is null"); if (!nullsFraction.isUnknown()) { @@ -48,8 +46,7 @@ public ColumnStatistics( if (!dataSize.isUnknown() && dataSize.getValue() < 0) { throw new IllegalArgumentException(format("dataSize must be greater than or equal to 0: %s", dataSize.getValue())); } - this.lowValue = requireNonNull(lowValue, "lowValue is null"); - this.highValue = requireNonNull(highValue, "highValue is null"); + this.range = requireNonNull(range, "range is null"); } public Estimate getNullsFraction() @@ -67,14 +64,9 @@ public Estimate getDataSize() return dataSize; } - public Optional getLowValue() + public Optional getRange() { - return lowValue; - } - - public Optional getHighValue() - { - return highValue; + return range; } @Override @@ -90,14 +82,13 @@ public boolean equals(Object o) return Objects.equals(nullsFraction, that.nullsFraction) && Objects.equals(distinctValuesCount, that.distinctValuesCount) && Objects.equals(dataSize, that.dataSize) && - Objects.equals(lowValue, that.lowValue) && - Objects.equals(highValue, that.highValue); + Objects.equals(range, that.range); } @Override public int hashCode() { - return Objects.hash(nullsFraction, distinctValuesCount, dataSize, lowValue, highValue); + return Objects.hash(nullsFraction, distinctValuesCount, dataSize, range); } @Override @@ -107,8 +98,7 @@ public String 
toString() "nullsFraction=" + nullsFraction + ", distinctValuesCount=" + distinctValuesCount + ", dataSize=" + dataSize + - ", lowValue=" + lowValue + - ", highValue=" + highValue + + ", range=" + range + '}'; } @@ -122,8 +112,7 @@ public static final class Builder private Estimate nullsFraction = Estimate.unknown(); private Estimate distinctValuesCount = Estimate.unknown(); private Estimate dataSize = Estimate.unknown(); - private Optional lowValue = Optional.empty(); - private Optional highValue = Optional.empty(); + private Optional range = Optional.empty(); public Builder setNullsFraction(Estimate nullsFraction) { @@ -143,21 +132,21 @@ public Builder setDataSize(Estimate dataSize) return this; } - public Builder setLowValue(Optional lowValue) + public Builder setRange(DoubleRange range) { - this.lowValue = requireNonNull(lowValue, "lowValue is null"); + this.range = Optional.of(requireNonNull(range, "range is null")); return this; } - public Builder setHighValue(Optional highValue) + public Builder setRange(Optional range) { - this.highValue = requireNonNull(highValue, "highValue is null"); + this.range = requireNonNull(range, "range is null"); return this; } public ColumnStatistics build() { - return new ColumnStatistics(nullsFraction, distinctValuesCount, dataSize, lowValue, highValue); + return new ColumnStatistics(nullsFraction, distinctValuesCount, dataSize, range); } } } diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DoubleRange.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DoubleRange.java new file mode 100644 index 0000000000000..1ea795b3c367d --- /dev/null +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DoubleRange.java @@ -0,0 +1,79 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.spi.statistics; + +import java.util.Objects; + +import static java.lang.Double.isNaN; +import static java.lang.String.format; + +public class DoubleRange +{ + private final double min; + private final double max; + + public DoubleRange(double min, double max) + { + if (isNaN(min)) { + throw new IllegalArgumentException("min must not be NaN"); + } + if (isNaN(max)) { + throw new IllegalArgumentException("max must not be NaN"); + } + if (min > max) { + throw new IllegalArgumentException(format("max must be greater than or equal to min. min: %s. max: %s. 
", min, max)); + } + this.min = min; + this.max = max; + } + + public double getMin() + { + return min; + } + + public double getMax() + { + return max; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + DoubleRange range = (DoubleRange) o; + return Double.compare(range.min, min) == 0 && + Double.compare(range.max, max) == 0; + } + + @Override + public int hashCode() + { + return Objects.hash(min, max); + } + + @Override + public String toString() + { + return "DoubleRange{" + + "min=" + min + + ", max=" + max + + '}'; + } +} diff --git a/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDoubleRange.java b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDoubleRange.java new file mode 100644 index 0000000000000..17c9edca705c9 --- /dev/null +++ b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDoubleRange.java @@ -0,0 +1,66 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.facebook.presto.spi.statistics; + +import org.testng.annotations.Test; + +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.testng.Assert.assertEquals; + +public class TestDoubleRange +{ + @Test + public void testRange() + { + assertRange(0, 0); + assertRange(0, 0.1); + assertRange(-0.1, 0.1); + assertRange(Double.NEGATIVE_INFINITY, 0); + assertRange(Float.NEGATIVE_INFINITY, 0); + assertRange(Double.NEGATIVE_INFINITY, -1.0 * Double.MAX_VALUE); + assertRange(Float.NEGATIVE_INFINITY, -1.0 * Double.MAX_VALUE); + assertRange(Float.NEGATIVE_INFINITY, -1.0 * Float.MAX_VALUE); + assertRange(Double.MAX_VALUE, Double.POSITIVE_INFINITY); + assertRange(Float.MAX_VALUE, Double.POSITIVE_INFINITY); + assertRange(Double.MAX_VALUE, Float.POSITIVE_INFINITY); + assertRange(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY); + assertRange(Double.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY); + assertRange(Float.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY); + assertThatThrownBy(() -> new DoubleRange(Double.NaN, 0)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(0, Double.NaN)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(Double.NaN, Double.NaN)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(Float.NaN, Float.NaN)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(1, 0)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(0, Double.NEGATIVE_INFINITY)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(0, Float.NEGATIVE_INFINITY)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(-1.0 * Double.MAX_VALUE, Double.NEGATIVE_INFINITY)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(-1.0 * Float.MAX_VALUE, 
Double.NEGATIVE_INFINITY)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(-1.0 * Double.MAX_VALUE, Float.NEGATIVE_INFINITY)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(Double.POSITIVE_INFINITY, Double.MAX_VALUE)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(Float.POSITIVE_INFINITY, Double.MAX_VALUE)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(Double.POSITIVE_INFINITY, Float.MAX_VALUE)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(Float.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(Double.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(Double.POSITIVE_INFINITY, 0)).isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new DoubleRange(Float.POSITIVE_INFINITY, 0)).isInstanceOf(IllegalArgumentException.class); + } + + private static void assertRange(double min, double max) + { + DoubleRange range = new DoubleRange(min, max); + assertEquals(range.getMin(), min); + assertEquals(range.getMax(), max); + } +} diff --git a/presto-tests/src/test/java/com/facebook/presto/tests/TestLocalQueries.java b/presto-tests/src/test/java/com/facebook/presto/tests/TestLocalQueries.java index 4f8f031a0d29a..0b167288705e0 100644 --- a/presto-tests/src/test/java/com/facebook/presto/tests/TestLocalQueries.java +++ b/presto-tests/src/test/java/com/facebook/presto/tests/TestLocalQueries.java @@ -88,9 +88,9 @@ public void testShowColumnStats() MaterializedResult expectedStatistics = resultBuilder(getSession(), VARCHAR, DOUBLE, DOUBLE, DOUBLE, 
DOUBLE, VARCHAR, VARCHAR) .row("nationkey", null, 25.0, 0.0, null, "0", "24") - .row("name", 177.0, 25.0, 0.0, null, "ALGERIA", "VIETNAM") + .row("name", 177.0, 25.0, 0.0, null, null, null) .row("regionkey", null, 5.0, 0.0, null, "0", "4") - .row("comment", 1857.0, 25.0, 0.0, null, " haggle. carefully final deposit...", "y final packages. slow foxes caj...") + .row("comment", 1857.0, 25.0, 0.0, null, null, null) .row(null, null, null, null, 25.0, null, null) .build(); diff --git a/presto-tpcds/src/main/java/com/facebook/presto/tpcds/statistics/TpcdsTableStatisticsFactory.java b/presto-tpcds/src/main/java/com/facebook/presto/tpcds/statistics/TpcdsTableStatisticsFactory.java index ef6f503a798f1..3b40f1243f970 100644 --- a/presto-tpcds/src/main/java/com/facebook/presto/tpcds/statistics/TpcdsTableStatisticsFactory.java +++ b/presto-tpcds/src/main/java/com/facebook/presto/tpcds/statistics/TpcdsTableStatisticsFactory.java @@ -16,27 +16,30 @@ import com.facebook.presto.spi.ColumnHandle; import com.facebook.presto.spi.statistics.ColumnStatistics; +import com.facebook.presto.spi.statistics.DoubleRange; import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatistics; import com.facebook.presto.spi.type.CharType; import com.facebook.presto.spi.type.DecimalType; -import com.facebook.presto.spi.type.TimeType; +import com.facebook.presto.spi.type.Decimals; import com.facebook.presto.spi.type.Type; import com.facebook.presto.spi.type.VarcharType; import com.facebook.presto.tpcds.TpcdsColumnHandle; import com.teradata.tpcds.Table; -import io.airlift.slice.Slices; +import io.airlift.slice.Slice; import java.time.LocalDate; import java.util.Map; import java.util.Optional; import static com.facebook.presto.spi.type.BigintType.BIGINT; -import static com.facebook.presto.spi.type.Chars.truncateToLengthAndTrimSpaces; import static com.facebook.presto.spi.type.DateType.DATE; +import static com.facebook.presto.spi.type.Decimals.isLongDecimal; 
import static com.facebook.presto.spi.type.Decimals.isShortDecimal; import static com.facebook.presto.spi.type.DoubleType.DOUBLE; import static com.facebook.presto.spi.type.IntegerType.INTEGER; +import static com.facebook.presto.spi.type.TimeType.TIME; +import static java.lang.Double.parseDouble; public class TpcdsTableStatisticsFactory { @@ -71,32 +74,43 @@ private ColumnStatistics toColumnStatistics(ColumnStatisticsData columnStatistic ColumnStatistics.Builder columnStatistics = ColumnStatistics.builder(); long nullCount = columnStatisticsData.getNullsCount(); columnStatistics.setNullsFraction(Estimate.of((double) nullCount / rowCount)); - columnStatistics.setLowValue(columnStatisticsData.getMin().map(value -> toPrestoValue(value, type))); - columnStatistics.setHighValue(columnStatisticsData.getMax().map(value -> toPrestoValue(value, type))); + columnStatistics.setRange(toRange(columnStatisticsData.getMin(), columnStatisticsData.getMax(), type)); columnStatistics.setDistinctValuesCount(Estimate.of(columnStatisticsData.getDistinctValuesCount())); columnStatistics.setDataSize(columnStatisticsData.getDataSize().map(Estimate::of).orElse(Estimate.unknown())); return columnStatistics.build(); } - private Object toPrestoValue(Object tpcdsValue, Type type) + private static Optional toRange(Optional min, Optional max, Type columnType) { - if (type instanceof VarcharType) { - return Slices.utf8Slice((String) tpcdsValue); + if (columnType instanceof VarcharType || columnType instanceof CharType || columnType.equals(TIME)) { + return Optional.empty(); } - else if (type instanceof CharType) { - return truncateToLengthAndTrimSpaces(Slices.utf8Slice((String) tpcdsValue), type); + if (!min.isPresent() || !max.isPresent()) { + return Optional.empty(); } - else if (tpcdsValue instanceof String && type.equals(DATE)) { - return LocalDate.parse((CharSequence) tpcdsValue).toEpochDay(); + return Optional.of(new DoubleRange(toDouble(min.get(), columnType), toDouble(max.get(), 
columnType))); + } + + private static double toDouble(Object value, Type type) + { + if (value instanceof String && type.equals(DATE)) { + return LocalDate.parse((CharSequence) value).toEpochDay(); } - else if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(DATE) || (type instanceof DecimalType && isShortDecimal(type))) { - return ((Number) tpcdsValue).longValue(); + if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(DATE)) { + return ((Number) value).doubleValue(); } - else if (type.equals(DOUBLE)) { - return ((Number) tpcdsValue).doubleValue(); + if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType) type; + if (isShortDecimal(decimalType)) { + return parseDouble(Decimals.toString(((Number) value).longValue(), decimalType.getScale())); + } + if (isLongDecimal(decimalType)) { + return parseDouble(Decimals.toString((Slice) value, decimalType.getScale())); + } + throw new IllegalArgumentException("Unexpected decimal type: " + decimalType); } - else if (type.equals(TimeType.TIME)) { - return ((Number) tpcdsValue).longValue(); + if (type.equals(DOUBLE)) { + return ((Number) value).doubleValue(); } throw new IllegalArgumentException("unsupported column type " + type); } diff --git a/presto-tpcds/src/test/java/com/facebook/presto/tpcds/TestTpcdsMetadataStatistics.java b/presto-tpcds/src/test/java/com/facebook/presto/tpcds/TestTpcdsMetadataStatistics.java index 4051a3e8e1553..f14da5781305b 100644 --- a/presto-tpcds/src/test/java/com/facebook/presto/tpcds/TestTpcdsMetadataStatistics.java +++ b/presto-tpcds/src/test/java/com/facebook/presto/tpcds/TestTpcdsMetadataStatistics.java @@ -19,17 +19,15 @@ import com.facebook.presto.spi.ConnectorTableHandle; import com.facebook.presto.spi.SchemaTableName; import com.facebook.presto.spi.statistics.ColumnStatistics; +import com.facebook.presto.spi.statistics.DoubleRange; import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatistics; -import 
com.google.common.primitives.Primitives; import com.teradata.tpcds.Table; import com.teradata.tpcds.column.CallCenterColumn; import com.teradata.tpcds.column.WebSiteColumn; -import io.airlift.slice.Slices; import org.testng.annotations.Test; import java.util.Map; -import java.util.Optional; import java.util.stream.Stream; import static com.facebook.presto.spi.Constraint.alwaysTrue; @@ -71,16 +69,6 @@ public void testTableStatsExistenceSupportedSchema() for (ColumnHandle column : metadata.getColumnHandles(session, tableHandle).values()) { assertTrue(tableStatistics.getColumnStatistics().containsKey(column)); assertNotNull(tableStatistics.getColumnStatistics().get(column)); - - TpcdsColumnHandle tpcdsColumn = (TpcdsColumnHandle) column; - Optional low = tableStatistics.getColumnStatistics().get(column).getLowValue(); - if (low.isPresent()) { - assertEquals(low.get().getClass(), Primitives.wrap(tpcdsColumn.getType().getJavaType())); - } - Optional high = tableStatistics.getColumnStatistics().get(column).getLowValue(); - if (high.isPresent()) { - assertEquals(high.get().getClass(), Primitives.wrap(tpcdsColumn.getType().getJavaType())); - } } })); } @@ -107,8 +95,7 @@ public void testTableStatsDetails() ColumnStatistics.builder() .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(6)) - .setLowValue(Optional.of(1L)) - .setHighValue(Optional.of(6L)) + .setRange(new DoubleRange(1, 6)) .build()); // varchar @@ -117,8 +104,6 @@ public void testTableStatsDetails() ColumnStatistics.builder() .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(3)) - .setLowValue(Optional.of(Slices.utf8Slice("AAAAAAAABAAAAAAA"))) - .setHighValue(Optional.of(Slices.utf8Slice("AAAAAAAAEAAAAAAA"))) .setDataSize(Estimate.of(48.0)) .build()); @@ -128,8 +113,6 @@ public void testTableStatsDetails() ColumnStatistics.builder() .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(1)) - .setLowValue(Optional.of(Slices.utf8Slice("31904"))) - 
.setHighValue(Optional.of(Slices.utf8Slice("31904"))) .setDataSize(Estimate.of(5.0)) .build()); @@ -139,8 +122,7 @@ public void testTableStatsDetails() ColumnStatistics.builder() .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(1)) - .setLowValue(Optional.of(-500L)) - .setHighValue(Optional.of(-500L)) + .setRange(new DoubleRange(-5, -5)) .build()); // date @@ -149,8 +131,7 @@ public void testTableStatsDetails() ColumnStatistics.builder() .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(4)) - .setLowValue(Optional.of(10227L)) - .setHighValue(Optional.of(11688L)) + .setRange(new DoubleRange(10227L, 11688L)) .build()); // only null values @@ -159,8 +140,6 @@ public void testTableStatsDetails() ColumnStatistics.builder() .setNullsFraction(Estimate.of(1)) .setDistinctValuesCount(Estimate.of(0)) - .setLowValue(Optional.empty()) - .setHighValue(Optional.empty()) .build()); } @@ -179,8 +158,7 @@ public void testNullFraction() ColumnStatistics.builder() .setNullsFraction(Estimate.of(0.5)) .setDistinctValuesCount(Estimate.of(3)) - .setLowValue(Optional.of(10819L)) - .setHighValue(Optional.of(11549L)) + .setRange(new DoubleRange(10819L, 11549L)) .build()); } @@ -189,7 +167,6 @@ private void assertColumnStatistics(ColumnStatistics actual, ColumnStatistics ex estimateAssertion.assertClose(actual.getNullsFraction(), expected.getNullsFraction(), "Nulls fraction"); estimateAssertion.assertClose(actual.getDataSize(), expected.getDataSize(), "Data size"); estimateAssertion.assertClose(actual.getDistinctValuesCount(), expected.getDistinctValuesCount(), "Distinct values count"); - assertEquals(actual.getLowValue(), expected.getLowValue()); - assertEquals(actual.getHighValue(), expected.getHighValue()); + assertEquals(actual.getRange(), expected.getRange()); } } diff --git a/presto-tpch/src/main/java/com/facebook/presto/tpch/TpchMetadata.java b/presto-tpch/src/main/java/com/facebook/presto/tpch/TpchMetadata.java index 
53dcba6be6ac1..7a66f861ce3c4 100644 --- a/presto-tpch/src/main/java/com/facebook/presto/tpch/TpchMetadata.java +++ b/presto-tpch/src/main/java/com/facebook/presto/tpch/TpchMetadata.java @@ -33,6 +33,7 @@ import com.facebook.presto.spi.predicate.NullableValue; import com.facebook.presto.spi.predicate.TupleDomain; import com.facebook.presto.spi.statistics.ColumnStatistics; +import com.facebook.presto.spi.statistics.DoubleRange; import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatistics; import com.facebook.presto.spi.type.Type; @@ -382,24 +383,31 @@ private ColumnStatistics toColumnStatistics(ColumnStatisticsData stats, Type col .setNullsFraction(Estimate.zero()) .setDistinctValuesCount(stats.getDistinctValuesCount().map(Estimate::of).orElse(Estimate.unknown())) .setDataSize(stats.getDataSize().map(Estimate::of).orElse(Estimate.unknown())) - .setLowValue(stats.getMin().map(value -> toPrestoValue(value, columnType))) - .setHighValue(stats.getMax().map(value -> toPrestoValue(value, columnType))) + .setRange(toRange(stats.getMin(), stats.getMax(), columnType)) .build(); } - private Object toPrestoValue(Object tpchValue, Type columnType) + private static Optional toRange(Optional min, Optional max, Type columnType) { if (columnType instanceof VarcharType) { - return Slices.utf8Slice((String) tpchValue); + return Optional.empty(); } - if (tpchValue instanceof String && columnType.equals(DATE)) { - return LocalDate.parse((CharSequence) tpchValue).toEpochDay(); + if (!min.isPresent() || !max.isPresent()) { + return Optional.empty(); + } + return Optional.of(new DoubleRange(toDouble(min.get(), columnType), toDouble(max.get(), columnType))); + } + + private static double toDouble(Object value, Type columnType) + { + if (value instanceof String && columnType.equals(DATE)) { + return LocalDate.parse((CharSequence) value).toEpochDay(); } if (columnType.equals(BIGINT) || columnType.equals(INTEGER) || columnType.equals(DATE)) { - 
return ((Number) tpchValue).longValue(); + return ((Number) value).longValue(); } if (columnType.equals(DOUBLE)) { - return ((Number) tpchValue).doubleValue(); + return ((Number) value).doubleValue(); } throw new IllegalArgumentException("unsupported column type " + columnType); } diff --git a/presto-tpch/src/test/java/com/facebook/presto/tpch/EstimateAssertion.java b/presto-tpch/src/test/java/com/facebook/presto/tpch/EstimateAssertion.java index f03c91bbef8c3..fdc6d87d994f3 100644 --- a/presto-tpch/src/test/java/com/facebook/presto/tpch/EstimateAssertion.java +++ b/presto-tpch/src/test/java/com/facebook/presto/tpch/EstimateAssertion.java @@ -13,6 +13,7 @@ */ package com.facebook.presto.tpch; +import com.facebook.presto.spi.statistics.DoubleRange; import com.facebook.presto.spi.statistics.Estimate; import io.airlift.slice.Slice; @@ -60,6 +61,12 @@ private void assertClose(Object actual, Object expected, String comparedValue) assertEquals(actual.getClass(), expected.getClass(), comparedValue); assertEquals(((Slice) actual).toStringUtf8(), ((Slice) expected).toStringUtf8()); } + else if (actual instanceof DoubleRange) { + DoubleRange actualRange = (DoubleRange) actual; + DoubleRange expectedRange = (DoubleRange) expected; + assertClose(actualRange.getMin(), expectedRange.getMin(), comparedValue); + assertClose(actualRange.getMax(), expectedRange.getMax(), comparedValue); + } else { double actualDouble = toDouble(actual); double expectedDouble = toDouble(expected); diff --git a/presto-tpch/src/test/java/com/facebook/presto/tpch/TestTpchMetadata.java b/presto-tpch/src/test/java/com/facebook/presto/tpch/TestTpchMetadata.java index 78a4b75f44ec5..3dd19c5aab783 100644 --- a/presto-tpch/src/test/java/com/facebook/presto/tpch/TestTpchMetadata.java +++ b/presto-tpch/src/test/java/com/facebook/presto/tpch/TestTpchMetadata.java @@ -22,6 +22,7 @@ import com.facebook.presto.spi.predicate.NullableValue; import com.facebook.presto.spi.predicate.TupleDomain; import 
com.facebook.presto.spi.statistics.ColumnStatistics; +import com.facebook.presto.spi.statistics.DoubleRange; import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatistics; import com.facebook.presto.tpch.util.PredicateUtils; @@ -166,7 +167,7 @@ public void testColumnStats() testColumnStats(schema, PART_SUPPLIER, PART_KEY, columnStatistics(200_000 * scaleFactor, 1, 200_000 * scaleFactor)); //dictionary - testColumnStats(schema, CUSTOMER, MARKET_SEGMENT, columnStatistics(5, "AUTOMOBILE", "MACHINERY", 45)); + testColumnStats(schema, CUSTOMER, MARKET_SEGMENT, columnStatistics(5, 45)); //low-valued numeric column testColumnStats(schema, LINE_ITEM, LINE_NUMBER, columnStatistics(7, 1, 7)); @@ -176,11 +177,11 @@ public void testColumnStats() //varchar and double columns if (schema.equals("tiny")) { - testColumnStats(schema, CUSTOMER, NAME, columnStatistics(150_000 * scaleFactor, "Customer#000000001", "Customer#000001500", 27000)); + testColumnStats(schema, CUSTOMER, NAME, columnStatistics(150_000 * scaleFactor, 27000)); testColumnStats(schema, PART, RETAIL_PRICE, columnStatistics(1_099, 901, 1900.99)); } else if (schema.equals("sf1")) { - testColumnStats(schema, CUSTOMER, NAME, columnStatistics(150_000 * scaleFactor, "Customer#000000001", "Customer#000150000", 2700000)); + testColumnStats(schema, CUSTOMER, NAME, columnStatistics(150_000 * scaleFactor, 2700000)); testColumnStats(schema, PART, RETAIL_PRICE, columnStatistics(20899, 901, 2089.99)); } }); @@ -193,15 +194,15 @@ public void testColumnStatsWithConstraints() double scaleFactor = TpchMetadata.schemaNameToScaleFactor(schema); //value count, min and max are supported for the constrained column - testColumnStats(schema, ORDERS, ORDER_STATUS, constraint(ORDER_STATUS, "F"), columnStatistics(1, "F", "F", 1)); - testColumnStats(schema, ORDERS, ORDER_STATUS, constraint(ORDER_STATUS, "O"), columnStatistics(1, "O", "O", 1)); - testColumnStats(schema, ORDERS, ORDER_STATUS, 
constraint(ORDER_STATUS, "P"), columnStatistics(1, "P", "P", 1)); + testColumnStats(schema, ORDERS, ORDER_STATUS, constraint(ORDER_STATUS, "F"), columnStatistics(1, 1)); + testColumnStats(schema, ORDERS, ORDER_STATUS, constraint(ORDER_STATUS, "O"), columnStatistics(1, 1)); + testColumnStats(schema, ORDERS, ORDER_STATUS, constraint(ORDER_STATUS, "P"), columnStatistics(1, 1)); //only min and max values for non-scaling columns can be estimated for non-constrained columns testColumnStats(schema, ORDERS, ORDER_KEY, constraint(ORDER_STATUS, "F"), rangeStatistics(3, 6_000_000 * scaleFactor)); testColumnStats(schema, ORDERS, ORDER_KEY, constraint(ORDER_STATUS, "O"), rangeStatistics(1, 6_000_000 * scaleFactor)); testColumnStats(schema, ORDERS, ORDER_KEY, constraint(ORDER_STATUS, "P"), rangeStatistics(65, 6_000_000 * scaleFactor)); - testColumnStats(schema, ORDERS, CLERK, constraint(ORDER_STATUS, "O"), rangeStatistics("Clerk#000000001", "Clerk#000001000", 15000)); + testColumnStats(schema, ORDERS, CLERK, constraint(ORDER_STATUS, "O"), createColumnStatistics(Optional.empty(), Optional.empty(), Optional.of(15000.0))); //nothing can be said for always false constraints testColumnStats(schema, ORDERS, ORDER_STATUS, alwaysFalse(), noColumnStatistics()); @@ -210,21 +211,21 @@ public void testColumnStatsWithConstraints() testColumnStats(schema, ORDERS, ORDER_KEY, constraint(ORDER_STATUS, "NO SUCH STATUS"), noColumnStatistics()); //unmodified stats are returned for the always true constraint - testColumnStats(schema, ORDERS, ORDER_STATUS, alwaysTrue(), columnStatistics(3, "F", "P", 3)); + testColumnStats(schema, ORDERS, ORDER_STATUS, alwaysTrue(), columnStatistics(3, 3)); testColumnStats(schema, ORDERS, ORDER_KEY, alwaysTrue(), columnStatistics(1_500_000 * scaleFactor, 1, 6_000_000 * scaleFactor)); //constraints on columns other than ORDER_STATUS are not supported and are ignored - testColumnStats(schema, ORDERS, ORDER_STATUS, constraint(CLERK, "NO SUCH CLERK"), columnStatistics(3, 
"F", "P", 3)); + testColumnStats(schema, ORDERS, ORDER_STATUS, constraint(CLERK, "NO SUCH CLERK"), columnStatistics(3, 3)); testColumnStats(schema, ORDERS, ORDER_KEY, constraint(CLERK, "Clerk#000000001"), columnStatistics(1_500_000 * scaleFactor, 1, 6_000_000 * scaleFactor)); //compound constraints are supported - testColumnStats(schema, ORDERS, ORDER_STATUS, constraint(ORDER_STATUS, "F", "NO SUCH STATUS"), columnStatistics(1, "F", "F", 1)); + testColumnStats(schema, ORDERS, ORDER_STATUS, constraint(ORDER_STATUS, "F", "NO SUCH STATUS"), columnStatistics(1, 1)); testColumnStats(schema, ORDERS, ORDER_KEY, constraint(ORDER_STATUS, "F", "NO SUCH STATUS"), rangeStatistics(3, 6_000_000 * scaleFactor)); - testColumnStats(schema, ORDERS, ORDER_STATUS, constraint(ORDER_STATUS, "F", "O"), columnStatistics(2, "F", "O", 2)); + testColumnStats(schema, ORDERS, ORDER_STATUS, constraint(ORDER_STATUS, "F", "O"), columnStatistics(2, 2)); testColumnStats(schema, ORDERS, ORDER_KEY, constraint(ORDER_STATUS, "F", "O"), rangeStatistics(1, 6_000_000 * scaleFactor)); - testColumnStats(schema, ORDERS, ORDER_STATUS, constraint(ORDER_STATUS, "F", "O", "P"), columnStatistics(3, "F", "P", 3)); + testColumnStats(schema, ORDERS, ORDER_STATUS, constraint(ORDER_STATUS, "F", "O", "P"), columnStatistics(3, 3)); testColumnStats(schema, ORDERS, ORDER_KEY, constraint(ORDER_STATUS, "F", "O", "P"), columnStatistics(1_500_000 * scaleFactor, 1, 6_000_000 * scaleFactor)); }); } @@ -247,8 +248,7 @@ private void testColumnStats(String schema, TpchTable table, TpchColumn co estimateAssertion.assertClose(actual.getDistinctValuesCount(), expected.getDistinctValuesCount(), "distinctValuesCount"); estimateAssertion.assertClose(actual.getDataSize(), expected.getDataSize(), "dataSize"); estimateAssertion.assertClose(actual.getNullsFraction(), expected.getNullsFraction(), "nullsFraction"); - estimateAssertion.assertClose(actual.getLowValue(), expected.getLowValue(), "lowValue"); - 
estimateAssertion.assertClose(actual.getHighValue(), expected.getHighValue(), "highValue"); + estimateAssertion.assertClose(actual.getRange(), expected.getRange(), "range"); } @Test @@ -380,36 +380,30 @@ private static ConnectorTableLayoutResult getTableOnlyLayout(TpchMetadata tpchMe private ColumnStatistics noColumnStatistics() { - return createColumnStatistics(Optional.of(0.0), empty(), empty(), Optional.of(0.0)); + return createColumnStatistics(Optional.of(0.0), Optional.empty(), Optional.of(0.0)); } - private ColumnStatistics columnStatistics(double distinctValuesCount, String min, String max, double dataSize) + private ColumnStatistics columnStatistics(double distinctValuesCount, double dataSize) { - return createColumnStatistics(Optional.of(distinctValuesCount), Optional.of(utf8Slice(min)), Optional.of(utf8Slice(max)), Optional.of(dataSize)); + return createColumnStatistics(Optional.of(distinctValuesCount), Optional.empty(), Optional.of(dataSize)); } private ColumnStatistics columnStatistics(double distinctValuesCount, double min, double max) { - return createColumnStatistics(Optional.of(distinctValuesCount), Optional.of(min), Optional.of(max), Optional.empty()); - } - - private ColumnStatistics rangeStatistics(String min, String max, double dataSize) - { - return createColumnStatistics(empty(), Optional.of(utf8Slice(min)), Optional.of(utf8Slice(max)), Optional.of(dataSize)); + return createColumnStatistics(Optional.of(distinctValuesCount), Optional.of(new DoubleRange(min, max)), Optional.empty()); } private ColumnStatistics rangeStatistics(double min, double max) { - return createColumnStatistics(empty(), Optional.of(min), Optional.of(max), Optional.empty()); + return createColumnStatistics(empty(), Optional.of(new DoubleRange(min, max)), Optional.empty()); } - private static ColumnStatistics createColumnStatistics(Optional distinctValuesCount, Optional min, Optional max, Optional dataSize) + private static ColumnStatistics createColumnStatistics(Optional 
distinctValuesCount, Optional range, Optional dataSize) { return ColumnStatistics.builder() .setNullsFraction(Estimate.zero()) .setDistinctValuesCount(toEstimate(distinctValuesCount)) - .setLowValue(min) - .setHighValue(max) + .setRange(range) .setDataSize(toEstimate(dataSize)) .build(); } From 5f5e6629d3d4d22314b06fbd630d3510d672a94c Mon Sep 17 00:00:00 2001 From: Andrii Rosa Date: Wed, 19 Sep 2018 14:03:20 -0400 Subject: [PATCH 12/14] Refactor MetastoreHiveStatisticsProvider - Add sanity checks to make sure that statistics returned make sense - Make the class to be more unit test friendly - Add extensive unit tests --- .../presto/hive/HiveClientConfig.java | 14 + .../facebook/presto/hive/HiveErrorCode.java | 1 + .../facebook/presto/hive/HiveMetadata.java | 8 +- .../presto/hive/HiveMetadataFactory.java | 2 +- .../presto/hive/HiveSessionProperties.java | 11 + .../presto/hive/PartitionStatistics.java | 28 + .../statistics/HiveStatisticsProvider.java | 16 +- .../MetastoreHiveStatisticsProvider.java | 938 ++++++++++++------ .../facebook/presto/hive/util/Statistics.java | 135 --- .../presto/hive/TestHiveClientConfig.java | 3 + .../TestMetastoreHiveStatisticsProvider.java | 831 ++++++++++++++++ .../tests/hive/TestHiveTableStatistics.java | 42 +- .../spi/statistics/ColumnStatistics.java | 7 + .../presto/spi/statistics/DoubleRange.java | 10 + .../spi/statistics/TestDoubleRange.java | 11 + 15 files changed, 1559 insertions(+), 498 deletions(-) diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientConfig.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientConfig.java index 2b22edf9afbdf..d4a0eba7c6edd 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientConfig.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientConfig.java @@ -137,6 +137,7 @@ public class HiveClientConfig private boolean tableStatisticsEnabled = true; private int partitionStatisticsSampleSize = 100; + private boolean 
ignoreCorruptedStatistics; private boolean collectColumnStatisticsOnWrite; public int getMaxInitialSplits() @@ -1094,6 +1095,19 @@ public HiveClientConfig setPartitionStatisticsSampleSize(int partitionStatistics return this; } + public boolean isIgnoreCorruptedStatistics() + { + return ignoreCorruptedStatistics; + } + + @Config("hive.ignore-corrupted-statistics") + @ConfigDescription("Ignore corrupted statistics rather than failing") + public HiveClientConfig setIgnoreCorruptedStatistics(boolean ignoreCorruptedStatistics) + { + this.ignoreCorruptedStatistics = ignoreCorruptedStatistics; + return this; + } + public boolean isCollectColumnStatisticsOnWrite() { return collectColumnStatisticsOnWrite; diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveErrorCode.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveErrorCode.java index d9505e8caa7f9..a1caa955cd4d7 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveErrorCode.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveErrorCode.java @@ -61,6 +61,7 @@ public enum HiveErrorCode HIVE_TABLE_NOT_READABLE(34, USER_ERROR), HIVE_TABLE_DROPPED_DURING_QUERY(35, EXTERNAL), // HIVE_TOO_MANY_BUCKET_SORT_FILES(36) is deprecated + HIVE_CORRUPTED_COLUMN_STATISTICS(37, EXTERNAL), /**/; private final ErrorCode errorCode; diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java index da7b438317206..53fe86bef79e1 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java @@ -526,12 +526,14 @@ public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTab if (!isStatisticsEnabled(session)) { return TableStatistics.empty(); } - List hivePartitions = getPartitionsAsList(tableHandle, constraint); - Map tableColumns = getColumnHandles(session, tableHandle) + Map columns = 
getColumnHandles(session, tableHandle) .entrySet().stream() .filter(entry -> !((HiveColumnHandle) entry.getValue()).isHidden()) .collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue)); - return hiveStatisticsProvider.getTableStatistics(session, tableHandle, hivePartitions, tableColumns); + Map columnTypes = columns.entrySet().stream() + .collect(toImmutableMap(Map.Entry::getKey, entry -> getColumnMetadata(session, tableHandle, entry.getValue()).getType())); + List partitions = getPartitionsAsList(tableHandle, constraint); + return hiveStatisticsProvider.getTableStatistics(session, ((HiveTableHandle) tableHandle).getSchemaTableName(), columns, columnTypes, partitions); } private List getPartitionsAsList(ConnectorTableHandle tableHandle, Constraint constraint) diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadataFactory.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadataFactory.java index 0da42584d631e..5fd8d754293ff 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadataFactory.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadataFactory.java @@ -160,7 +160,7 @@ public HiveMetadata get() partitionUpdateCodec, typeTranslator, prestoVersion, - new MetastoreHiveStatisticsProvider(typeManager, metastore, timeZone), + new MetastoreHiveStatisticsProvider(metastore), maxPartitions); } } diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveSessionProperties.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveSessionProperties.java index 62988901396a5..ecddee3cdb642 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveSessionProperties.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveSessionProperties.java @@ -71,6 +71,7 @@ public final class HiveSessionProperties private static final String SORTED_WRITING_ENABLED = "sorted_writing_enabled"; private static final String STATISTICS_ENABLED = "statistics_enabled"; private static 
final String PARTITION_STATISTICS_SAMPLE_SIZE = "partition_statistics_sample_size"; + private static final String IGNORE_CORRUPTED_STATISTICS = "ignore_corrupted_statistics"; private static final String COLLECT_COLUMN_STATISTICS_ON_WRITE = "collect_column_statistics_on_write"; private final List> sessionProperties; @@ -256,6 +257,11 @@ public HiveSessionProperties(HiveClientConfig hiveClientConfig, OrcFileWriterCon "Maximum sample size of the partitions column statistics", hiveClientConfig.getPartitionStatisticsSampleSize(), false), + booleanProperty( + IGNORE_CORRUPTED_STATISTICS, + "Experimental: Ignore corrupted statistics rather than failing", + hiveClientConfig.isIgnoreCorruptedStatistics(), + false), booleanProperty( COLLECT_COLUMN_STATISTICS_ON_WRITE, "Experimental: Enables automatic column level statistics collection on write", @@ -437,6 +443,11 @@ public static int getPartitionStatisticsSampleSize(ConnectorSession session) return size; } + public static boolean isIgnoreCorruptedStatistics(ConnectorSession session) + { + return session.getProperty(IGNORE_CORRUPTED_STATISTICS, Boolean.class); + } + public static boolean isCollectColumnStatisticsOnWrite(ConnectorSession session) { return session.getProperty(COLLECT_COLUMN_STATISTICS_ON_WRITE, Boolean.class); diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/PartitionStatistics.java b/presto-hive/src/main/java/com/facebook/presto/hive/PartitionStatistics.java index 3e1bfb078ee0b..f008bac259fe0 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/PartitionStatistics.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/PartitionStatistics.java @@ -81,4 +81,32 @@ public String toString() .add("columnStatistics", columnStatistics) .toString(); } + + public static Builder builder() + { + return new Builder(); + } + + public static class Builder + { + private HiveBasicStatistics basicStatistics = HiveBasicStatistics.createEmptyStatistics(); + private Map columnStatistics = 
ImmutableMap.of(); + + public Builder setBasicStatistics(HiveBasicStatistics basicStatistics) + { + this.basicStatistics = requireNonNull(basicStatistics, "basicStatistics is null"); + return this; + } + + public Builder setColumnStatistics(Map columnStatistics) + { + this.columnStatistics = ImmutableMap.copyOf(requireNonNull(columnStatistics, "columnStatistics is null")); + return this; + } + + public PartitionStatistics build() + { + return new PartitionStatistics(basicStatistics, columnStatistics); + } + } } diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/HiveStatisticsProvider.java b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/HiveStatisticsProvider.java index 7511d6a6e8041..539250431e021 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/HiveStatisticsProvider.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/HiveStatisticsProvider.java @@ -17,20 +17,22 @@ import com.facebook.presto.hive.HivePartition; import com.facebook.presto.spi.ColumnHandle; import com.facebook.presto.spi.ConnectorSession; -import com.facebook.presto.spi.ConnectorTableHandle; +import com.facebook.presto.spi.SchemaTableName; import com.facebook.presto.spi.statistics.TableStatistics; +import com.facebook.presto.spi.type.Type; import java.util.List; import java.util.Map; -/** - * @param tableColumns must be Hive columns, not hidden (Presto-internal) columns - */ public interface HiveStatisticsProvider { + /** + * @param columns must be Hive columns, not hidden (Presto-internal) columns + */ TableStatistics getTableStatistics( ConnectorSession session, - ConnectorTableHandle tableHandle, - List hivePartitions, - Map tableColumns); + SchemaTableName table, + Map columns, + Map columnTypes, + List partitions); } diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java 
b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java index 7318b53f3d2e7..fff51b416713b 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java @@ -17,15 +17,17 @@ import com.facebook.presto.hive.HiveBasicStatistics; import com.facebook.presto.hive.HiveColumnHandle; import com.facebook.presto.hive.HivePartition; -import com.facebook.presto.hive.HiveTableHandle; import com.facebook.presto.hive.PartitionStatistics; +import com.facebook.presto.hive.metastore.DateStatistics; +import com.facebook.presto.hive.metastore.DecimalStatistics; +import com.facebook.presto.hive.metastore.DoubleStatistics; import com.facebook.presto.hive.metastore.HiveColumnStatistics; +import com.facebook.presto.hive.metastore.IntegerStatistics; import com.facebook.presto.hive.metastore.SemiTransactionalHiveMetastore; -import com.facebook.presto.hive.util.Statistics.Range; import com.facebook.presto.spi.ColumnHandle; import com.facebook.presto.spi.ConnectorSession; -import com.facebook.presto.spi.ConnectorTableHandle; -import com.facebook.presto.spi.block.Block; +import com.facebook.presto.spi.PrestoException; +import com.facebook.presto.spi.SchemaTableName; import com.facebook.presto.spi.predicate.NullableValue; import com.facebook.presto.spi.statistics.ColumnStatistics; import com.facebook.presto.spi.statistics.DoubleRange; @@ -34,32 +36,36 @@ import com.facebook.presto.spi.type.DecimalType; import com.facebook.presto.spi.type.Decimals; import com.facebook.presto.spi.type.Type; -import com.facebook.presto.spi.type.TypeManager; import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ImmutableList; +import com.google.common.base.VerifyException; import com.google.common.collect.ImmutableMap; import com.google.common.hash.HashFunction; +import 
com.google.common.primitives.Ints; +import com.google.common.primitives.Shorts; +import com.google.common.primitives.SignedBytes; import io.airlift.log.Logger; import io.airlift.slice.Slice; -import org.joda.time.DateTimeZone; +import java.math.BigDecimal; +import java.time.LocalDate; import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.OptionalDouble; import java.util.OptionalLong; -import java.util.PrimitiveIterator; -import java.util.function.Function; -import java.util.stream.DoubleStream; +import java.util.Set; +import static com.facebook.presto.hive.HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS; +import static com.facebook.presto.hive.HivePartition.UNPARTITIONED_ID; import static com.facebook.presto.hive.HiveSessionProperties.getPartitionStatisticsSampleSize; +import static com.facebook.presto.hive.HiveSessionProperties.isIgnoreCorruptedStatistics; import static com.facebook.presto.hive.HiveSessionProperties.isStatisticsEnabled; -import static com.facebook.presto.hive.util.Statistics.getMinMaxAsPrestoNativeValues; -import static com.facebook.presto.spi.predicate.Utils.nativeValueToBlock; import static com.facebook.presto.spi.type.BigintType.BIGINT; +import static com.facebook.presto.spi.type.Chars.isCharType; import static com.facebook.presto.spi.type.DateType.DATE; import static com.facebook.presto.spi.type.Decimals.isLongDecimal; import static com.facebook.presto.spi.type.Decimals.isShortDecimal; @@ -67,17 +73,19 @@ import static com.facebook.presto.spi.type.IntegerType.INTEGER; import static com.facebook.presto.spi.type.RealType.REAL; import static com.facebook.presto.spi.type.SmallintType.SMALLINT; -import static com.facebook.presto.spi.type.StandardTypes.CHAR; -import static com.facebook.presto.spi.type.StandardTypes.VARCHAR; import static com.facebook.presto.spi.type.TinyintType.TINYINT; +import static 
com.facebook.presto.spi.type.Varchars.isVarcharType; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Verify.verify; import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.ImmutableSet.toImmutableSet; import static com.google.common.collect.Maps.immutableEntry; import static com.google.common.hash.Hashing.murmur3_128; +import static java.lang.Double.isFinite; +import static java.lang.Double.isNaN; import static java.lang.Double.parseDouble; import static java.lang.Float.intBitsToFloat; +import static java.lang.String.format; import static java.util.Collections.unmodifiableList; import static java.util.Objects.requireNonNull; @@ -86,441 +94,709 @@ public class MetastoreHiveStatisticsProvider { private static final Logger log = Logger.get(MetastoreHiveStatisticsProvider.class); - private final TypeManager typeManager; - private final SemiTransactionalHiveMetastore metastore; - private final DateTimeZone timeZone; + private final PartitionsStatisticsProvider statisticsProvider; - public MetastoreHiveStatisticsProvider(TypeManager typeManager, SemiTransactionalHiveMetastore metastore, DateTimeZone timeZone) + public MetastoreHiveStatisticsProvider(SemiTransactionalHiveMetastore metastore) { - this.typeManager = requireNonNull(typeManager, "typeManager is null"); - this.metastore = requireNonNull(metastore, "metastore is null"); - this.timeZone = requireNonNull(timeZone, "timeZone is null"); + requireNonNull(metastore, "metastore is null"); + this.statisticsProvider = (table, hivePartitions) -> getPartitionsStatistics(metastore, table, hivePartitions); + } + + @VisibleForTesting + MetastoreHiveStatisticsProvider(PartitionsStatisticsProvider statisticsProvider) + { + this.statisticsProvider = requireNonNull(statisticsProvider, "statisticsProvider is null"); + } + + private static Map getPartitionsStatistics(SemiTransactionalHiveMetastore metastore, 
SchemaTableName table, List hivePartitions) + { + if (hivePartitions.isEmpty()) { + return ImmutableMap.of(); + } + boolean unpartitioned = hivePartitions.stream().anyMatch(partition -> partition.getPartitionId().equals(UNPARTITIONED_ID)); + if (unpartitioned) { + checkArgument(hivePartitions.size() == 1, "expected only one hive partition"); + return ImmutableMap.of(UNPARTITIONED_ID, metastore.getTableStatistics(table.getSchemaName(), table.getTableName())); + } + Set partitionNames = hivePartitions.stream() + .map(HivePartition::getPartitionId) + .collect(toImmutableSet()); + return metastore.getPartitionStatistics(table.getSchemaName(), table.getTableName(), partitionNames); } @Override - public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTableHandle tableHandle, List queriedPartitions, Map tableColumns) + public TableStatistics getTableStatistics( + ConnectorSession session, + SchemaTableName table, + Map columns, + Map columnTypes, + List partitions) { if (!isStatisticsEnabled(session)) { return TableStatistics.empty(); } - - int queriedPartitionsCount = queriedPartitions.size(); int sampleSize = getPartitionStatisticsSampleSize(session); - List samplePartitions = getPartitionsSample(queriedPartitions, sampleSize); - - Map statisticsSample = getPartitionsStatistics((HiveTableHandle) tableHandle, samplePartitions); - - TableStatistics.Builder tableStatistics = TableStatistics.builder(); - OptionalDouble rowsPerPartition = calculateRowsPerPartition(statisticsSample); - Estimate rowCount = calculateRowsCount(rowsPerPartition, queriedPartitionsCount); - tableStatistics.setRowCount(rowCount); - for (Map.Entry columnEntry : tableColumns.entrySet()) { - String columnName = columnEntry.getKey(); - HiveColumnHandle hiveColumnHandle = (HiveColumnHandle) columnEntry.getValue(); - - List lowValueCandidates = ImmutableList.of(); - List highValueCandidates = ImmutableList.of(); - - Type prestoType = 
typeManager.getType(hiveColumnHandle.getTypeSignature()); - Estimate nullsFraction; - Estimate dataSize; - ColumnStatistics.Builder columnStatistics = ColumnStatistics.builder(); - if (hiveColumnHandle.isPartitionKey()) { - columnStatistics.setDistinctValuesCount(countDistinctPartitionKeys(hiveColumnHandle, queriedPartitions)); - nullsFraction = calculateNullsFractionForPartitioningKey(hiveColumnHandle, queriedPartitions, statisticsSample, rowCount, rowsPerPartition); - if (isLowHighSupportedForType(prestoType)) { - lowValueCandidates = queriedPartitions.stream() - .map(HivePartition::getKeys) - .map(keys -> keys.get(hiveColumnHandle)) - .filter(value -> !value.isNull()) - .map(NullableValue::getValue) - .collect(toImmutableList()); - highValueCandidates = lowValueCandidates; - } - dataSize = calculateDataSizeForPartitioningKey(hiveColumnHandle, queriedPartitions, statisticsSample, rowCount, rowsPerPartition); - } - else { - columnStatistics.setDistinctValuesCount(calculateDistinctValuesCount(statisticsSample, columnName)); - nullsFraction = calculateNullsFraction(statisticsSample, queriedPartitionsCount, columnName, rowCount); - - if (isLowHighSupportedForType(prestoType)) { - List ranges = statisticsSample.values().stream() - .map(PartitionStatistics::getColumnStatistics) - .filter(stats -> stats.containsKey(columnName)) - .map(stats -> stats.get(columnName)) - .map(stats -> getMinMaxAsPrestoNativeValues(stats, prestoType, timeZone)) - .collect(toImmutableList()); - - // TODO[lo] Maybe we do not want to expose high/low value if it is based on too small fraction of - // partitions. And return unknown if most of the partitions we are working with do not have - // statistics computed. 
- lowValueCandidates = ranges.stream() - .filter(range -> range.getMin().isPresent()) - .map(range -> range.getMin().get()) - .collect(toImmutableList()); - highValueCandidates = ranges.stream() - .filter(range -> range.getMax().isPresent()) - .map(range -> range.getMax().get()) - .collect(toImmutableList()); - } - dataSize = calculateDataSize(statisticsSample, columnName, rowCount); + List partitionsSample = getPartitionsSample(partitions, sampleSize); + try { + Map statisticsSample = statisticsProvider.getPartitionsStatistics(table, partitionsSample); + validatePartitionStatistics(table, statisticsSample); + return getTableStatistics(columns, columnTypes, partitions, statisticsSample); + } + catch (PrestoException e) { + if (e.getErrorCode().equals(HIVE_CORRUPTED_COLUMN_STATISTICS.toErrorCode()) && isIgnoreCorruptedStatistics(session)) { + log.error(e); + return TableStatistics.empty(); } + throw e; + } + } - Comparator comparator = (leftValue, rightValue) -> { - Block leftBlock = nativeValueToBlock(prestoType, leftValue); - Block rightBlock = nativeValueToBlock(prestoType, rightValue); - return prestoType.compareTo(leftBlock, 0, rightBlock, 0); - }; - Optional min = lowValueCandidates.stream().min(comparator); - Optional max = highValueCandidates.stream().max(comparator); - if (min.isPresent() && max.isPresent()) { - columnStatistics.setRange(createPrestoRange(prestoType, min.get(), max.get())); + @VisibleForTesting + static List getPartitionsSample(List partitions, int sampleSize) + { + checkArgument(sampleSize > 0, "sampleSize is expected to be greater than zero"); + + if (partitions.size() <= sampleSize) { + return partitions; + } + + List result = new ArrayList<>(); + + int samplesLeft = sampleSize; + + HivePartition min = partitions.get(0); + HivePartition max = partitions.get(0); + for (HivePartition partition : partitions) { + if (partition.getPartitionId().compareTo(min.getPartitionId()) < 0) { + min = partition; } - 
columnStatistics.setDataSize(dataSize); + else if (partition.getPartitionId().compareTo(max.getPartitionId()) > 0) { + max = partition; + } + } - columnStatistics.setNullsFraction(nullsFraction); - tableStatistics.setColumnStatistics(hiveColumnHandle, columnStatistics.build()); + result.add(min); + samplesLeft--; + if (samplesLeft > 0) { + result.add(max); + samplesLeft--; } - return tableStatistics.build(); + + if (samplesLeft > 0) { + HashFunction hashFunction = murmur3_128(); + Comparator> hashComparator = Comparator + ., Long>comparing(Map.Entry::getValue) + .thenComparing(entry -> entry.getKey().getPartitionId()); + partitions.stream() + .filter(partition -> !result.contains(partition)) + .map(partition -> immutableEntry(partition, hashFunction.hashUnencodedChars(partition.getPartitionId()).asLong())) + .sorted(hashComparator) + .limit(samplesLeft) + .forEachOrdered(entry -> result.add(entry.getKey())); + } + + return unmodifiableList(result); } - private boolean isLowHighSupportedForType(Type type) + @VisibleForTesting + static void validatePartitionStatistics(SchemaTableName table, Map partitionStatistics) { - if (type instanceof DecimalType) { - return true; - } - if (type.equals(TINYINT) - || type.equals(SMALLINT) - || type.equals(INTEGER) - || type.equals(BIGINT) - || type.equals(REAL) - || type.equals(DOUBLE) - || type.equals(DATE)) { - return true; - } - return false; + partitionStatistics.forEach((partition, statistics) -> { + HiveBasicStatistics basicStatistics = statistics.getBasicStatistics(); + OptionalLong rowCount = basicStatistics.getRowCount(); + rowCount.ifPresent(count -> checkStatistics(count >= 0, table, partition, "rowCount must be greater than or equal to zero: %s", count)); + basicStatistics.getFileCount().ifPresent(count -> checkStatistics(count >= 0, table, partition, "fileCount must be greater than or equal to zero: %s", count)); + basicStatistics.getInMemoryDataSizeInBytes().ifPresent(size -> checkStatistics(size >= 0, table, 
partition, "inMemoryDataSizeInBytes must be greater than or equal to zero: %s", size)); + basicStatistics.getOnDiskDataSizeInBytes().ifPresent(size -> checkStatistics(size >= 0, table, partition, "onDiskDataSizeInBytes must be greater than or equal to zero: %s", size)); + statistics.getColumnStatistics().forEach((column, columnStatistics) -> validateColumnStatistics(table, partition, column, rowCount, columnStatistics)); + }); } - public static DoubleRange createPrestoRange(Type type, Object min, Object max) + private static void validateColumnStatistics(SchemaTableName table, String partition, String column, OptionalLong rowCount, HiveColumnStatistics columnStatistics) { - return new DoubleRange(convertPrestoValueToStatsRepresentation(type, min), convertPrestoValueToStatsRepresentation(type, max)); + columnStatistics.getMaxValueSizeInBytes().ifPresent(maxValueSizeInBytes -> + checkStatistics(maxValueSizeInBytes >= 0, table, partition, column, "maxValueSizeInBytes must be greater than or equal to zero: %s", maxValueSizeInBytes)); + columnStatistics.getTotalSizeInBytes().ifPresent(totalSizeInBytes -> + checkStatistics(totalSizeInBytes >= 0, table, partition, column, "totalSizeInBytes must be greater than or equal to zero: %s", totalSizeInBytes)); + columnStatistics.getNullsCount().ifPresent(nullsCount -> { + checkStatistics(nullsCount >= 0, table, partition, column, "nullsCount must be greater than or equal to zero: %s", nullsCount); + if (rowCount.isPresent()) { + checkStatistics( + nullsCount <= rowCount.getAsLong(), + table, + partition, + column, + "nullsCount must be less than or equal to rowCount. nullsCount: %s. 
rowCount: %s.", + nullsCount, + rowCount.getAsLong()); + } + }); + columnStatistics.getDistinctValuesCount().ifPresent(distinctValuesCount -> + checkStatistics(distinctValuesCount >= 0, table, partition, column, "distinctValuesCount must be greater than or equal to zero: %s", distinctValuesCount)); + columnStatistics.getIntegerStatistics().ifPresent(integerStatistics -> { + OptionalLong min = integerStatistics.getMin(); + OptionalLong max = integerStatistics.getMax(); + if (min.isPresent() && max.isPresent()) { + checkStatistics( + min.getAsLong() <= max.getAsLong(), + table, + partition, + column, + "integerStatistics.min must be less than or equal to integerStatistics.max. integerStatistics.min: %s. integerStatistics.max: %s.", + min.getAsLong(), + max.getAsLong()); + } + }); + columnStatistics.getDoubleStatistics().ifPresent(doubleStatistics -> { + OptionalDouble min = doubleStatistics.getMin(); + OptionalDouble max = doubleStatistics.getMax(); + if (min.isPresent() && max.isPresent() && !isNaN(min.getAsDouble()) && !isNaN(max.getAsDouble())) { + checkStatistics( + min.getAsDouble() <= max.getAsDouble(), + table, + partition, + column, + "doubleStatistics.min must be less than or equal to doubleStatistics.max. doubleStatistics.min: %s. doubleStatistics.max: %s.", + min.getAsDouble(), + max.getAsDouble()); + } + }); + columnStatistics.getDecimalStatistics().ifPresent(decimalStatistics -> { + Optional min = decimalStatistics.getMin(); + Optional max = decimalStatistics.getMax(); + if (min.isPresent() && max.isPresent()) { + checkStatistics( + min.get().compareTo(max.get()) <= 0, + table, + partition, + column, + "decimalStatistics.min must be less than or equal to decimalStatistics.max. decimalStatistics.min: %s. 
decimalStatistics.max: %s.", + min.get(), + max.get()); + } + }); + columnStatistics.getDateStatistics().ifPresent(dateStatistics -> { + Optional min = dateStatistics.getMin(); + Optional max = dateStatistics.getMax(); + if (min.isPresent() && max.isPresent()) { + checkStatistics( + min.get().compareTo(max.get()) <= 0, + table, + partition, + column, + "dateStatistics.min must be less than or equal to dateStatistics.max. dateStatistics.min: %s. dateStatistics.max: %s.", + min.get(), + max.get()); + } + }); + columnStatistics.getBooleanStatistics().ifPresent(booleanStatistics -> { + OptionalLong falseCount = booleanStatistics.getFalseCount(); + OptionalLong trueCount = booleanStatistics.getTrueCount(); + falseCount.ifPresent(count -> + checkStatistics(count >= 0, table, partition, column, "falseCount must be greater than or equal to zero: %s", count)); + trueCount.ifPresent(count -> + checkStatistics(count >= 0, table, partition, column, "trueCount must be greater than or equal to zero: %s", count)); + if (rowCount.isPresent() && falseCount.isPresent()) { + checkStatistics( + falseCount.getAsLong() <= rowCount.getAsLong(), + table, + partition, + column, + "booleanStatistics.falseCount must be less than or equal to rowCount. booleanStatistics.falseCount: %s. rowCount: %s.", + falseCount.getAsLong(), + rowCount.getAsLong()); + } + if (rowCount.isPresent() && trueCount.isPresent()) { + checkStatistics( + trueCount.getAsLong() <= rowCount.getAsLong(), + table, + partition, + column, + "booleanStatistics.trueCount must be less than or equal to rowCount. booleanStatistics.trueCount: %s. rowCount: %s.", + trueCount.getAsLong(), + rowCount.getAsLong()); + } + }); } - private static double convertPrestoValueToStatsRepresentation(Type type, Object value) + private static void checkStatistics(boolean expression, SchemaTableName table, String partition, String column, String message, Object... 
args) { - if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT)) { - return (Long) value; + if (!expression) { + throw new PrestoException( + HIVE_CORRUPTED_COLUMN_STATISTICS, + format("Corrupted partition statistics (Table: %s Partition: [%s] Column: %s): %s", table, partition, column, format(message, args))); } - if (type.equals(DOUBLE)) { - return (Double) value; + } + + private static void checkStatistics(boolean expression, SchemaTableName table, String partition, String message, Object... args) + { + if (!expression) { + throw new PrestoException( + HIVE_CORRUPTED_COLUMN_STATISTICS, + format("Corrupted partition statistics (Table: %s Partition: [%s]): %s", table, partition, format(message, args))); } - if (type.equals(REAL)) { - return intBitsToFloat(((Long) value).intValue()); + } + + private static TableStatistics getTableStatistics( + Map columns, + Map columnTypes, + List partitions, + Map statistics) + { + if (statistics.isEmpty()) { + return TableStatistics.empty(); } - if (type instanceof DecimalType) { - DecimalType decimalType = (DecimalType) type; - if (isShortDecimal(decimalType)) { - return parseDouble(Decimals.toString((Long) value, decimalType.getScale())); + + checkArgument(!partitions.isEmpty(), "partitions is empty"); + + OptionalDouble optionalAverageRowsPerPartition = calculateAverageRowsPerPartition(statistics.values()); + if (!optionalAverageRowsPerPartition.isPresent()) { + return TableStatistics.empty(); + } + double averageRowsPerPartition = optionalAverageRowsPerPartition.getAsDouble(); + verify(averageRowsPerPartition >= 0, "averageRowsPerPartition must be greater than or equal to zero"); + int queriedPartitionsCount = partitions.size(); + double rowCount = averageRowsPerPartition * queriedPartitionsCount; + + TableStatistics.Builder result = TableStatistics.builder(); + result.setRowCount(Estimate.of(rowCount)); + for (Map.Entry column : columns.entrySet()) { + String columnName = 
column.getKey(); + HiveColumnHandle columnHandle = (HiveColumnHandle) column.getValue(); + Type columnType = columnTypes.get(columnName); + ColumnStatistics columnStatistics; + if (columnHandle.isPartitionKey()) { + columnStatistics = createPartitionColumnStatistics(columnHandle, columnType, partitions, statistics, averageRowsPerPartition, rowCount); } - if (isLongDecimal(decimalType)) { - return parseDouble(Decimals.toString((Slice) value, decimalType.getScale())); + else { + columnStatistics = createDataColumnStatistics(columnName, columnType, rowCount, statistics.values()); } - throw new IllegalArgumentException("Unexpected decimal type: " + decimalType); + result.setColumnStatistics(columnHandle, columnStatistics); } - if (type.equals(DATE)) { - return (Long) value; - } - throw new IllegalArgumentException("Unsupported type: " + type); + return result.build(); } - private OptionalDouble calculateRowsPerPartition(Map statisticsSample) + @VisibleForTesting + static OptionalDouble calculateAverageRowsPerPartition(Collection statistics) { - return statisticsSample.values().stream() + return statistics.stream() .map(PartitionStatistics::getBasicStatistics) .map(HiveBasicStatistics::getRowCount) .filter(OptionalLong::isPresent) .mapToLong(OptionalLong::getAsLong) + .peek(count -> verify(count >= 0, "count must be greater than or equal to zero")) .average(); } - private Estimate calculateRowsCount(OptionalDouble rowsPerPartition, int queriedPartitionsCount) + private static ColumnStatistics createPartitionColumnStatistics( + HiveColumnHandle column, + Type type, + List partitions, + Map statistics, + double averageRowsPerPartition, + double rowCount) { - if (!rowsPerPartition.isPresent()) { - return Estimate.unknown(); + return ColumnStatistics.builder() + .setDistinctValuesCount(Estimate.of(calculateDistinctPartitionKeys(column, partitions, statistics, averageRowsPerPartition))) + .setNullsFraction(Estimate.of(calculateNullsFractionForPartitioningKey(column, 
partitions, statistics, averageRowsPerPartition, rowCount))) + .setRange(calculateRangeForPartitioningKey(column, type, partitions)) + .setDataSize(calculateDataSizeForPartitioningKey(column, type, partitions, statistics, averageRowsPerPartition)) + .build(); + } + + @VisibleForTesting + static long calculateDistinctPartitionKeys( + HiveColumnHandle column, + List partitions, + Map statistics, + double averageRowsPerPartition) + { + return partitions.stream() + // consider only non empty partitions + .filter(partition -> getPartitionRowCount(partition.getPartitionId(), statistics).orElse(averageRowsPerPartition) > 0) + .map(partition -> partition.getKeys().get(column)) + .filter(value -> !value.isNull()) + .distinct() + .count(); + } + + @VisibleForTesting + static double calculateNullsFractionForPartitioningKey( + HiveColumnHandle column, + List partitions, + Map statistics, + double averageRowsPerPartition, + double rowCount) + { + if (rowCount == 0) { + return 0; } - return Estimate.of(rowsPerPartition.getAsDouble() * queriedPartitionsCount); + double estimatedNullsCount = partitions.stream() + .filter(partition -> partition.getKeys().get(column).isNull()) + .map(HivePartition::getPartitionId) + .mapToDouble(partitionName -> getPartitionRowCount(partitionName, statistics).orElse(averageRowsPerPartition)) + .sum(); + return normalizeFraction(estimatedNullsCount / rowCount); } - private Estimate calculateDistinctValuesCount(Map statisticsSample, String column) + private static double normalizeFraction(double fraction) { - return summarizePartitionStatistics( - statisticsSample.values(), - column, - columnStatistics -> { - if (columnStatistics.getDistinctValuesCount().isPresent()) { - return OptionalDouble.of(columnStatistics.getDistinctValuesCount().getAsLong()); - } - if (columnStatistics.getBooleanStatistics().isPresent() && - columnStatistics.getBooleanStatistics().get().getFalseCount().isPresent() && - 
columnStatistics.getBooleanStatistics().get().getTrueCount().isPresent()) { - long falseCount = columnStatistics.getBooleanStatistics().get().getFalseCount().getAsLong(); - long trueCount = columnStatistics.getBooleanStatistics().get().getTrueCount().getAsLong(); - return OptionalDouble.of((falseCount > 0 ? 1 : 0) + (trueCount > 0 ? 1 : 0)); - } - return OptionalDouble.empty(); - }, - DoubleStream::max); + checkArgument(!isNaN(fraction), "fraction is NaN"); + checkArgument(isFinite(fraction), "fraction must be finite"); + if (fraction < 0) { + return 0; + } + if (fraction > 1) { + return 1; + } + return fraction; } - private Estimate calculateNullsFraction(Map statisticsSample, int totalPartitionsCount, String column, Estimate rowCount) + @VisibleForTesting + static Estimate calculateDataSizeForPartitioningKey( + HiveColumnHandle column, + Type type, + List partitions, + Map statistics, + double averageRowsPerPartition) { - if (rowCount.isUnknown()) { + if (!isVarcharType(type) && !isCharType(type)) { return Estimate.unknown(); } - if (rowCount.getValue() == 0.0) { - return Estimate.zero(); + double dataSize = 0; + for (HivePartition partition : partitions) { + int length = getSize(partition.getKeys().get(column)); + double rowCount = getPartitionRowCount(partition.getPartitionId(), statistics).orElse(averageRowsPerPartition); + dataSize += length * rowCount; } + return Estimate.of(dataSize); + } - Estimate totalNullsCount = summarizePartitionStatistics( - statisticsSample.values(), - column, - columnStatistics -> { - if (!columnStatistics.getNullsCount().isPresent()) { - return OptionalDouble.empty(); - } - return OptionalDouble.of(columnStatistics.getNullsCount().getAsLong()); - }, - nullsCountStream -> { - double nullsCount = 0; - long partitionsWithStatisticsCount = 0; - for (PrimitiveIterator.OfDouble nullsCountIterator = nullsCountStream.iterator(); nullsCountIterator.hasNext(); ) { - nullsCount += nullsCountIterator.nextDouble(); - 
partitionsWithStatisticsCount++; - } - - if (partitionsWithStatisticsCount == 0) { - return OptionalDouble.empty(); - } - return OptionalDouble.of(totalPartitionsCount / partitionsWithStatisticsCount * nullsCount); - }); + private static int getSize(NullableValue nullableValue) + { + if (nullableValue.isNull()) { + return 0; + } + Object value = nullableValue.getValue(); + checkArgument(value instanceof Slice, "value is expected to be of Slice type"); + return ((Slice) value).length(); + } - if (totalNullsCount.isUnknown()) { - return Estimate.unknown(); + private static OptionalDouble getPartitionRowCount(String partitionName, Map statistics) + { + PartitionStatistics partitionStatistics = statistics.get(partitionName); + if (partitionStatistics == null) { + return OptionalDouble.empty(); + } + OptionalLong rowCount = partitionStatistics.getBasicStatistics().getRowCount(); + if (rowCount.isPresent()) { + verify(rowCount.getAsLong() >= 0, "rowCount must be greater than or equal to zero"); + return OptionalDouble.of(rowCount.getAsLong()); } - return Estimate.of(totalNullsCount.getValue() / rowCount.getValue()); + return OptionalDouble.empty(); } - private Estimate calculateDataSize(Map statisticsSample, String columnName, Estimate rowCount) + @VisibleForTesting + static Optional calculateRangeForPartitioningKey(HiveColumnHandle column, Type type, List partitions) { - if (rowCount.isUnknown()) { - return Estimate.unknown(); + if (!isRangeSupported(type)) { + return Optional.empty(); } - int knownPartitionCount = 0; - double knownRowCount = 0; - double knownDataSize = 0; + List values = partitions.stream() + .map(HivePartition::getKeys) + .map(keys -> keys.get(column)) + .filter(value -> !value.isNull()) + .map(NullableValue::getValue) + .map(value -> convertPartitionValueToDouble(type, value)) + .collect(toImmutableList()); - for (PartitionStatistics statistics : statisticsSample.values()) { - if (!statistics.getBasicStatistics().getRowCount().isPresent()) { - 
continue; - } - double partitionRowCount = statistics.getBasicStatistics().getRowCount().getAsLong(); + if (values.isEmpty()) { + return Optional.empty(); + } - HiveColumnStatistics partitionColumnStatistics = statistics.getColumnStatistics().get(columnName); - if (partitionColumnStatistics == null || !partitionColumnStatistics.getTotalSizeInBytes().isPresent()) { - continue; - } + double min = values.get(0); + double max = values.get(0); - knownPartitionCount++; - knownRowCount += partitionRowCount; - // Note: average column length from Hive might not translate directly into internal data size - knownDataSize += partitionColumnStatistics.getTotalSizeInBytes().getAsLong(); + for (Double value : values) { + if (value > max) { + max = value; + } + if (value < min) { + min = value; + } } - if (knownPartitionCount == 0) { - return Estimate.unknown(); + return Optional.of(new DoubleRange(min, max)); + } + + @VisibleForTesting + static double convertPartitionValueToDouble(Type type, Object value) + { + if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT)) { + return (Long) value; + } + if (type.equals(DOUBLE)) { + return (Double) value; } + if (type.equals(REAL)) { + return intBitsToFloat(((Long) value).intValue()); + } + if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType) type; + if (isShortDecimal(decimalType)) { + return parseDouble(Decimals.toString((Long) value, decimalType.getScale())); + } + if (isLongDecimal(decimalType)) { + return parseDouble(Decimals.toString((Slice) value, decimalType.getScale())); + } + throw new IllegalArgumentException("Unexpected decimal type: " + decimalType); + } + if (type.equals(DATE)) { + return (Long) value; + } + throw new IllegalArgumentException("Unexpected type: " + type); + } - if (knownDataSize == 0) { - return Estimate.zero(); + @VisibleForTesting + static ColumnStatistics createDataColumnStatistics(String column, Type type, double rowsCount, Collection 
partitionStatistics) + { + List columnStatistics = partitionStatistics.stream() + .map(PartitionStatistics::getColumnStatistics) + .map(statistics -> statistics.get(column)) + .filter(Objects::nonNull) + .collect(toImmutableList()); + + if (columnStatistics.isEmpty()) { + return ColumnStatistics.empty(); } - verify(knownRowCount > 0); - return Estimate.of(knownDataSize / knownRowCount * rowCount.getValue()); + return ColumnStatistics.builder() + .setDistinctValuesCount(calculateDistinctValuesCount(columnStatistics)) + .setNullsFraction(calculateNullsFraction(column, partitionStatistics)) + .setDataSize(calculateDataSize(column, partitionStatistics, rowsCount)) + .setRange(calculateRange(type, columnStatistics)) + .build(); } - private Estimate countDistinctPartitionKeys(HiveColumnHandle partitionColumn, List partitions) + @VisibleForTesting + static Estimate calculateDistinctValuesCount(List columnStatistics) { - return Estimate.of(partitions.stream() - .map(HivePartition::getKeys) - .map(keys -> keys.get(partitionColumn)) - .distinct() - .count()); + return columnStatistics.stream() + .map(MetastoreHiveStatisticsProvider::getDistinctValuesCount) + .filter(OptionalLong::isPresent) + .map(OptionalLong::getAsLong) + .peek(distinctValuesCount -> verify(distinctValuesCount >= 0, "distinctValuesCount must be greater than or equal to zero")) + .max(Long::compare) + .map(Estimate::of) + .orElse(Estimate.unknown()); } - private Estimate calculateNullsFractionForPartitioningKey( - HiveColumnHandle partitionColumn, - List queriedPartitions, - Map statisticsSample, - Estimate rowCount, - OptionalDouble rowsPerPartition) + private static OptionalLong getDistinctValuesCount(HiveColumnStatistics statistics) { - if (rowCount.isUnknown()) { - return Estimate.unknown(); + if (statistics.getBooleanStatistics().isPresent() && + statistics.getBooleanStatistics().get().getFalseCount().isPresent() && + statistics.getBooleanStatistics().get().getTrueCount().isPresent()) { + long 
falseCount = statistics.getBooleanStatistics().get().getFalseCount().getAsLong(); + long trueCount = statistics.getBooleanStatistics().get().getTrueCount().getAsLong(); + return OptionalLong.of((falseCount > 0 ? 1 : 0) + (trueCount > 0 ? 1 : 0)); } - if (rowCount.getValue() == 0.0) { - return Estimate.zero(); + if (statistics.getDistinctValuesCount().isPresent()) { + return statistics.getDistinctValuesCount(); } - if (!rowsPerPartition.isPresent()) { + return OptionalLong.empty(); + } + + @VisibleForTesting + static Estimate calculateNullsFraction(String column, Collection partitionStatistics) + { + List statisticsWithKnownRowCountAndNullsCount = partitionStatistics.stream() + .filter(statistics -> { + if (!statistics.getBasicStatistics().getRowCount().isPresent()) { + return false; + } + HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column); + if (columnStatistics == null) { + return false; + } + return columnStatistics.getNullsCount().isPresent(); + }) + .collect(toImmutableList()); + + if (statisticsWithKnownRowCountAndNullsCount.isEmpty()) { return Estimate.unknown(); } - double estimatedNullsCount = queriedPartitions.stream() - .filter(partition -> partition.getKeys().get(partitionColumn).isNull()) - .map(HivePartition::getPartitionId) - .mapToDouble(partitionId -> orElse(statisticsSample.get(partitionId).getBasicStatistics().getRowCount(), rowsPerPartition.getAsDouble())) - .sum(); - return Estimate.of(estimatedNullsCount / rowCount.getValue()); + long totalNullsCount = 0; + long totalRowCount = 0; + for (PartitionStatistics statistics : statisticsWithKnownRowCountAndNullsCount) { + long rowCount = statistics.getBasicStatistics().getRowCount().orElseThrow(() -> new VerifyException("rowCount is not present")); + verify(rowCount >= 0, "rowCount must be greater than or equal to zero"); + HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column); + verify(columnStatistics != null, "columnStatistics is 
null"); + long nullsCount = columnStatistics.getNullsCount().orElseThrow(() -> new VerifyException("nullsCount is not present")); + verify(nullsCount >= 0, "nullsCount must be greater than or equal to zero"); + verify(nullsCount <= rowCount, "nullsCount must be less than or equal to rowCount. nullsCount: %s. rowCount: %s.", nullsCount, rowCount); + totalNullsCount += nullsCount; + totalRowCount += rowCount; + } + + if (totalRowCount == 0) { + return Estimate.zero(); + } + + verify( + totalNullsCount <= totalRowCount, + "totalNullsCount must be less than or equal to totalRowCount. totalNullsCount: %s. totalRowCount: %s.", + totalNullsCount, + totalRowCount); + return Estimate.of(((double) totalNullsCount) / totalRowCount); } - private Estimate calculateDataSizeForPartitioningKey( - HiveColumnHandle partitionColumn, - List queriedPartitions, - Map statisticsSample, - Estimate rowCount, - OptionalDouble rowsPerPartition) + @VisibleForTesting + static Estimate calculateDataSize(String column, Collection partitionStatistics, double totalRowCount) { - if (rowCount.isUnknown() || !rowsPerPartition.isPresent()) { - return Estimate.unknown(); - } + List statisticsWithKnownRowCountAndDataSize = partitionStatistics.stream() + .filter(statistics -> { + if (!statistics.getBasicStatistics().getRowCount().isPresent()) { + return false; + } + HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column); + if (columnStatistics == null) { + return false; + } + return columnStatistics.getTotalSizeInBytes().isPresent(); + }) + .collect(toImmutableList()); - String baseType = partitionColumn.getTypeSignature().getBase(); - if (!VARCHAR.equals(baseType) && !CHAR.equalsIgnoreCase(baseType)) { - // TODO support VARBINARY + if (statisticsWithKnownRowCountAndDataSize.isEmpty()) { return Estimate.unknown(); } - double knownRowCount = 0; - double knownDataSize = 0; - - for (HivePartition partition : queriedPartitions) { - NullableValue value = 
partition.getKeys().get(partitionColumn); - int length = value.isNull() ? 0 : ((Slice) value.getValue()).length(); + long knownRowCount = 0; + long knownDataSize = 0; + for (PartitionStatistics statistics : statisticsWithKnownRowCountAndDataSize) { + long rowCount = statistics.getBasicStatistics().getRowCount().orElseThrow(() -> new VerifyException("rowCount is not present")); + verify(rowCount >= 0, "rowCount must be greater than or equal to zero"); + HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column); + verify(columnStatistics != null, "columnStatistics is null"); + long dataSize = columnStatistics.getTotalSizeInBytes().orElseThrow(() -> new VerifyException("totalSizeInBytes is not present")); + verify(dataSize >= 0, "dataSize must be greater than or equal to zero"); + knownRowCount += rowCount; + knownDataSize += dataSize; + } - double partitionRowCount = orElse( - Optional.ofNullable(statisticsSample.get(partition.getPartitionId())) - .orElseGet(PartitionStatistics::empty) - .getBasicStatistics() - .getRowCount(), - rowsPerPartition.getAsDouble()); - knownRowCount += partitionRowCount; - knownDataSize += length * partitionRowCount; + if (totalRowCount == 0) { + return Estimate.zero(); } if (knownRowCount == 0) { return Estimate.unknown(); } - return Estimate.of(knownDataSize / knownRowCount * rowCount.getValue()); + double averageValueDataSizeInBytes = ((double) knownDataSize) / knownRowCount; + return Estimate.of(averageValueDataSizeInBytes * totalRowCount); } - private Estimate summarizePartitionStatistics( - Collection partitionStatistics, - String column, - Function valueExtractFunction, - Function valueAggregateFunction) + @VisibleForTesting + static Optional calculateRange(Type type, List columnStatistics) { - DoubleStream intermediateStream = partitionStatistics.stream() - .map(PartitionStatistics::getColumnStatistics) - .filter(stats -> stats.containsKey(column)) - .map(stats -> stats.get(column)) - 
.map(valueExtractFunction) - .filter(OptionalDouble::isPresent) - .mapToDouble(OptionalDouble::getAsDouble); - - OptionalDouble statisticsValue = valueAggregateFunction.apply(intermediateStream); + if (!isRangeSupported(type)) { + return Optional.empty(); + } + return columnStatistics.stream() + .map(statistics -> createRange(type, statistics)) + .filter(Optional::isPresent) + .map(Optional::get) + .reduce(DoubleRange::union); + } - if (!statisticsValue.isPresent()) { - return Estimate.unknown(); - } - return Estimate.of(statisticsValue.getAsDouble()); + private static boolean isRangeSupported(Type type) + { + return type.equals(TINYINT) + || type.equals(SMALLINT) + || type.equals(INTEGER) + || type.equals(BIGINT) + || type.equals(REAL) + || type.equals(DOUBLE) + || type.equals(DATE) + || type instanceof DecimalType; } - private Map getPartitionsStatistics(HiveTableHandle tableHandle, List hivePartitions) + private static Optional createRange(Type type, HiveColumnStatistics statistics) { - if (hivePartitions.isEmpty()) { - return ImmutableMap.of(); + if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT)) { + return statistics.getIntegerStatistics().flatMap(integerStatistics -> createIntegerRange(type, integerStatistics)); } - boolean unpartitioned = hivePartitions.stream().anyMatch(partition -> partition.getPartitionId().equals(HivePartition.UNPARTITIONED_ID)); - if (unpartitioned) { - checkArgument(hivePartitions.size() == 1, "expected only one hive partition"); + if (type.equals(DOUBLE) || type.equals(REAL)) { + return statistics.getDoubleStatistics().flatMap(MetastoreHiveStatisticsProvider::createDoubleRange); } - - if (unpartitioned) { - return ImmutableMap.of(HivePartition.UNPARTITIONED_ID, metastore.getTableStatistics(tableHandle.getSchemaName(), tableHandle.getTableName())); + if (type.equals(DATE)) { + return statistics.getDateStatistics().flatMap(MetastoreHiveStatisticsProvider::createDateRange); } - else { - 
return metastore.getPartitionStatistics( - tableHandle.getSchemaName(), - tableHandle.getTableName(), - hivePartitions.stream() - .map(HivePartition::getPartitionId) - .collect(toImmutableSet())); + if (type instanceof DecimalType) { + return statistics.getDecimalStatistics().flatMap(MetastoreHiveStatisticsProvider::createDecimalRange); } + throw new IllegalArgumentException("Unexpected type: " + type); } - @VisibleForTesting - static List getPartitionsSample(List partitions, int sampleSize) + private static Optional createIntegerRange(Type type, IntegerStatistics statistics) { - checkArgument(sampleSize > 0, "sampleSize is expected to be greater than zero"); - - if (partitions.size() <= sampleSize) { - return partitions; + if (statistics.getMin().isPresent() && statistics.getMax().isPresent()) { + return Optional.of(createIntegerRange(type, statistics.getMin().getAsLong(), statistics.getMax().getAsLong())); } + return Optional.empty(); + } - List result = new ArrayList<>(); - - int samplesLeft = sampleSize; + private static DoubleRange createIntegerRange(Type type, long min, long max) + { + return new DoubleRange(normalizeIntegerValue(type, min), normalizeIntegerValue(type, max)); + } - HivePartition min = partitions.get(0); - HivePartition max = partitions.get(0); - for (HivePartition partition : partitions) { - if (partition.getPartitionId().compareTo(min.getPartitionId()) < 0) { - min = partition; - } - else if (partition.getPartitionId().compareTo(max.getPartitionId()) > 0) { - max = partition; - } + private static long normalizeIntegerValue(Type type, long value) + { + if (type.equals(BIGINT)) { + return value; } - - result.add(min); - samplesLeft--; - if (samplesLeft > 0) { - result.add(max); - samplesLeft--; + if (type.equals(INTEGER)) { + return Ints.saturatedCast(value); + } + if (type.equals(SMALLINT)) { + return Shorts.saturatedCast(value); + } + if (type.equals(TINYINT)) { + return SignedBytes.saturatedCast(value); } + throw new 
IllegalArgumentException("Unexpected type: " + type); + } - if (samplesLeft > 0) { - HashFunction hashFunction = murmur3_128(); - Comparator> hashComparator = Comparator - ., Long>comparing(Map.Entry::getValue) - .thenComparing(entry -> entry.getKey().getPartitionId()); - partitions.stream() - .filter(partition -> !result.contains(partition)) - .map(partition -> immutableEntry(partition, hashFunction.hashUnencodedChars(partition.getPartitionId()).asLong())) - .sorted(hashComparator) - .limit(samplesLeft) - .forEachOrdered(entry -> result.add(entry.getKey())); + private static Optional createDoubleRange(DoubleStatistics statistics) + { + if (statistics.getMin().isPresent() && statistics.getMax().isPresent() && !isNaN(statistics.getMin().getAsDouble()) && !isNaN(statistics.getMax().getAsDouble())) { + return Optional.of(new DoubleRange(statistics.getMin().getAsDouble(), statistics.getMax().getAsDouble())); } + return Optional.empty(); + } - return unmodifiableList(result); + private static Optional createDateRange(DateStatistics statistics) + { + if (statistics.getMin().isPresent() && statistics.getMax().isPresent()) { + return Optional.of(new DoubleRange(statistics.getMin().get().toEpochDay(), statistics.getMax().get().toEpochDay())); + } + return Optional.empty(); } - private static double orElse(OptionalLong value, double other) + private static Optional createDecimalRange(DecimalStatistics statistics) { - if (value.isPresent()) { - return value.getAsLong(); + if (statistics.getMin().isPresent() && statistics.getMax().isPresent()) { + return Optional.of(new DoubleRange(statistics.getMin().get().doubleValue(), statistics.getMax().get().doubleValue())); } - return other; + return Optional.empty(); + } + + @VisibleForTesting + interface PartitionsStatisticsProvider + { + Map getPartitionsStatistics(SchemaTableName table, List hivePartitions); } } diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/util/Statistics.java 
b/presto-hive/src/main/java/com/facebook/presto/hive/util/Statistics.java index 68b1445253842..8614333ed050d 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/util/Statistics.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/util/Statistics.java @@ -28,7 +28,6 @@ import com.facebook.presto.spi.statistics.ColumnStatisticType; import com.facebook.presto.spi.statistics.ComputedStatistics; import com.facebook.presto.spi.type.DecimalType; -import com.facebook.presto.spi.type.Decimals; import com.facebook.presto.spi.type.SqlDate; import com.facebook.presto.spi.type.SqlDecimal; import com.facebook.presto.spi.type.Type; @@ -36,7 +35,6 @@ import org.joda.time.DateTimeZone; import java.math.BigDecimal; -import java.math.BigInteger; import java.time.LocalDate; import java.util.Collection; import java.util.HashMap; @@ -72,8 +70,6 @@ import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.ImmutableMap.toImmutableMap; import static com.google.common.collect.Sets.intersection; -import static java.lang.Float.floatToRawIntBits; -import static java.util.Objects.requireNonNull; import static java.util.concurrent.TimeUnit.MILLISECONDS; public final class Statistics @@ -175,23 +171,6 @@ private static Optional mergeBooleanStatistics(Optional> T min(T first, T second) return first.compareTo(second) <= 0 ? 
first : second; } - public static Range getMinMaxAsPrestoNativeValues(HiveColumnStatistics statistics, Type type, DateTimeZone timeZone) - { - if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT)) { - return statistics.getIntegerStatistics().map(integerStatistics -> Range.create( - integerStatistics.getMin(), - integerStatistics.getMax())) - .orElse(Range.empty()); - } - if (type.equals(DOUBLE)) { - return statistics.getDoubleStatistics().map(doubleStatistics -> Range.create( - doubleStatistics.getMin(), - doubleStatistics.getMax())) - .orElse(Range.empty()); - } - if (type.equals(REAL)) { - return statistics.getDoubleStatistics().map(doubleStatistics -> Range.create( - boxed(doubleStatistics.getMin()).map(Statistics::floatAsDoubleToLongBits), - boxed(doubleStatistics.getMax()).map(Statistics::floatAsDoubleToLongBits))) - .orElse(Range.empty()); - } - if (type.equals(DATE)) { - return statistics.getDateStatistics().map(dateStatistics -> Range.create( - dateStatistics.getMin().map(LocalDate::toEpochDay), - dateStatistics.getMax().map(LocalDate::toEpochDay))) - .orElse(Range.empty()); - } - if (type.equals(TIMESTAMP)) { - return statistics.getIntegerStatistics().map(integerStatistics -> Range.create( - boxed(integerStatistics.getMin()).map(value -> convertLocalToUtc(timeZone, value)), - boxed(integerStatistics.getMax()).map(value -> convertLocalToUtc(timeZone, value)))) - .orElse(Range.empty()); - } - if (type instanceof DecimalType) { - return statistics.getDecimalStatistics().map(decimalStatistics -> Range.create( - decimalStatistics.getMin().map(value -> encodeDecimal(type, value)), - decimalStatistics.getMax().map(value -> encodeDecimal(type, value)))) - .orElse(Range.empty()); - } - return Range.empty(); - } - - private static long floatAsDoubleToLongBits(double value) - { - return floatToRawIntBits((float) value); - } - - private static long convertLocalToUtc(DateTimeZone timeZone, long value) - { - return 
timeZone.convertLocalToUTC(value * 1000, false); - } - - private static Comparable encodeDecimal(Type type, BigDecimal value) - { - BigInteger unscaled = Decimals.rescale(value, (DecimalType) type).unscaledValue(); - if (Decimals.isShortDecimal(type)) { - return unscaled.longValueExact(); - } - return Decimals.encodeUnscaledValue(unscaled); - } - public static Map, ComputedStatistics> createComputedStatisticsToPartitionMap( Collection computedStatistics, List partitionColumns, @@ -463,16 +382,6 @@ private static Optional getDecimalValue(ConnectorSession session, Ty return block.isNull(0) ? Optional.empty() : Optional.of(((SqlDecimal) type.getObjectValue(session, block, 0)).toBigDecimal()); } - private static Optional boxed(OptionalLong input) - { - return input.isPresent() ? Optional.of(input.getAsLong()) : Optional.empty(); - } - - private static Optional boxed(OptionalDouble input) - { - return input.isPresent() ? Optional.of(input.getAsDouble()) : Optional.empty(); - } - public enum ReduceOperator { ADD, @@ -480,48 +389,4 @@ public enum ReduceOperator MIN, MAX, } - - public static class Range - { - private static final Range EMPTY = new Range(Optional.empty(), Optional.empty()); - - private final Optional> min; - private final Optional> max; - - public static Range empty() - { - return EMPTY; - } - - public static Range create(Optional> min, Optional> max) - { - return new Range(min, max); - } - - public static Range create(OptionalLong min, OptionalLong max) - { - return new Range(boxed(min), boxed(max)); - } - - public static Range create(OptionalDouble min, OptionalDouble max) - { - return new Range(boxed(min), boxed(max)); - } - - public Range(Optional> min, Optional> max) - { - this.min = requireNonNull(min, "min is null"); - this.max = requireNonNull(max, "max is null"); - } - - public Optional> getMin() - { - return min; - } - - public Optional> getMax() - { - return max; - } - } } diff --git 
a/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveClientConfig.java b/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveClientConfig.java index 258d5c57c23ef..9f2f8c3b58fa7 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveClientConfig.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveClientConfig.java @@ -108,6 +108,7 @@ public void testDefaults() .setCreatesOfNonManagedTablesEnabled(true) .setHdfsWireEncryptionEnabled(false) .setPartitionStatisticsSampleSize(100) + .setIgnoreCorruptedStatistics(false) .setCollectColumnStatisticsOnWrite(false)); } @@ -185,6 +186,7 @@ public void testExplicitPropertyMappings() .put("hive.non-managed-table-creates-enabled", "false") .put("hive.hdfs.wire-encryption.enabled", "true") .put("hive.partition-statistics-sample-size", "1234") + .put("hive.ignore-corrupted-statistics", "true") .put("hive.collect-column-statistics-on-write", "true") .build(); @@ -259,6 +261,7 @@ public void testExplicitPropertyMappings() .setCreatesOfNonManagedTablesEnabled(false) .setHdfsWireEncryptionEnabled(true) .setPartitionStatisticsSampleSize(1234) + .setIgnoreCorruptedStatistics(true) .setCollectColumnStatisticsOnWrite(true); ConfigAssertions.assertFullMapping(properties, expected); diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java b/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java index 6f13e49d698c8..68ba391481f3a 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java @@ -13,27 +13,85 @@ */ package com.facebook.presto.hive.statistics; +import com.facebook.presto.hive.HiveBasicStatistics; +import com.facebook.presto.hive.HiveClientConfig; import com.facebook.presto.hive.HiveColumnHandle; import 
com.facebook.presto.hive.HivePartition; +import com.facebook.presto.hive.HiveSessionProperties; +import com.facebook.presto.hive.OrcFileWriterConfig; +import com.facebook.presto.hive.PartitionStatistics; +import com.facebook.presto.hive.metastore.DateStatistics; +import com.facebook.presto.hive.metastore.DecimalStatistics; +import com.facebook.presto.hive.metastore.DoubleStatistics; +import com.facebook.presto.hive.metastore.HiveColumnStatistics; +import com.facebook.presto.hive.metastore.IntegerStatistics; +import com.facebook.presto.spi.PrestoException; import com.facebook.presto.spi.SchemaTableName; +import com.facebook.presto.spi.statistics.ColumnStatistics; +import com.facebook.presto.spi.statistics.DoubleRange; +import com.facebook.presto.spi.statistics.Estimate; +import com.facebook.presto.spi.statistics.TableStatistics; +import com.facebook.presto.spi.type.DecimalType; +import com.facebook.presto.spi.type.Type; +import com.facebook.presto.testing.TestingConnectorSession; import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; import org.joda.time.DateTimeZone; import org.testng.annotations.Test; +import java.math.BigDecimal; +import java.time.LocalDate; import java.util.Optional; +import java.util.OptionalDouble; +import java.util.OptionalLong; import static com.facebook.presto.hive.HiveColumnHandle.ColumnType.PARTITION_KEY; +import static com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR; +import static com.facebook.presto.hive.HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS; +import static com.facebook.presto.hive.HivePartition.UNPARTITIONED_ID; import static com.facebook.presto.hive.HivePartitionManager.parsePartition; import static com.facebook.presto.hive.HiveType.HIVE_LONG; import static com.facebook.presto.hive.HiveType.HIVE_STRING; +import static com.facebook.presto.hive.HiveUtil.parsePartitionValue; +import static com.facebook.presto.hive.metastore.HiveColumnStatistics.createBooleanColumnStatistics; 
+import static com.facebook.presto.hive.metastore.HiveColumnStatistics.createDateColumnStatistics; +import static com.facebook.presto.hive.metastore.HiveColumnStatistics.createDecimalColumnStatistics; +import static com.facebook.presto.hive.metastore.HiveColumnStatistics.createDoubleColumnStatistics; +import static com.facebook.presto.hive.metastore.HiveColumnStatistics.createIntegerColumnStatistics; +import static com.facebook.presto.hive.statistics.MetastoreHiveStatisticsProvider.calculateAverageRowsPerPartition; +import static com.facebook.presto.hive.statistics.MetastoreHiveStatisticsProvider.calculateDataSize; +import static com.facebook.presto.hive.statistics.MetastoreHiveStatisticsProvider.calculateDataSizeForPartitioningKey; +import static com.facebook.presto.hive.statistics.MetastoreHiveStatisticsProvider.calculateDistinctPartitionKeys; +import static com.facebook.presto.hive.statistics.MetastoreHiveStatisticsProvider.calculateDistinctValuesCount; +import static com.facebook.presto.hive.statistics.MetastoreHiveStatisticsProvider.calculateNullsFraction; +import static com.facebook.presto.hive.statistics.MetastoreHiveStatisticsProvider.calculateNullsFractionForPartitioningKey; +import static com.facebook.presto.hive.statistics.MetastoreHiveStatisticsProvider.calculateRange; +import static com.facebook.presto.hive.statistics.MetastoreHiveStatisticsProvider.calculateRangeForPartitioningKey; +import static com.facebook.presto.hive.statistics.MetastoreHiveStatisticsProvider.convertPartitionValueToDouble; +import static com.facebook.presto.hive.statistics.MetastoreHiveStatisticsProvider.createDataColumnStatistics; import static com.facebook.presto.hive.statistics.MetastoreHiveStatisticsProvider.getPartitionsSample; +import static com.facebook.presto.hive.statistics.MetastoreHiveStatisticsProvider.validatePartitionStatistics; import static com.facebook.presto.spi.type.BigintType.BIGINT; +import static com.facebook.presto.spi.type.DateType.DATE; +import static 
com.facebook.presto.spi.type.DecimalType.createDecimalType; +import static com.facebook.presto.spi.type.DoubleType.DOUBLE; +import static com.facebook.presto.spi.type.IntegerType.INTEGER; +import static com.facebook.presto.spi.type.RealType.REAL; +import static com.facebook.presto.spi.type.SmallintType.SMALLINT; +import static com.facebook.presto.spi.type.TinyintType.TINYINT; import static com.facebook.presto.spi.type.VarcharType.VARCHAR; +import static java.lang.Double.NaN; +import static java.lang.String.format; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.testng.Assert.assertEquals; public class TestMetastoreHiveStatisticsProvider { private static final SchemaTableName TABLE = new SchemaTableName("schema", "table"); + private static final String PARTITION = "partition"; + private static final String COLUMN = "column"; + private static final DecimalType DECIMAL = createDecimalType(5, 3); private static final HiveColumnHandle PARTITION_COLUMN_1 = new HiveColumnHandle("p1", HIVE_STRING, VARCHAR.getTypeSignature(), 0, PARTITION_KEY, Optional.empty()); private static final HiveColumnHandle PARTITION_COLUMN_2 = new HiveColumnHandle("p2", HIVE_LONG, BIGINT.getTypeSignature(), 1, PARTITION_KEY, Optional.empty()); @@ -56,8 +114,781 @@ public void testGetPartitionsSample() assertEquals(getPartitionsSample(ImmutableList.of(p1, p2, p3, p4, p5), 3), ImmutableList.of(p1, p5, p4)); } + @Test + public void testValidatePartitionStatistics() + { + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(-1, 0, 0, 0)) + .build(), + invalidPartitionStatistics("fileCount must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, -1, 0, 0)) + .build(), + invalidPartitionStatistics("rowCount must be greater than or equal to zero: -1")); + 
assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, -1, 0)) + .build(), + invalidPartitionStatistics("inMemoryDataSizeInBytes must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, -1)) + .build(), + invalidPartitionStatistics("onDiskDataSizeInBytes must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setMaxValueSizeInBytes(-1).build())) + .build(), + invalidColumnStatistics("maxValueSizeInBytes must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setTotalSizeInBytes(-1).build())) + .build(), + invalidColumnStatistics("totalSizeInBytes must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setNullsCount(-1).build())) + .build(), + invalidColumnStatistics("nullsCount must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setNullsCount(1).build())) + .build(), + invalidColumnStatistics("nullsCount must be less than or equal to rowCount. nullsCount: 1. 
rowCount: 0.")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setDistinctValuesCount(-1).build())) + .build(), + invalidColumnStatistics("distinctValuesCount must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, createIntegerColumnStatistics(OptionalLong.of(1), OptionalLong.of(-1), OptionalLong.empty(), OptionalLong.empty()))) + .build(), + invalidColumnStatistics("integerStatistics.min must be less than or equal to integerStatistics.max. integerStatistics.min: 1. integerStatistics.max: -1.")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, createDoubleColumnStatistics(OptionalDouble.of(1), OptionalDouble.of(-1), OptionalLong.empty(), OptionalLong.empty()))) + .build(), + invalidColumnStatistics("doubleStatistics.min must be less than or equal to doubleStatistics.max. doubleStatistics.min: 1.0. 
doubleStatistics.max: -1.0.")); + validatePartitionStatistics( + TABLE, + ImmutableMap.of( + PARTITION, + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, createDoubleColumnStatistics(OptionalDouble.of(NaN), OptionalDouble.of(NaN), OptionalLong.empty(), OptionalLong.empty()))) + .build())); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, createDecimalColumnStatistics(Optional.of(BigDecimal.valueOf(1)), Optional.of(BigDecimal.valueOf(-1)), OptionalLong.empty(), OptionalLong.empty()))) + .build(), + invalidColumnStatistics("decimalStatistics.min must be less than or equal to decimalStatistics.max. decimalStatistics.min: 1. decimalStatistics.max: -1.")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, createDateColumnStatistics(Optional.of(LocalDate.ofEpochDay(1)), Optional.of(LocalDate.ofEpochDay(-1)), OptionalLong.empty(), OptionalLong.empty()))) + .build(), + invalidColumnStatistics("dateStatistics.min must be less than or equal to dateStatistics.max. dateStatistics.min: 1970-01-02. 
dateStatistics.max: 1969-12-31.")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, createBooleanColumnStatistics(OptionalLong.of(-1), OptionalLong.empty(), OptionalLong.empty()))) + .build(), + invalidColumnStatistics("trueCount must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, createBooleanColumnStatistics(OptionalLong.empty(), OptionalLong.of(-1), OptionalLong.empty()))) + .build(), + invalidColumnStatistics("falseCount must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, createBooleanColumnStatistics(OptionalLong.of(1), OptionalLong.empty(), OptionalLong.empty()))) + .build(), + invalidColumnStatistics("booleanStatistics.trueCount must be less than or equal to rowCount. booleanStatistics.trueCount: 1. rowCount: 0.")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, createBooleanColumnStatistics(OptionalLong.empty(), OptionalLong.of(1), OptionalLong.empty()))) + .build(), + invalidColumnStatistics("booleanStatistics.falseCount must be less than or equal to rowCount. booleanStatistics.falseCount: 1. 
rowCount: 0.")); + } + + @Test + public void testCalculateAverageRowsPerPartition() + { + assertThat(calculateAverageRowsPerPartition(ImmutableList.of())).isEmpty(); + assertThat(calculateAverageRowsPerPartition(ImmutableList.of(PartitionStatistics.empty()))).isEmpty(); + assertThat(calculateAverageRowsPerPartition(ImmutableList.of(PartitionStatistics.empty(), PartitionStatistics.empty()))).isEmpty(); + assertEquals(calculateAverageRowsPerPartition(ImmutableList.of(rowsCount(10))), OptionalDouble.of(10)); + assertEquals(calculateAverageRowsPerPartition(ImmutableList.of(rowsCount(10), PartitionStatistics.empty())), OptionalDouble.of(10)); + assertEquals(calculateAverageRowsPerPartition(ImmutableList.of(rowsCount(10), rowsCount(20))), OptionalDouble.of(15)); + assertEquals(calculateAverageRowsPerPartition(ImmutableList.of(rowsCount(10), rowsCount(20), PartitionStatistics.empty())), OptionalDouble.of(15)); + } + + @Test + public void testCalculateDistinctPartitionKeys() + { + assertEquals(calculateDistinctPartitionKeys(PARTITION_COLUMN_1, ImmutableList.of(), ImmutableMap.of(), 1000), 0); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=string1/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000)), + 2000), + 1); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=string2/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000), "p1=string2/p2=1234", rowsCount(1)), + 2000), + 2); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=string2/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000), "p1=string2/p2=1234", rowsCount(0)), + 2000), + 1); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=string2/p2=1234")), + 
ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000)), + 2000), + 2); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=string2/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000)), + 0), + 1); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_2, + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=string2/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000), "p1=string2/p2=1234", rowsCount(1)), + 2000), + 1); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_2, + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=string1/p2=1235")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000), "p1=string1/p2=1235", rowsCount(1)), + 2000), + 2); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=__HIVE_DEFAULT_PARTITION__/p2=1234"), partition("p1=string1/p2=1235")), + ImmutableMap.of("p1=__HIVE_DEFAULT_PARTITION__/p2=1234", rowsCount(1000), "p1=string1/p2=1235", rowsCount(1)), + 2000), + 1); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_2, + ImmutableList.of(partition("p1=123/p2=__HIVE_DEFAULT_PARTITION__"), partition("p1=string1/p2=1235")), + ImmutableMap.of("p1=123/p2=__HIVE_DEFAULT_PARTITION__", rowsCount(1000), "p1=string1/p2=1235", rowsCount(1)), + 2000), + 1); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_2, + ImmutableList.of(partition("p1=123/p2=__HIVE_DEFAULT_PARTITION__"), partition("p1=string1/p2=__HIVE_DEFAULT_PARTITION__")), + ImmutableMap.of("p1=123/p2=__HIVE_DEFAULT_PARTITION__", rowsCount(1000), "p1=string1/p2=__HIVE_DEFAULT_PARTITION__", rowsCount(1)), + 2000), + 0); + } + + @Test + public void testCalculateNullsFractionForPartitioningKey() + { + assertEquals( + calculateNullsFractionForPartitioningKey( + PARTITION_COLUMN_1, + 
ImmutableList.of(partition("p1=string1/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000)), + 2000, + 0), + 0.0); + assertEquals( + calculateNullsFractionForPartitioningKey( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=string1/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000)), + 2000, + 4000), + 0.0); + assertEquals( + calculateNullsFractionForPartitioningKey( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=__HIVE_DEFAULT_PARTITION__/p2=1234")), + ImmutableMap.of("p1=__HIVE_DEFAULT_PARTITION__/p2=1234", rowsCount(1000)), + 2000, + 4000), + 0.25); + assertEquals( + calculateNullsFractionForPartitioningKey( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=__HIVE_DEFAULT_PARTITION__/p2=1234")), + ImmutableMap.of("p1=__HIVE_DEFAULT_PARTITION__/p2=1234", PartitionStatistics.empty()), + 2000, + 4000), + 0.5); + assertEquals( + calculateNullsFractionForPartitioningKey( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=__HIVE_DEFAULT_PARTITION__/p2=1234")), + ImmutableMap.of(), + 2000, + 4000), + 0.5); + assertEquals( + calculateNullsFractionForPartitioningKey( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=__HIVE_DEFAULT_PARTITION__/p2=1234"), partition("p1=__HIVE_DEFAULT_PARTITION__/p2=4321")), + ImmutableMap.of("p1=__HIVE_DEFAULT_PARTITION__/p2=1234", rowsCount(1000), "p1=__HIVE_DEFAULT_PARTITION__/p2=4321", rowsCount(2000)), + 3000, + 4000), + 0.75); + assertEquals( + calculateNullsFractionForPartitioningKey( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=__HIVE_DEFAULT_PARTITION__/p2=1234"), partition("p1=__HIVE_DEFAULT_PARTITION__/p2=4321")), + ImmutableMap.of("p1=__HIVE_DEFAULT_PARTITION__/p2=1234", rowsCount(1000), "p1=__HIVE_DEFAULT_PARTITION__/p2=4321", PartitionStatistics.empty()), + 3000, + 4000), + 1.0); + assertEquals( + calculateNullsFractionForPartitioningKey( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=__HIVE_DEFAULT_PARTITION__/p2=1234"), 
partition("p1=__HIVE_DEFAULT_PARTITION__/p2=4321")), + ImmutableMap.of("p1=__HIVE_DEFAULT_PARTITION__/p2=1234", rowsCount(1000), "p1=__HIVE_DEFAULT_PARTITION__/p2=4321", PartitionStatistics.empty()), + 4000, + 4000), + 1.0); + } + + @Test + public void testCalculateDataSizeForPartitioningKey() + { + assertEquals( + calculateDataSizeForPartitioningKey( + PARTITION_COLUMN_2, + BIGINT, + ImmutableList.of(partition("p1=string1/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000)), + 2000), + Estimate.unknown()); + assertEquals( + calculateDataSizeForPartitioningKey( + PARTITION_COLUMN_1, + VARCHAR, + ImmutableList.of(partition("p1=string1/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000)), + 2000), + Estimate.of(7000)); + assertEquals( + calculateDataSizeForPartitioningKey( + PARTITION_COLUMN_1, + VARCHAR, + ImmutableList.of(partition("p1=string1/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", PartitionStatistics.empty()), + 2000), + Estimate.of(14000)); + assertEquals( + calculateDataSizeForPartitioningKey( + PARTITION_COLUMN_1, + VARCHAR, + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=str2/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000), "p1=str2/p2=1234", rowsCount(2000)), + 3000), + Estimate.of(15000)); + assertEquals( + calculateDataSizeForPartitioningKey( + PARTITION_COLUMN_1, + VARCHAR, + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=str2/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000), "p1=str2/p2=1234", PartitionStatistics.empty()), + 3000), + Estimate.of(19000)); + assertEquals( + calculateDataSizeForPartitioningKey( + PARTITION_COLUMN_1, + VARCHAR, + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=str2/p2=1234")), + ImmutableMap.of(), + 3000), + Estimate.of(33000)); + assertEquals( + calculateDataSizeForPartitioningKey( + PARTITION_COLUMN_1, + VARCHAR, + ImmutableList.of(partition("p1=__HIVE_DEFAULT_PARTITION__/p2=1234"), 
partition("p1=str2/p2=1234")), + ImmutableMap.of(), + 3000), + Estimate.of(12000)); + } + + @Test + public void testCalculateRangeForPartitioningKey() + { + assertEquals( + calculateRangeForPartitioningKey( + PARTITION_COLUMN_1, + VARCHAR, + ImmutableList.of(partition("p1=string1/p2=1234"))), + Optional.empty()); + assertEquals( + calculateRangeForPartitioningKey( + PARTITION_COLUMN_2, + BIGINT, + ImmutableList.of(partition("p1=string1/p2=__HIVE_DEFAULT_PARTITION__"))), + Optional.empty()); + assertEquals( + calculateRangeForPartitioningKey( + PARTITION_COLUMN_2, + BIGINT, + ImmutableList.of(partition("p1=string1/p2=__HIVE_DEFAULT_PARTITION__"), partition("p1=string1/p2=__HIVE_DEFAULT_PARTITION__"))), + Optional.empty()); + assertEquals( + calculateRangeForPartitioningKey( + PARTITION_COLUMN_2, + BIGINT, + ImmutableList.of(partition("p1=string1/p2=__HIVE_DEFAULT_PARTITION__"), partition("p1=string1/p2=1"))), + Optional.of(new DoubleRange(1, 1))); + assertEquals( + calculateRangeForPartitioningKey( + PARTITION_COLUMN_2, + BIGINT, + ImmutableList.of(partition("p1=string1/p2=2"), partition("p1=string1/p2=1"))), + Optional.of(new DoubleRange(1, 2))); + assertEquals( + calculateRangeForPartitioningKey( + PARTITION_COLUMN_2, + BIGINT, + ImmutableList.of(partition("p1=string1/p2=2"), partition("p1=string1/p2=3"), partition("p1=string1/p2=1"))), + Optional.of(new DoubleRange(1, 3))); + } + + @Test + public void testConvertPartitionValueToDouble() + { + assertConvertPartitionValueToDouble(BIGINT, "123456", 123456); + assertConvertPartitionValueToDouble(INTEGER, "12345", 12345); + assertConvertPartitionValueToDouble(SMALLINT, "1234", 1234); + assertConvertPartitionValueToDouble(TINYINT, "123", 123); + assertConvertPartitionValueToDouble(DOUBLE, "0.1", 0.1); + assertConvertPartitionValueToDouble(REAL, "0.2", (double) (float) 0.2); + assertConvertPartitionValueToDouble(createDecimalType(5, 2), "123.45", 123.45); + assertConvertPartitionValueToDouble(createDecimalType(25, 5), 
"12345678901234567890.12345", 12345678901234567890.12345); + assertConvertPartitionValueToDouble(DATE, "1970-01-02", 1); + } + + private static void assertConvertPartitionValueToDouble(Type type, String value, double expected) + { + Object prestoValue = parsePartitionValue(format("p=%s", value), value, type, DateTimeZone.getDefault()).getValue(); + assertEquals(convertPartitionValueToDouble(type, prestoValue), expected); + } + + @Test + public void testCreateDataColumnStatistics() + { + assertEquals(createDataColumnStatistics(COLUMN, BIGINT, 1000, ImmutableList.of()), ColumnStatistics.empty()); + assertEquals( + createDataColumnStatistics(COLUMN, BIGINT, 1000, ImmutableList.of(PartitionStatistics.empty(), PartitionStatistics.empty())), + ColumnStatistics.empty()); + assertEquals( + createDataColumnStatistics( + COLUMN, + BIGINT, + 1000, + ImmutableList.of(new PartitionStatistics(HiveBasicStatistics.createZeroStatistics(), ImmutableMap.of("column2", HiveColumnStatistics.empty())))), + ColumnStatistics.empty()); + } + + @Test + public void testCalculateDistinctValuesCount() + { + assertEquals(calculateDistinctValuesCount(ImmutableList.of()), Estimate.unknown()); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(HiveColumnStatistics.empty())), Estimate.unknown()); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(HiveColumnStatistics.empty(), HiveColumnStatistics.empty())), Estimate.unknown()); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(distinctValuesCount(1))), Estimate.of(1)); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(distinctValuesCount(1), distinctValuesCount(2))), Estimate.of(2)); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(distinctValuesCount(1), HiveColumnStatistics.empty())), Estimate.of(1)); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(createBooleanColumnStatistics(OptionalLong.empty(), OptionalLong.empty(), OptionalLong.empty()))), Estimate.unknown()); + 
assertEquals(calculateDistinctValuesCount(ImmutableList.of(createBooleanColumnStatistics(OptionalLong.of(1), OptionalLong.of(0), OptionalLong.empty()))), Estimate.of(1)); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(createBooleanColumnStatistics(OptionalLong.of(10), OptionalLong.empty(), OptionalLong.empty()))), Estimate.unknown()); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(createBooleanColumnStatistics(OptionalLong.of(10), OptionalLong.of(10), OptionalLong.empty()))), Estimate.of(2)); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(createBooleanColumnStatistics(OptionalLong.empty(), OptionalLong.of(10), OptionalLong.empty()))), Estimate.unknown()); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(createBooleanColumnStatistics(OptionalLong.of(0), OptionalLong.of(10), OptionalLong.empty()))), Estimate.of(1)); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(createBooleanColumnStatistics(OptionalLong.of(0), OptionalLong.of(0), OptionalLong.empty()))), Estimate.of(0)); + assertEquals( + calculateDistinctValuesCount(ImmutableList.of( + createBooleanColumnStatistics(OptionalLong.of(0), OptionalLong.of(10), OptionalLong.empty()), + createBooleanColumnStatistics(OptionalLong.of(1), OptionalLong.of(10), OptionalLong.empty()))), + Estimate.of(2)); + } + + @Test + public void testCalculateNullsFraction() + { + assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of()), Estimate.unknown()); + assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(PartitionStatistics.empty())), Estimate.unknown()); + assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(rowsCount(1000))), Estimate.unknown()); + assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(rowsCount(1000), nullsCount(500))), Estimate.unknown()); + assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(rowsCount(1000), nullsCount(500), rowsCountAndNullsCount(1000, 500))), Estimate.of(0.5)); + 
assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(rowsCountAndNullsCount(2000, 200), rowsCountAndNullsCount(1000, 100))), Estimate.of(0.1)); + assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(rowsCountAndNullsCount(0, 0), rowsCountAndNullsCount(0, 0))), Estimate.of(0)); + } + + @Test + public void testCalculateDataSize() + { + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(), 0), Estimate.unknown()); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(), 1000), Estimate.unknown()); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(PartitionStatistics.empty()), 1000), Estimate.unknown()); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCount(1000)), 1000), Estimate.unknown()); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(dataSize(1000)), 1000), Estimate.unknown()); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(dataSize(1000), rowsCount(1000)), 1000), Estimate.unknown()); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCountAndDataSize(500, 1000)), 2000), Estimate.of(4000)); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCountAndDataSize(0, 0)), 2000), Estimate.unknown()); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCountAndDataSize(0, 0)), 0), Estimate.zero()); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCountAndDataSize(1000, 0)), 2000), Estimate.of(0)); + assertEquals( + calculateDataSize( + COLUMN, + ImmutableList.of( + rowsCountAndDataSize(500, 1000), + rowsCountAndDataSize(1000, 5000)), + 5000), + Estimate.of(20000)); + assertEquals( + calculateDataSize( + COLUMN, + ImmutableList.of( + dataSize(1000), + rowsCountAndDataSize(500, 1000), + rowsCount(3000), + rowsCountAndDataSize(1000, 5000)), + 5000), + Estimate.of(20000)); + } + + @Test + public void testCalculateRange() + { + assertEquals(calculateRange(VARCHAR, ImmutableList.of()), Optional.empty()); + assertEquals(calculateRange(VARCHAR, 
ImmutableList.of(integerRange(OptionalLong.empty(), OptionalLong.empty()))), Optional.empty()); + assertEquals(calculateRange(VARCHAR, ImmutableList.of(integerRange(1, 2))), Optional.empty()); + assertEquals(calculateRange(BIGINT, ImmutableList.of(integerRange(1, 2))), Optional.of(new DoubleRange(1, 2))); + assertEquals(calculateRange(BIGINT, ImmutableList.of(integerRange(Long.MIN_VALUE, Long.MAX_VALUE))), Optional.of(new DoubleRange(Long.MIN_VALUE, Long.MAX_VALUE))); + assertEquals(calculateRange(INTEGER, ImmutableList.of(integerRange(Long.MIN_VALUE, Long.MAX_VALUE))), Optional.of(new DoubleRange(Integer.MIN_VALUE, Integer.MAX_VALUE))); + assertEquals(calculateRange(SMALLINT, ImmutableList.of(integerRange(Long.MIN_VALUE, Long.MAX_VALUE))), Optional.of(new DoubleRange(Short.MIN_VALUE, Short.MAX_VALUE))); + assertEquals(calculateRange(TINYINT, ImmutableList.of(integerRange(Long.MIN_VALUE, Long.MAX_VALUE))), Optional.of(new DoubleRange(Byte.MIN_VALUE, Byte.MAX_VALUE))); + assertEquals(calculateRange(BIGINT, ImmutableList.of(integerRange(1, 5), integerRange(3, 7))), Optional.of(new DoubleRange(1, 7))); + assertEquals(calculateRange(BIGINT, ImmutableList.of(integerRange(OptionalLong.empty(), OptionalLong.empty()), integerRange(3, 7))), Optional.of(new DoubleRange(3, 7))); + assertEquals(calculateRange(BIGINT, ImmutableList.of(integerRange(OptionalLong.empty(), OptionalLong.of(8)), integerRange(3, 7))), Optional.of(new DoubleRange(3, 7))); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(integerRange(1, 2))), Optional.empty()); + assertEquals(calculateRange(REAL, ImmutableList.of(integerRange(1, 2))), Optional.empty()); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(OptionalDouble.empty(), OptionalDouble.empty()))), Optional.empty()); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(0.1, 0.2))), Optional.of(new DoubleRange(0.1, 0.2))); + assertEquals(calculateRange(BIGINT, ImmutableList.of(doubleRange(0.1, 0.2))), 
Optional.empty()); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(0.1, 0.2), doubleRange(0.15, 0.25))), Optional.of(new DoubleRange(0.1, 0.25))); + assertEquals(calculateRange(REAL, ImmutableList.of(doubleRange(0.1, 0.2), doubleRange(0.15, 0.25))), Optional.of(new DoubleRange(0.1, 0.25))); + assertEquals(calculateRange(REAL, ImmutableList.of(doubleRange(OptionalDouble.empty(), OptionalDouble.of(0.2)), doubleRange(0.15, 0.25))), Optional.of(new DoubleRange(0.15, 0.25))); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(NaN, 0.2))), Optional.empty()); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(0.1, NaN))), Optional.empty()); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(NaN, NaN))), Optional.empty()); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY))), Optional.of(new DoubleRange(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY))); + assertEquals(calculateRange(REAL, ImmutableList.of(doubleRange(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY))), Optional.of(new DoubleRange(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY))); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY))), Optional.of(new DoubleRange(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY))); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY), doubleRange(0.1, 0.2))), Optional.of(new DoubleRange(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY))); + assertEquals(calculateRange(DATE, ImmutableList.of(doubleRange(0.1, 0.2))), Optional.empty()); + assertEquals(calculateRange(DATE, ImmutableList.of(dateRange("1970-01-01", "1970-01-02"))), Optional.of(new DoubleRange(0, 1))); + assertEquals(calculateRange(DATE, ImmutableList.of(dateRange(Optional.empty(), Optional.empty()))), Optional.empty()); + 
assertEquals(calculateRange(DATE, ImmutableList.of(dateRange(Optional.of("1970-01-01"), Optional.empty()))), Optional.empty()); + assertEquals(calculateRange(DATE, ImmutableList.of(dateRange("1970-01-01", "1970-01-05"), dateRange("1970-01-03", "1970-01-07"))), Optional.of(new DoubleRange(0, 6))); + assertEquals(calculateRange(DECIMAL, ImmutableList.of(doubleRange(0.1, 0.2))), Optional.empty()); + assertEquals(calculateRange(DECIMAL, ImmutableList.of(decimalRange(BigDecimal.valueOf(1), BigDecimal.valueOf(5)))), Optional.of(new DoubleRange(1, 5))); + assertEquals(calculateRange(DECIMAL, ImmutableList.of(decimalRange(Optional.empty(), Optional.empty()))), Optional.empty()); + assertEquals(calculateRange(DECIMAL, ImmutableList.of(decimalRange(Optional.of(BigDecimal.valueOf(1)), Optional.empty()))), Optional.empty()); + assertEquals(calculateRange(DECIMAL, ImmutableList.of(decimalRange(BigDecimal.valueOf(1), BigDecimal.valueOf(5)), decimalRange(BigDecimal.valueOf(3), BigDecimal.valueOf(7)))), Optional.of(new DoubleRange(1, 7))); + } + + @Test + public void testGetTableStatistics() + { + String partitionName = "p1=string1/p2=1234"; + PartitionStatistics statistics = PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(OptionalLong.empty(), OptionalLong.of(1000), OptionalLong.empty(), OptionalLong.empty())) + .setColumnStatistics(ImmutableMap.of(COLUMN, createIntegerColumnStatistics(OptionalLong.of(-100), OptionalLong.of(100), OptionalLong.of(500), OptionalLong.of(300)))) + .build(); + MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider((table, hivePartitions) -> ImmutableMap.of(partitionName, statistics)); + TestingConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveClientConfig(), new OrcFileWriterConfig()).getSessionProperties()); + HiveColumnHandle columnHandle = new HiveColumnHandle(COLUMN, HIVE_LONG, BIGINT.getTypeSignature(), 2, REGULAR, Optional.empty()); + 
TableStatistics expected = TableStatistics.builder() + .setRowCount(Estimate.of(1000)) + .setColumnStatistics( + PARTITION_COLUMN_1, + ColumnStatistics.builder() + .setDataSize(Estimate.of(7000)) + .setNullsFraction(Estimate.of(0)) + .setDistinctValuesCount(Estimate.of(1)) + .build()) + .setColumnStatistics( + PARTITION_COLUMN_2, + ColumnStatistics.builder() + .setRange(new DoubleRange(1234, 1234)) + .setNullsFraction(Estimate.of(0)) + .setDistinctValuesCount(Estimate.of(1)) + .build()) + .setColumnStatistics( + columnHandle, + ColumnStatistics.builder() + .setRange(new DoubleRange(-100, 100)) + .setNullsFraction(Estimate.of(0.5)) + .setDistinctValuesCount(Estimate.of(300)) + .build()) + .build(); + assertEquals( + statisticsProvider.getTableStatistics( + session, + TABLE, + ImmutableMap.of( + "p1", PARTITION_COLUMN_1, + "p2", PARTITION_COLUMN_2, + COLUMN, columnHandle), + ImmutableMap.of( + "p1", VARCHAR, + "p2", BIGINT, + COLUMN, BIGINT), + ImmutableList.of(partition(partitionName))), + expected); + } + + @Test + public void testGetTableStatisticsUnpartitioned() + { + PartitionStatistics statistics = PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(OptionalLong.empty(), OptionalLong.of(1000), OptionalLong.empty(), OptionalLong.empty())) + .setColumnStatistics(ImmutableMap.of(COLUMN, createIntegerColumnStatistics(OptionalLong.of(-100), OptionalLong.of(100), OptionalLong.of(500), OptionalLong.of(300)))) + .build(); + MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider((table, hivePartitions) -> ImmutableMap.of(UNPARTITIONED_ID, statistics)); + TestingConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveClientConfig(), new OrcFileWriterConfig()).getSessionProperties()); + HiveColumnHandle columnHandle = new HiveColumnHandle(COLUMN, HIVE_LONG, BIGINT.getTypeSignature(), 2, REGULAR, Optional.empty()); + TableStatistics expected = TableStatistics.builder() + 
.setRowCount(Estimate.of(1000)) + .setColumnStatistics( + columnHandle, + ColumnStatistics.builder() + .setRange(new DoubleRange(-100, 100)) + .setNullsFraction(Estimate.of(0.5)) + .setDistinctValuesCount(Estimate.of(300)) + .build()) + .build(); + assertEquals( + statisticsProvider.getTableStatistics( + session, + TABLE, + ImmutableMap.of(COLUMN, columnHandle), + ImmutableMap.of(COLUMN, BIGINT), + ImmutableList.of(new HivePartition(TABLE))), + expected); + } + + @Test + public void testGetTableStatisticsEmpty() + { + String partitionName = "p1=string1/p2=1234"; + MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider((table, hivePartitions) -> ImmutableMap.of(partitionName, PartitionStatistics.empty())); + TestingConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveClientConfig(), new OrcFileWriterConfig()).getSessionProperties()); + assertEquals( + statisticsProvider.getTableStatistics( + session, + TABLE, + ImmutableMap.of(), + ImmutableMap.of(), + ImmutableList.of(partition(partitionName))), + TableStatistics.empty()); + } + + @Test + public void testGetTableStatisticsSampling() + { + MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider((table, hivePartitions) -> { + assertEquals(table, TABLE); + assertEquals(hivePartitions.size(), 1); + return ImmutableMap.of(); + }); + TestingConnectorSession session = new TestingConnectorSession(new HiveSessionProperties( + new HiveClientConfig().setPartitionStatisticsSampleSize(1), + new OrcFileWriterConfig()) + .getSessionProperties()); + statisticsProvider.getTableStatistics( + session, + TABLE, + ImmutableMap.of(), + ImmutableMap.of(), + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=string1/p2=1235"))); + } + + @Test + public void testGetTableStatisticsValidationFailure() + { + PartitionStatistics corruptedStatistics = PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(-1, 
0, 0, 0)) + .build(); + String partitionName = "p1=string1/p2=1234"; + MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider((table, hivePartitions) -> ImmutableMap.of(partitionName, corruptedStatistics)); + TestingConnectorSession session = new TestingConnectorSession(new HiveSessionProperties( + new HiveClientConfig().setIgnoreCorruptedStatistics(false), + new OrcFileWriterConfig()) + .getSessionProperties()); + assertThatThrownBy(() -> statisticsProvider.getTableStatistics( + session, + TABLE, + ImmutableMap.of(), + ImmutableMap.of(), + ImmutableList.of(partition(partitionName)))) + .isInstanceOf(PrestoException.class) + .hasFieldOrPropertyWithValue("errorCode", HIVE_CORRUPTED_COLUMN_STATISTICS.toErrorCode()); + TestingConnectorSession ignoreSession = new TestingConnectorSession(new HiveSessionProperties( + new HiveClientConfig().setIgnoreCorruptedStatistics(true), + new OrcFileWriterConfig()) + .getSessionProperties()); + assertEquals( + statisticsProvider.getTableStatistics( + ignoreSession, + TABLE, + ImmutableMap.of(), + ImmutableMap.of(), + ImmutableList.of(partition(partitionName))), + TableStatistics.empty()); + } + + private static void assertInvalidStatistics(PartitionStatistics partitionStatistics, String expectedMessage) + { + assertThatThrownBy(() -> validatePartitionStatistics(TABLE, ImmutableMap.of(PARTITION, partitionStatistics))) + .isInstanceOf(PrestoException.class) + .hasFieldOrPropertyWithValue("errorCode", HIVE_CORRUPTED_COLUMN_STATISTICS.toErrorCode()) + .hasMessage(expectedMessage); + } + + private static String invalidPartitionStatistics(String message) + { + return format("Corrupted partition statistics (Table: %s Partition: [%s]): %s", TABLE, PARTITION, message); + } + + private static String invalidColumnStatistics(String message) + { + return format("Corrupted partition statistics (Table: %s Partition: [%s] Column: %s): %s", TABLE, PARTITION, COLUMN, message); + } + private static HivePartition 
partition(String name) { return parsePartition(TABLE, name, ImmutableList.of(PARTITION_COLUMN_1, PARTITION_COLUMN_2), ImmutableList.of(VARCHAR, BIGINT), DateTimeZone.getDefault()); } + + private static PartitionStatistics rowsCount(long rowsCount) + { + return new PartitionStatistics(new HiveBasicStatistics(0, rowsCount, 0, 0), ImmutableMap.of()); + } + + private static PartitionStatistics nullsCount(long nullsCount) + { + return new PartitionStatistics(HiveBasicStatistics.createEmptyStatistics(), ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setNullsCount(nullsCount).build())); + } + + private static PartitionStatistics dataSize(long dataSize) + { + return new PartitionStatistics(HiveBasicStatistics.createEmptyStatistics(), ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setTotalSizeInBytes(dataSize).build())); + } + + private static PartitionStatistics rowsCountAndNullsCount(long rowsCount, long nullsCount) + { + return new PartitionStatistics( + new HiveBasicStatistics(0, rowsCount, 0, 0), + ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setNullsCount(nullsCount).build())); + } + + private static PartitionStatistics rowsCountAndDataSize(long rowsCount, long dataSize) + { + return new PartitionStatistics( + new HiveBasicStatistics(0, rowsCount, 0, 0), + ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setTotalSizeInBytes(dataSize).build())); + } + + private static HiveColumnStatistics distinctValuesCount(long count) + { + return HiveColumnStatistics.builder() + .setDistinctValuesCount(count) + .build(); + } + + private static HiveColumnStatistics integerRange(long min, long max) + { + return integerRange(OptionalLong.of(min), OptionalLong.of(max)); + } + + private static HiveColumnStatistics integerRange(OptionalLong min, OptionalLong max) + { + return HiveColumnStatistics.builder() + .setIntegerStatistics(new IntegerStatistics(min, max)) + .build(); + } + + private static HiveColumnStatistics doubleRange(double min, double max) + { 
+ return doubleRange(OptionalDouble.of(min), OptionalDouble.of(max)); + } + + private static HiveColumnStatistics doubleRange(OptionalDouble min, OptionalDouble max) + { + return HiveColumnStatistics.builder() + .setDoubleStatistics(new DoubleStatistics(min, max)) + .build(); + } + + private static HiveColumnStatistics dateRange(String min, String max) + { + return dateRange(Optional.of(min), Optional.of(max)); + } + + private static HiveColumnStatistics dateRange(Optional min, Optional max) + { + return HiveColumnStatistics.builder() + .setDateStatistics(new DateStatistics(min.map(TestMetastoreHiveStatisticsProvider::parseDate), max.map(TestMetastoreHiveStatisticsProvider::parseDate))) + .build(); + } + + private static LocalDate parseDate(String date) + { + return LocalDate.parse(date); + } + + private static HiveColumnStatistics decimalRange(BigDecimal min, BigDecimal max) + { + return decimalRange(Optional.of(min), Optional.of(max)); + } + + private static HiveColumnStatistics decimalRange(Optional min, Optional max) + { + return HiveColumnStatistics.builder() + .setDecimalStatistics(new DecimalStatistics(min, max)) + .build(); + } } diff --git a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java index 6e1e4702d850b..0cf9f0dc71cbc 100644 --- a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java +++ b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java @@ -227,14 +227,14 @@ public void testStatisticsForTablePartitionedByBigint() assertThat(query(showStatsWholeTable)).containsOnly( row("p_nationkey", null, null, null, null, null, null), row("p_name", null, null, null, null, null, null), - row("p_regionkey", null, 3.0, null, null, "1", "3"), + row("p_regionkey", null, null, null, null, null, null), row("p_comment", null, null, null, null, 
null, null), row(null, null, null, null, null, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( row("p_nationkey", null, null, null, null, null, null), row("p_name", null, null, null, null, null, null), - row("p_regionkey", null, 1.0, null, null, "1", "1"), + row("p_regionkey", null, null, null, null, null, null), row("p_comment", null, null, null, null, null, null), row(null, null, null, null, null, null, null)); @@ -259,7 +259,7 @@ public void testStatisticsForTablePartitionedByBigint() assertThat(query(showStatsPartitionTwo)).containsOnly( row("p_nationkey", null, null, null, null, null, null), row("p_name", null, null, null, null, null, null), - row("p_regionkey", null, 1.0, null, null, "2", "2"), + row("p_regionkey", null, null, null, null, null, null), row("p_comment", null, null, null, null, null, null), row(null, null, null, null, null, null, null)); @@ -354,14 +354,14 @@ public void testStatisticsForTablePartitionedByVarchar() assertThat(query(showStatsWholeTable)).containsOnly( row("p_nationkey", null, null, null, null, null, null), row("p_name", null, null, null, null, null, null), - row("p_regionkey", null, 3.0, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), row("p_comment", null, null, null, null, null, null), row(null, null, null, null, null, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( row("p_nationkey", null, null, null, null, null, null), row("p_name", null, null, null, null, null, null), - row("p_regionkey", null, 1.0, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), row("p_comment", null, null, null, null, null, null), row(null, null, null, null, null, null, null)); @@ -386,7 +386,7 @@ public void testStatisticsForTablePartitionedByVarchar() assertThat(query(showStatsPartitionTwo)).containsOnly( row("p_nationkey", null, null, null, null, null, null), row("p_name", null, null, null, null, null, null), - row("p_regionkey", null, 1.0, 
null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), row("p_comment", null, null, null, null, null, null), row(null, null, null, null, null, null, null)); @@ -524,21 +524,21 @@ public void testStatisticsForAllDataTypesNoData() onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " COMPUTE STATISTICS"); assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( - row("c_tinyint", null, null, 0.0, null, null, null), - row("c_smallint", null, null, 0.0, null, null, null), - row("c_int", null, null, 0.0, null, null, null), - row("c_bigint", null, null, 0.0, null, null, null), - row("c_float", null, null, 0.0, null, null, null), - row("c_double", null, null, 0.0, null, null, null), - row("c_decimal", null, null, 0.0, null, null, null), - row("c_decimal_w_params", null, null, 0.0, null, null, null), - row("c_timestamp", null, null, 0.0, null, null, null), - row("c_date", null, null, 0.0, null, null, null), - row("c_string", null, null, 0.0, null, null, null), - row("c_varchar", null, null, 0.0, null, null, null), - row("c_char", null, null, 0.0, null, null, null), - row("c_boolean", null, null, 0.0, null, null, null), - row("c_binary", null, null, 0.0, null, null, null), + row("c_tinyint", null, null, null, null, null, null), + row("c_smallint", null, null, null, null, null, null), + row("c_int", null, null, null, null, null, null), + row("c_bigint", null, null, null, null, null, null), + row("c_float", null, null, null, null, null, null), + row("c_double", null, null, null, null, null, null), + row("c_decimal", null, null, null, null, null, null), + row("c_decimal_w_params", null, null, null, null, null, null), + row("c_timestamp", null, null, null, null, null, null), + row("c_date", null, null, null, null, null, null), + row("c_string", null, null, null, null, null, null), + row("c_varchar", null, null, null, null, null, null), + row("c_char", null, null, null, null, null, null), + row("c_boolean", null, null, null, 
null, null, null), + row("c_binary", null, null, null, null, null, null), row(null, null, null, null, 0.0, null, null)); onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " COMPUTE STATISTICS FOR COLUMNS"); diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java index 6863f8ec36cd6..135abdfb3fe4d 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java @@ -21,11 +21,18 @@ public final class ColumnStatistics { + private static final ColumnStatistics EMPTY = new ColumnStatistics(Estimate.unknown(), Estimate.unknown(), Estimate.unknown(), Optional.empty()); + private final Estimate nullsFraction; private final Estimate distinctValuesCount; private final Estimate dataSize; private final Optional range; + public static ColumnStatistics empty() + { + return EMPTY; + } + public ColumnStatistics( Estimate nullsFraction, Estimate distinctValuesCount, diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DoubleRange.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DoubleRange.java index 1ea795b3c367d..e359f44a8c774 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DoubleRange.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DoubleRange.java @@ -16,7 +16,10 @@ import java.util.Objects; import static java.lang.Double.isNaN; +import static java.lang.Math.max; +import static java.lang.Math.min; import static java.lang.String.format; +import static java.util.Objects.requireNonNull; public class DoubleRange { @@ -48,6 +51,13 @@ public double getMax() return max; } + public static DoubleRange union(DoubleRange first, DoubleRange second) + { + requireNonNull(first, "first is null"); + requireNonNull(second, "second is null"); + return new 
DoubleRange(min(first.min, second.min), max(first.max, second.max)); + } + @Override public boolean equals(Object o) { diff --git a/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDoubleRange.java b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDoubleRange.java index 17c9edca705c9..a3d481e902ef9 100644 --- a/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDoubleRange.java +++ b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDoubleRange.java @@ -15,6 +15,7 @@ import org.testng.annotations.Test; +import static com.facebook.presto.spi.statistics.DoubleRange.union; import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.testng.Assert.assertEquals; @@ -57,6 +58,16 @@ public void testRange() assertThatThrownBy(() -> new DoubleRange(Float.POSITIVE_INFINITY, 0)).isInstanceOf(IllegalArgumentException.class); } + @Test + public void testUnion() + { + assertEquals(union(new DoubleRange(1, 2), new DoubleRange(4, 5)), new DoubleRange(1, 5)); + assertEquals(union(new DoubleRange(1, 2), new DoubleRange(1, 2)), new DoubleRange(1, 2)); + assertEquals(union(new DoubleRange(4, 5), new DoubleRange(1, 2)), new DoubleRange(1, 5)); + assertEquals(union(new DoubleRange(Double.NEGATIVE_INFINITY, 0), new DoubleRange(0, Double.POSITIVE_INFINITY)), new DoubleRange(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY)); + assertEquals(union(new DoubleRange(0, Double.POSITIVE_INFINITY), new DoubleRange(Double.NEGATIVE_INFINITY, 0)), new DoubleRange(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY)); + } + private static void assertRange(double min, double max) { DoubleRange range = new DoubleRange(min, max); From 1344eb92a71a97e1fd62a85a530dd292585755ed Mon Sep 17 00:00:00 2001 From: Andrii Rosa Date: Wed, 19 Sep 2018 14:03:21 -0400 Subject: [PATCH 13/14] Normalize distinct values count Since the number of distinct values is estimated, it may end up higher than the total number of non-null rows. 
It makes sense to normalize it before writing and reading from the metastore. --- .../metastore/thrift/ThriftMetastoreUtil.java | 35 +++++++++++------ .../MetastoreHiveStatisticsProvider.java | 27 ++++++++++++- .../facebook/presto/hive/util/Statistics.java | 18 ++++++--- .../presto/hive/AbstractTestHiveClient.java | 14 +++---- .../thrift/TestThriftMetastoreUtil.java | 10 ++--- .../TestMetastoreHiveStatisticsProvider.java | 12 ++++++ .../tests/hive/TestHiveTableStatistics.java | 38 +++++++++---------- 7 files changed, 105 insertions(+), 49 deletions(-) diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/metastore/thrift/ThriftMetastoreUtil.java b/presto-hive/src/main/java/com/facebook/presto/hive/metastore/thrift/ThriftMetastoreUtil.java index ed9d91802d842..9430246a574b7 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/metastore/thrift/ThriftMetastoreUtil.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/metastore/thrift/ThriftMetastoreUtil.java @@ -333,7 +333,7 @@ public static HiveColumnStatistics fromMetastoreApiColumnStatistics(ColumnStatis OptionalLong max = longStatsData.isSetHighValue() ? OptionalLong.of(longStatsData.getHighValue()) : OptionalLong.empty(); OptionalLong nullsCount = longStatsData.isSetNumNulls() ? OptionalLong.of(longStatsData.getNumNulls()) : OptionalLong.empty(); OptionalLong distinctValuesCount = longStatsData.isSetNumDVs() ? 
OptionalLong.of(longStatsData.getNumDVs()) : OptionalLong.empty(); - return createIntegerColumnStatistics(min, max, nullsCount, fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount)); + return createIntegerColumnStatistics(min, max, nullsCount, fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount, rowCount)); } if (columnStatistics.getStatsData().isSetDoubleStats()) { DoubleColumnStatsData doubleStatsData = columnStatistics.getStatsData().getDoubleStats(); @@ -341,7 +341,7 @@ public static HiveColumnStatistics fromMetastoreApiColumnStatistics(ColumnStatis OptionalDouble max = doubleStatsData.isSetHighValue() ? OptionalDouble.of(doubleStatsData.getHighValue()) : OptionalDouble.empty(); OptionalLong nullsCount = doubleStatsData.isSetNumNulls() ? OptionalLong.of(doubleStatsData.getNumNulls()) : OptionalLong.empty(); OptionalLong distinctValuesCount = doubleStatsData.isSetNumDVs() ? OptionalLong.of(doubleStatsData.getNumDVs()) : OptionalLong.empty(); - return createDoubleColumnStatistics(min, max, nullsCount, fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount)); + return createDoubleColumnStatistics(min, max, nullsCount, fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount, rowCount)); } if (columnStatistics.getStatsData().isSetDecimalStats()) { DecimalColumnStatsData decimalStatsData = columnStatistics.getStatsData().getDecimalStats(); @@ -349,7 +349,7 @@ public static HiveColumnStatistics fromMetastoreApiColumnStatistics(ColumnStatis Optional max = decimalStatsData.isSetHighValue() ? fromMetastoreDecimal(decimalStatsData.getHighValue()) : Optional.empty(); OptionalLong nullsCount = decimalStatsData.isSetNumNulls() ? OptionalLong.of(decimalStatsData.getNumNulls()) : OptionalLong.empty(); OptionalLong distinctValuesCount = decimalStatsData.isSetNumDVs() ? 
OptionalLong.of(decimalStatsData.getNumDVs()) : OptionalLong.empty(); - return createDecimalColumnStatistics(min, max, nullsCount, fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount)); + return createDecimalColumnStatistics(min, max, nullsCount, fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount, rowCount)); } if (columnStatistics.getStatsData().isSetDateStats()) { DateColumnStatsData dateStatsData = columnStatistics.getStatsData().getDateStats(); @@ -357,7 +357,7 @@ public static HiveColumnStatistics fromMetastoreApiColumnStatistics(ColumnStatis Optional max = dateStatsData.isSetHighValue() ? fromMetastoreDate(dateStatsData.getHighValue()) : Optional.empty(); OptionalLong nullsCount = dateStatsData.isSetNumNulls() ? OptionalLong.of(dateStatsData.getNumNulls()) : OptionalLong.empty(); OptionalLong distinctValuesCount = dateStatsData.isSetNumDVs() ? OptionalLong.of(dateStatsData.getNumDVs()) : OptionalLong.empty(); - return createDateColumnStatistics(min, max, nullsCount, fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount)); + return createDateColumnStatistics(min, max, nullsCount, fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount, rowCount)); } if (columnStatistics.getStatsData().isSetBooleanStats()) { BooleanColumnStatsData booleanStatsData = columnStatistics.getStatsData().getBooleanStats(); @@ -376,7 +376,7 @@ public static HiveColumnStatistics fromMetastoreApiColumnStatistics(ColumnStatis maxColumnLength, getTotalSizeInBytes(averageColumnLength, rowCount, nullsCount), nullsCount, - fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount)); + fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount, rowCount)); } if (columnStatistics.getStatsData().isSetBinaryStats()) { BinaryColumnStatsData binaryStatsData = columnStatistics.getStatsData().getBinaryStats(); @@ -424,10 +424,23 @@ public static OptionalLong getTotalSizeInBytes(OptionalDouble averageColumnLengt /** * Hive calculates NDV 
considering null as a distinct value */ - private static OptionalLong fromMetastoreDistinctValuesCount(OptionalLong distinctValuesCount, OptionalLong nullsCount) + private static OptionalLong fromMetastoreDistinctValuesCount(OptionalLong distinctValuesCount, OptionalLong nullsCount, OptionalLong rowCount) { - if (distinctValuesCount.isPresent() && nullsCount.isPresent() && distinctValuesCount.getAsLong() > 0 && nullsCount.getAsLong() > 0) { - return OptionalLong.of(distinctValuesCount.getAsLong() - 1); + if (distinctValuesCount.isPresent() && nullsCount.isPresent() && rowCount.isPresent()) { + return OptionalLong.of(fromMetastoreDistinctValuesCount(distinctValuesCount.getAsLong(), nullsCount.getAsLong(), rowCount.getAsLong())); + } + return OptionalLong.empty(); + } + + private static long fromMetastoreDistinctValuesCount(long distinctValuesCount, long nullsCount, long rowCount) + { + long nonNullsCount = rowCount - nullsCount; + if (nullsCount > 0) { + distinctValuesCount--; + } + // the metastore may store an estimate, so the value stored may be higher than the total number of rows + if (distinctValuesCount > nonNullsCount) { + return nonNullsCount; } return distinctValuesCount; } @@ -664,10 +677,10 @@ public static Decimal toMetastoreDecimal(BigDecimal decimal) private static OptionalLong toMetastoreDistinctValuesCount(OptionalLong distinctValuesCount, OptionalLong nullsCount) { // metastore counts null as a distinct value - if (distinctValuesCount.isPresent() && nullsCount.isPresent() && (nullsCount.getAsLong() > 0)) { - return OptionalLong.of(distinctValuesCount.getAsLong() + 1); + if (distinctValuesCount.isPresent() && nullsCount.isPresent()) { + return OptionalLong.of(distinctValuesCount.getAsLong() + (nullsCount.getAsLong() > 0 ? 
1 : 0)); } - return distinctValuesCount; + return OptionalLong.empty(); } private static OptionalDouble getAverageColumnLength(OptionalLong totalSizeInBytes, OptionalLong rowCount, OptionalLong nullsCount) diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java index fff51b416713b..61fb4f1186a41 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/statistics/MetastoreHiveStatisticsProvider.java @@ -231,8 +231,31 @@ private static void validateColumnStatistics(SchemaTableName table, String parti rowCount.getAsLong()); } }); - columnStatistics.getDistinctValuesCount().ifPresent(distinctValuesCount -> - checkStatistics(distinctValuesCount >= 0, table, partition, column, "distinctValuesCount must be greater than or equal to zero: %s", distinctValuesCount)); + columnStatistics.getDistinctValuesCount().ifPresent(distinctValuesCount -> { + checkStatistics(distinctValuesCount >= 0, table, partition, column, "distinctValuesCount must be greater than or equal to zero: %s", distinctValuesCount); + if (rowCount.isPresent()) { + checkStatistics( + distinctValuesCount <= rowCount.getAsLong(), + table, + partition, + column, + "distinctValuesCount must be less than or equal to rowCount. distinctValuesCount: %s. rowCount: %s.", + distinctValuesCount, + rowCount.getAsLong()); + } + if (rowCount.isPresent() && columnStatistics.getNullsCount().isPresent()) { + long nonNullsCount = rowCount.getAsLong() - columnStatistics.getNullsCount().getAsLong(); + checkStatistics( + distinctValuesCount <= nonNullsCount, + table, + partition, + column, + "distinctValuesCount must be less than or equal to nonNullsCount. distinctValuesCount: %s. 
nonNullsCount: %s.", + distinctValuesCount, + nonNullsCount); + } + }); + columnStatistics.getIntegerStatistics().ifPresent(integerStatistics -> { OptionalLong min = integerStatistics.getMin(); OptionalLong max = integerStatistics.getMax(); diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/util/Statistics.java b/presto-hive/src/main/java/com/facebook/presto/hive/util/Statistics.java index 8614333ed050d..1ae5c3b555f84 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/util/Statistics.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/util/Statistics.java @@ -314,16 +314,24 @@ private static HiveColumnStatistics createHiveColumnStatistics( result.setTotalSizeInBytes(getIntegerValue(session, BIGINT, computedStatistics.get(TOTAL_SIZE_IN_BYTES))); } - // NDV - if (computedStatistics.containsKey(NUMBER_OF_DISTINCT_VALUES)) { - result.setDistinctValuesCount(BIGINT.getLong(computedStatistics.get(NUMBER_OF_DISTINCT_VALUES), 0)); - } - // NUMBER OF NULLS if (computedStatistics.containsKey(NUMBER_OF_NON_NULL_VALUES)) { result.setNullsCount(rowCount - BIGINT.getLong(computedStatistics.get(NUMBER_OF_NON_NULL_VALUES), 0)); } + // NDV + if (computedStatistics.containsKey(NUMBER_OF_DISTINCT_VALUES) && computedStatistics.containsKey(NUMBER_OF_NON_NULL_VALUES)) { + // number of distinct value is estimated using HLL, and can be higher than the number of non null values + long numberOfNonNullValues = BIGINT.getLong(computedStatistics.get(NUMBER_OF_NON_NULL_VALUES), 0); + long numberOfDistinctValues = BIGINT.getLong(computedStatistics.get(NUMBER_OF_DISTINCT_VALUES), 0); + if (numberOfDistinctValues > numberOfNonNullValues) { + result.setDistinctValuesCount(numberOfNonNullValues); + } + else { + result.setDistinctValuesCount(numberOfDistinctValues); + } + } + // NUMBER OF FALSE, NUMBER OF TRUE if (computedStatistics.containsKey(NUMBER_OF_TRUE_VALUES) && computedStatistics.containsKey(NUMBER_OF_NON_NULL_VALUES)) { long numberOfTrue = 
BIGINT.getLong(computedStatistics.get(NUMBER_OF_TRUE_VALUES), 0); diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/AbstractTestHiveClient.java b/presto-hive/src/test/java/com/facebook/presto/hive/AbstractTestHiveClient.java index 9aed55b089e57..fa6f8328bf287 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/AbstractTestHiveClient.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/AbstractTestHiveClient.java @@ -482,7 +482,7 @@ private static RowType toRowType(List columns) private static final PartitionStatistics STATISTICS_1_1 = new PartitionStatistics( - new HiveBasicStatistics(OptionalLong.of(0), OptionalLong.of(10), OptionalLong.empty(), OptionalLong.of(0)), + new HiveBasicStatistics(OptionalLong.of(0), OptionalLong.of(15), OptionalLong.empty(), OptionalLong.of(0)), STATISTICS_1.getColumnStatistics().entrySet() .stream() .filter(entry -> entry.getKey().hashCode() % 2 == 0) @@ -490,7 +490,7 @@ private static RowType toRowType(List columns) private static final PartitionStatistics STATISTICS_1_2 = new PartitionStatistics( - new HiveBasicStatistics(OptionalLong.of(0), OptionalLong.of(10), OptionalLong.of(3), OptionalLong.of(0)), + new HiveBasicStatistics(OptionalLong.of(0), OptionalLong.of(15), OptionalLong.of(3), OptionalLong.of(0)), STATISTICS_1.getColumnStatistics().entrySet() .stream() .filter(entry -> entry.getKey().hashCode() % 2 == 1) @@ -519,7 +519,7 @@ private static RowType toRowType(List columns) private static final PartitionStatistics STATISTICS_EMPTY_OPTIONAL_FIELDS = new PartitionStatistics( - new HiveBasicStatistics(OptionalLong.of(0), OptionalLong.empty(), OptionalLong.empty(), OptionalLong.of(0)), + new HiveBasicStatistics(OptionalLong.of(0), OptionalLong.of(20), OptionalLong.empty(), OptionalLong.of(0)), ImmutableMap.builder() .put("t_boolean", createBooleanColumnStatistics(OptionalLong.of(4), OptionalLong.of(3), OptionalLong.of(2))) .put("t_bigint", createIntegerColumnStatistics(OptionalLong.empty(), 
OptionalLong.empty(), OptionalLong.of(4), OptionalLong.of(7))) @@ -528,10 +528,10 @@ private static RowType toRowType(List columns) .put("t_tinyint", createIntegerColumnStatistics(OptionalLong.empty(), OptionalLong.empty(), OptionalLong.of(2), OptionalLong.of(3))) .put("t_double", createDoubleColumnStatistics(OptionalDouble.empty(), OptionalDouble.empty(), OptionalLong.of(6), OptionalLong.of(3))) .put("t_float", createDoubleColumnStatistics(OptionalDouble.empty(), OptionalDouble.empty(), OptionalLong.of(7), OptionalLong.of(11))) - .put("t_string", createStringColumnStatistics(OptionalLong.of(0), OptionalLong.empty(), OptionalLong.of(2), OptionalLong.of(6))) - .put("t_varchar", createStringColumnStatistics(OptionalLong.of(0), OptionalLong.empty(), OptionalLong.of(7), OptionalLong.of(1))) - .put("t_char", createStringColumnStatistics(OptionalLong.of(0), OptionalLong.empty(), OptionalLong.of(0), OptionalLong.of(3))) - .put("t_varbinary", createBinaryColumnStatistics(OptionalLong.of(0), OptionalLong.empty(), OptionalLong.of(2))) + .put("t_string", createStringColumnStatistics(OptionalLong.of(0), OptionalLong.of(0), OptionalLong.of(2), OptionalLong.of(6))) + .put("t_varchar", createStringColumnStatistics(OptionalLong.of(0), OptionalLong.of(0), OptionalLong.of(7), OptionalLong.of(1))) + .put("t_char", createStringColumnStatistics(OptionalLong.of(0), OptionalLong.of(0), OptionalLong.of(0), OptionalLong.of(3))) + .put("t_varbinary", createBinaryColumnStatistics(OptionalLong.of(0), OptionalLong.of(0), OptionalLong.of(2))) // https://issues.apache.org/jira/browse/HIVE-20098 // .put("t_date", createDateColumnStatistics(Optional.empty(), Optional.empty(), OptionalLong.of(8), OptionalLong.of(7))) .put("t_timestamp", createIntegerColumnStatistics(OptionalLong.empty(), OptionalLong.empty(), OptionalLong.of(9), OptionalLong.of(1))) diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/metastore/thrift/TestThriftMetastoreUtil.java 
b/presto-hive/src/test/java/com/facebook/presto/hive/metastore/thrift/TestThriftMetastoreUtil.java index a6862a0af2cda..cb3547c1fc66e 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/metastore/thrift/TestThriftMetastoreUtil.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/metastore/thrift/TestThriftMetastoreUtil.java @@ -70,7 +70,7 @@ public void testLongStatsToColumnStatistics() longColumnStatsData.setNumNulls(1); longColumnStatsData.setNumDVs(20); ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", BIGINT_TYPE_NAME, longStats(longColumnStatsData)); - HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(1000)); assertEquals(actual.getIntegerStatistics(), Optional.of(new IntegerStatistics(OptionalLong.of(0), OptionalLong.of(100)))); assertEquals(actual.getDoubleStatistics(), Optional.empty()); @@ -110,7 +110,7 @@ public void testDoubleStatsToColumnStatistics() doubleColumnStatsData.setNumNulls(1); doubleColumnStatsData.setNumDVs(20); ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", DOUBLE_TYPE_NAME, doubleStats(doubleColumnStatsData)); - HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(1000)); assertEquals(actual.getIntegerStatistics(), Optional.empty()); assertEquals(actual.getDoubleStatistics(), Optional.of(new DoubleStatistics(OptionalDouble.of(0), OptionalDouble.of(100)))); @@ -152,7 +152,7 @@ public void testDecimalStatsToColumnStatistics() decimalColumnStatsData.setNumNulls(1); decimalColumnStatsData.setNumDVs(20); ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", DECIMAL_TYPE_NAME, decimalStats(decimalColumnStatsData)); - HiveColumnStatistics 
actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(1000)); assertEquals(actual.getIntegerStatistics(), Optional.empty()); assertEquals(actual.getDoubleStatistics(), Optional.empty()); @@ -231,7 +231,7 @@ public void testDateStatsToColumnStatistics() dateColumnStatsData.setNumNulls(1); dateColumnStatsData.setNumDVs(20); ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", DATE_TYPE_NAME, dateStats(dateColumnStatsData)); - HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(1000)); assertEquals(actual.getIntegerStatistics(), Optional.empty()); assertEquals(actual.getDoubleStatistics(), Optional.empty()); @@ -281,7 +281,7 @@ public void testStringStatsToColumnStatistics() assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.of(100)); assertEquals(actual.getTotalSizeInBytes(), OptionalLong.of(23)); assertEquals(actual.getNullsCount(), OptionalLong.of(1)); - assertEquals(actual.getDistinctValuesCount(), OptionalLong.of(19)); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.of(1)); } @Test diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java b/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java index 68ba391481f3a..837347a214368 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/statistics/TestMetastoreHiveStatisticsProvider.java @@ -167,6 +167,18 @@ public void testValidatePartitionStatistics() .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setDistinctValuesCount(-1).build())) 
.build(), invalidColumnStatistics("distinctValuesCount must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setDistinctValuesCount(1).build())) + .build(), + invalidColumnStatistics("distinctValuesCount must be less than or equal to rowCount. distinctValuesCount: 1. rowCount: 0.")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 1, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setDistinctValuesCount(1).setNullsCount(1).build())) + .build(), + invalidColumnStatistics("distinctValuesCount must be less than or equal to nonNullsCount. distinctValuesCount: 1. nonNullsCount: 0.")); assertInvalidStatistics( PartitionStatistics.builder() .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) diff --git a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java index 0cf9f0dc71cbc..837fb035e9d9b 100644 --- a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java +++ b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java @@ -208,7 +208,7 @@ public void testStatisticsForUnpartitionedTable() row("n_nationkey", null, 19.0, 0.0, null, "0", "24"), row("n_name", 177.0, 24.0, 0.0, null, null, null), row("n_regionkey", null, 5.0, 0.0, null, "0", "4"), - row("n_comment", 1857.0, 31.0, 0.0, null, null, null), + row("n_comment", 1857.0, 25.0, 0.0, null, null, null), row(null, null, null, null, 25.0, null, null)); } @@ -294,16 +294,16 @@ public void testStatisticsForTablePartitionedByBigint() assertThat(query(showStatsWholeTable)).containsOnly( row("p_nationkey", null, 5.0, 0.0, null, 
"1", "24"), - row("p_name", 114.0, 6.0, 0.0, null, null, null), + row("p_name", 114.0, 5.0, 0.0, null, null, null), row("p_regionkey", null, 3.0, 0.0, null, "1", "3"), - row("p_comment", 1497.0, 7.0, 0.0, null, null, null), + row("p_comment", 1497.0, 5.0, 0.0, null, null, null), row(null, null, null, null, 15.0, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), - row("p_name", 38.0, 6.0, 0.0, null, null, null), + row("p_name", 38.0, 5.0, 0.0, null, null, null), row("p_regionkey", null, 1.0, 0.0, null, "1", "1"), - row("p_comment", 499.0, 7.0, 0.0, null, null, null), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), row(null, null, null, null, 5.0, null, null)); assertThat(query(showStatsPartitionTwo)).containsOnly( @@ -319,21 +319,21 @@ public void testStatisticsForTablePartitionedByBigint() assertThat(query(showStatsWholeTable)).containsOnly( row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), - row("p_name", 109.0, 6.0, 0.0, null, null, null), + row("p_name", 109.0, 5.0, 0.0, null, null, null), row("p_regionkey", null, 3.0, 0.0, null, "1", "3"), - row("p_comment", 1197.0, 7.0, 0.0, null, null, null), + row("p_comment", 1197.0, 5.0, 0.0, null, null, null), row(null, null, null, null, 15.0, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), - row("p_name", 38.0, 6.0, 0.0, null, null, null), + row("p_name", 38.0, 5.0, 0.0, null, null, null), row("p_regionkey", null, 1.0, 0.0, null, "1", "1"), - row("p_comment", 499.0, 7.0, 0.0, null, null, null), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), row(null, null, null, null, 5.0, null, null)); assertThat(query(showStatsPartitionTwo)).containsOnly( row("p_nationkey", null, 4.0, 0.0, null, "8", "21"), - row("p_name", 31.0, 6.0, 0.0, null, null, null), + row("p_name", 31.0, 5.0, 0.0, null, null, null), row("p_regionkey", null, 1.0, 0.0, null, "2", "2"), row("p_comment", 
351.0, 5.0, 0.0, null, null, null), row(null, null, null, null, 5.0, null, null)); @@ -421,16 +421,16 @@ public void testStatisticsForTablePartitionedByVarchar() assertThat(query(showStatsWholeTable)).containsOnly( row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), - row("p_name", 114.0, 6.0, 0.0, null, null, null), + row("p_name", 114.0, 5.0, 0.0, null, null, null), row("p_regionkey", 85.0, 3.0, 0.0, null, null, null), - row("p_comment", 1497.0, 7.0, 0.0, null, null, null), + row("p_comment", 1497.0, 5.0, 0.0, null, null, null), row(null, null, null, null, 15.0, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), - row("p_name", 38.0, 6.0, 0.0, null, null, null), + row("p_name", 38.0, 5.0, 0.0, null, null, null), row("p_regionkey", 35.0, 1.0, 0.0, null, null, null), - row("p_comment", 499.0, 7.0, 0.0, null, null, null), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), row(null, null, null, null, 5.0, null, null)); assertThat(query(showStatsPartitionTwo)).containsOnly( @@ -446,21 +446,21 @@ public void testStatisticsForTablePartitionedByVarchar() assertThat(query(showStatsWholeTable)).containsOnly( row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), - row("p_name", 109.0, 6.0, 0.0, null, null, null), + row("p_name", 109.0, 5.0, 0.0, null, null, null), row("p_regionkey", 85.0, 3.0, 0.0, null, null, null), - row("p_comment", 1197.0, 7.0, 0.0, null, null, null), + row("p_comment", 1197.0, 5.0, 0.0, null, null, null), row(null, null, null, null, 15.0, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), - row("p_name", 38.0, 6.0, 0.0, null, null, null), + row("p_name", 38.0, 5.0, 0.0, null, null, null), row("p_regionkey", 35.0, 1.0, 0.0, null, null, null), - row("p_comment", 499.0, 7.0, 0.0, null, null, null), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), row(null, null, null, null, 5.0, null, null)); 
assertThat(query(showStatsPartitionTwo)).containsOnly( row("p_nationkey", null, 4.0, 0.0, null, "8", "21"), - row("p_name", 31.0, 6.0, 0.0, null, null, null), + row("p_name", 31.0, 5.0, 0.0, null, null, null), row("p_regionkey", 20.0, 1.0, 0.0, null, null, null), row("p_comment", 351.0, 5.0, 0.0, null, null, null), row(null, null, null, null, 5.0, null, null)); From 652112e7fc33d36ebc60b11d6f2a7bf103ac1a7b Mon Sep 17 00:00:00 2001 From: Andrii Rosa Date: Wed, 19 Sep 2018 14:03:22 -0400 Subject: [PATCH 14/14] Ignore corrupted statistics when altering partition If the statistics are corrupted, it doesn't make much sense to restore them on rollback. --- .../SemiTransactionalHiveMetastore.java | 35 +++++++++++++++---- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/metastore/SemiTransactionalHiveMetastore.java b/presto-hive/src/main/java/com/facebook/presto/hive/metastore/SemiTransactionalHiveMetastore.java index 926b87490f634..6c98b6c5bd4df 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/metastore/SemiTransactionalHiveMetastore.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/metastore/SemiTransactionalHiveMetastore.java @@ -57,6 +57,7 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Collectors; +import static com.facebook.presto.hive.HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS; import static com.facebook.presto.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR; import static com.facebook.presto.hive.HiveErrorCode.HIVE_METASTORE_ERROR; import static com.facebook.presto.hive.HiveErrorCode.HIVE_PATH_ALREADY_EXISTS; @@ -1083,13 +1084,7 @@ private void prepareAlterPartition(HdfsContext context, PartitionAndMore partiti format("The partition that this transaction modified was deleted in another transaction. 
%s %s", partition.getTableName(), partition.getValues())); } String partitionName = getPartitionName(partition.getDatabaseName(), partition.getTableName(), partition.getValues()); - PartitionStatistics oldPartitionStatistics = delegate.getPartitionStatistics(partition.getDatabaseName(), partition.getTableName(), ImmutableSet.of(partitionName)) - .get(partitionName); - if (oldPartitionStatistics == null) { - throw new PrestoException( - TRANSACTION_CONFLICT, - format("The partition that this transaction modified was deleted in another transaction. %s %s", partition.getTableName(), partition.getValues())); - } + PartitionStatistics oldPartitionStatistics = getExistingPartitionStatistics(partition, partitionName); String oldPartitionLocation = oldPartition.get().getStorage().getLocation(); Path oldPartitionPath = new Path(oldPartitionLocation); @@ -1135,6 +1130,32 @@ private void prepareAlterPartition(HdfsContext context, PartitionAndMore partiti new PartitionWithStatistics(oldPartition.get(), partitionName, oldPartitionStatistics))); } + private PartitionStatistics getExistingPartitionStatistics(Partition partition, String partitionName) + { + try { + PartitionStatistics statistics = delegate.getPartitionStatistics(partition.getDatabaseName(), partition.getTableName(), ImmutableSet.of(partitionName)) + .get(partitionName); + if (statistics == null) { + throw new PrestoException( + TRANSACTION_CONFLICT, + format("The partition that this transaction modified was deleted in another transaction. %s %s", partition.getTableName(), partition.getValues())); + } + return statistics; + } + catch (PrestoException e) { + if (e.getErrorCode().equals(HIVE_CORRUPTED_COLUMN_STATISTICS.toErrorCode())) { + log.warn( + e, + "Corrupted statistics found when altering partition. Table: %s.%s. 
Partition: %s", + partition.getDatabaseName(), + partition.getTableName(), + partition.getValues()); + return PartitionStatistics.empty(); + } + throw e; + } + } + private void prepareAddPartition(HdfsContext context, PartitionAndMore partitionAndMore) { deleteOnly = false;