Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,26 @@ private PlanNodeStatsEstimate estimateExpressionNotEqualToLiteral(
else {
filterRange = new StatisticRange(NEGATIVE_INFINITY, true, POSITIVE_INFINITY, true, 1);
}
double filterFactor = 1 - calculateFilterFactor(expressionStatistics, filterRange);

double filterFactor;
double expressionNDV = expressionStatistics.getDistinctValuesCount();
if (Double.compare(expressionNDV, 1D) == 0) {
// It's hard to make a meaningful estimate when we have only one distinct value
filterFactor = UNKNOWN_FILTER_COEFFICIENT;
}
else {
filterFactor = 1 - calculateFilterFactor(expressionStatistics, filterRange);
}

PlanNodeStatsEstimate.Builder estimate = PlanNodeStatsEstimate.buildFrom(inputStatistics);
estimate.setOutputRowCount(filterFactor * (1 - expressionStatistics.getNullsFraction()) * inputStatistics.getOutputRowCount());
if (expressionVariable.isPresent()) {
// If the original NDV was 1, we do not make any changes to the new estimate, since we're not sure if we eliminated the only distinct value
// Otherwise, we reduce the NDV by 1 (unless it was already 0)
double newNDV = Double.compare(expressionNDV, 1D) == 0 ? 1 : max(expressionNDV - 1, 0);
VariableStatsEstimate symbolNewEstimate = buildFrom(expressionStatistics)
.setNullsFraction(0.0)
.setDistinctValuesCount(max(expressionStatistics.getDistinctValuesCount() - 1, 0))
.setDistinctValuesCount(newNDV)
.build();
estimate = estimate.addVariableStatistics(expressionVariable.get(), symbolNewEstimate);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,21 @@
import static java.lang.Double.NaN;
import static java.lang.Double.isInfinite;
import static java.lang.Double.isNaN;
import static java.lang.Double.max;
import static java.lang.Double.min;
import static java.lang.String.format;
import static java.util.Collections.emptyMap;
import static java.util.Objects.requireNonNull;

public class FilterStatsCalculator
{
/**
 * Upper bound, expressed as a fraction of the non-null input rows, on the row count estimated for an IN predicate.
 * The IN estimator sums up the estimates of the individual list values, so without this cap the combined
 * estimate could reach the full non-null row count (a filter factor of 1.0).
 * NOT IN is affected in the same way: since IN never selects every row, a NOT IN clause is never
 * given a filter factor of 0.0.
 */
static final double CEIL_IN_PREDICATE_UPPER_BOUND_COEFFICIENT = 0.8;
/**
 * Filter factor applied when no meaningful selectivity estimate can be made, for example a
 * not-equal-to-literal comparison on an expression that has exactly one distinct value.
 */
static final double UNKNOWN_FILTER_COEFFICIENT = 0.9;

private final Metadata metadata;
Expand Down Expand Up @@ -403,9 +411,11 @@ protected PlanNodeStatsEstimate visitInPredicate(InPredicate node, Void context)
}

double notNullValuesBeforeIn = input.getOutputRowCount() * (1 - valueStats.getNullsFraction());
double ceiledInEstimated = max(notNullValuesBeforeIn * CEIL_IN_PREDICATE_UPPER_BOUND_COEFFICIENT, 1.0);
double inEstimateRowCount = min(inEstimate.getOutputRowCount(), ceiledInEstimated);

PlanNodeStatsEstimate.Builder result = PlanNodeStatsEstimate.buildFrom(input);
result.setOutputRowCount(min(inEstimate.getOutputRowCount(), notNullValuesBeforeIn));
result.setOutputRowCount(inEstimateRowCount);

if (node.getValue() instanceof SymbolReference) {
VariableReferenceExpression valueVariable = toVariable(node.getValue());
Expand Down Expand Up @@ -774,9 +784,11 @@ private PlanNodeStatsEstimate estimateIn(RowExpression value, List<RowExpression
}

double notNullValuesBeforeIn = input.getOutputRowCount() * (1 - valueStats.getNullsFraction());
double ceiledInEstimated = max(notNullValuesBeforeIn * CEIL_IN_PREDICATE_UPPER_BOUND_COEFFICIENT, 1.0);
double inEstimateRowCount = min(inEstimate.getOutputRowCount(), ceiledInEstimated);

PlanNodeStatsEstimate.Builder result = PlanNodeStatsEstimate.buildFrom(input);
result.setOutputRowCount(min(inEstimate.getOutputRowCount(), notNullValuesBeforeIn));
result.setOutputRowCount(inEstimateRowCount);

if (value instanceof VariableReferenceExpression) {
VariableReferenceExpression valueVariable = (VariableReferenceExpression) value;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import com.facebook.presto.sql.tree.Expression;
import com.google.common.collect.ImmutableList;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.util.Optional;
Expand All @@ -37,6 +38,7 @@
import static java.lang.Double.POSITIVE_INFINITY;
import static java.lang.String.format;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertNotEquals;

public abstract class AbstractTestFilterStatsCalculator
{
Expand All @@ -63,6 +65,36 @@ public AbstractTestFilterStatsCalculator(boolean withHistograms)
.build();
}

/**
 * Supplies filter expressions over a variable whose NDV is 1, each paired with the output
 * row count expected once the filter has been applied.
 * The expected counts assume that the input plan node has a row count of 100.
 *
 * @return one row per case: {expression string, expected output row count}
 */
@DataProvider
public static Object[][] ndv1Expressions()
{
    Object[][] cases = {
            // 100 * UNKNOWN_FILTER_COEFFICIENT
            {"name <> 'bar'", 90D},
            // 100 * UNKNOWN_FILTER_COEFFICIENT * UNKNOWN_FILTER_COEFFICIENT
            {"name <> 'name' AND name <> 'bar'", 81D},
            // 100 * UNKNOWN_FILTER_COEFFICIENT
            {"name <> 'foo' OR name is NULL", 90D},
            // 100 * UNKNOWN_FILTER_COEFFICIENT
            {"name is NULL OR name <> 'foo'", 90D},
    };
    return cases;
}

/**
 * Supplies comma-separated IN-list bodies of increasing length (one to seven string literals).
 * Each row holds a single argument: the text to place between the parentheses of an IN clause.
 */
@DataProvider
public static Object[][] inList()
{
    String[] listBodies = {
            "'one'",
            "'one','two'",
            "'one','two','three'",
            "'one','two','three','four'",
            "'one','two','three','four','five'",
            "'one','two','three','four','five','six'",
            "'one','two','three','four','five','six', 'seven'",
    };

    // Wrap every list body in its own single-element argument row
    Object[][] rows = new Object[listBodies.length][];
    for (int i = 0; i < listBodies.length; i++) {
        rows[i] = new Object[] {listBodies[i]};
    }
    return rows;
}

@BeforeClass
public void setUp()
throws Exception
Expand Down Expand Up @@ -500,6 +532,50 @@ public void testSymbolEqualsSameSymbolFilter()
.build());
}

/**
 * Checks the row count estimated for {@code status IN (...)} when the variable's nulls fraction
 * is known but its distinct values count is not.
 *
 * @param inList comma-separated literals placed inside the IN clause
 */
@Test(dataProvider = "inList")
public void testInPredicateWithoutNDV(String inList)
{
    Expression predicate = expression("status IN (" + inList + ")");
    VariableReferenceExpression status = new VariableReferenceExpression(Optional.empty(), "status", MEDIUM_VARCHAR_TYPE);
    TypeProvider types = TypeProvider.fromVariables(ImmutableList.of(status));

    RowExpression translated = translator.translateAndOptimize(predicate, types);

    // Only the nulls fraction is supplied; NDV is left unset. Stats propagation should still work
    VariableStatsEstimate statusStats = VariableStatsEstimate.builder()
            .setNullsFraction(0.0D)
            .build();

    PlanNodeStatsEstimate input = PlanNodeStatsEstimate.builder()
            .addVariableStatistics(status, statusStats)
            .setOutputRowCount(100D)
            .build();

    PlanNodeStatsEstimate filtered = statsCalculator.filterStats(input, translated, session);

    // The IN filter should always apply a filter factor strictly between 0 and 1 (never NaN/0/1):
    //  - one value: equivalent to an infinite range intersect, so the filter factor is
    //    StatisticRange#INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR (0.5),
    //    giving non-null-inputRowCount * 0.5 = 50
    //  - several values: the per-value estimates are summed, then capped at
    //    non-null-inputRowCount * CEIL_IN_PREDICATE_UPPER_BOUND_COEFFICIENT = 80
    int valueCount = inList.split(",").length;
    double expectedRowCount = (valueCount == 1) ? 50D : 80D;
    assertEquals(filtered.getOutputRowCount(), expectedRowCount);
}

/**
 * Checks that a NOT IN predicate over {@code mediumVarchar} never yields an estimated output row count of zero,
 * whatever the length of the IN list.
 *
 * @param inList comma-separated literals placed inside the NOT IN clause
 */
@Test(dataProvider = "inList")
public void testNotInPredicateEstimateIsNeverZero(String inList)
{
    Expression predicate = expression("mediumVarchar NOT IN (" + inList + ")");
    RowExpression translated = translator.translateAndOptimize(predicate, standardTypes);

    PlanNodeStatsEstimate filtered = statsCalculator.filterStats(standardInputStatistics, translated, session);
    double estimatedRows = filtered.getOutputRowCount();

    // NOTE(review): presumably a NaN (unknown) estimate would also satisfy this check - confirm that is acceptable
    assertNotEquals(estimatedRows, 0D, 0.0001D);
}

@Test
public void testInPredicateFilter()
{
Expand Down Expand Up @@ -588,7 +664,8 @@ public void testInPredicateFilter()

// More values in range than distinct values
assertExpression("z IN (DOUBLE '-1', 3.14e0, 0e0, 1e0, 2e0, 3e0, 4e0, 5e0, 6e0, 7e0, 8e0, DOUBLE '-2')")
.outputRowsCount(900.0)
// Range estimate is never the full-range, it's non-null count * CEIL_IN_PREDICATE_UPPER_BOUND_COEFFICIENT
.outputRowsCount(720.0)
.variableStats(new VariableReferenceExpression(Optional.empty(), "z", DOUBLE), variableStats ->
variableStats.distinctValuesCount(5.0)
.lowValue(-2.0)
Expand All @@ -605,6 +682,34 @@ public void testInPredicateFilter()
.nullsFraction(0.0));
}

/**
 * Checks the estimate for not-equals style predicates over a variable whose distinct values count is 1:
 * the output row count must match the expectation and the variable's own statistics must stay untouched.
 *
 * @param expressionStr predicate over the {@code name} variable
 * @param expectedOutputRowsCount row count expected after filtering an input of 100 rows
 */
@Test(dataProvider = "ndv1Expressions")
public void testNotEqualsOnVariablesWithNDV1(String expressionStr, double expectedOutputRowsCount)
{
    Expression predicate = expression(expressionStr);

    VariableReferenceExpression name = new VariableReferenceExpression(Optional.empty(), "name", MEDIUM_VARCHAR_TYPE);
    TypeProvider types = TypeProvider.fromVariables(ImmutableList.of(name));
    RowExpression translated = translator.translateAndOptimize(predicate, types);

    // A single distinct value and no nulls
    VariableStatsEstimate singleValueStats = VariableStatsEstimate.builder()
            .setNullsFraction(0D)
            .setDistinctValuesCount(1D)
            .build();

    PlanNodeStatsEstimate input = PlanNodeStatsEstimate.builder()
            .addVariableStatistics(name, singleValueStats)
            .setOutputRowCount(100D)
            .build();

    PlanNodeStatsEstimate filtered = statsCalculator.filterStats(input, translated, session);

    PlanNodeStatsAssertion.assertThat(filtered)
            .outputRowsCount(expectedOutputRowsCount)
            // The variable statistics are expected to be left as they were
            .variableStats(name, variableStats -> variableStats.distinctValuesCount(1D).nullsFraction(0D));
}

protected PlanNodeStatsAssertion assertExpression(String expression)
{
return assertExpression(expression(expression));
Expand Down
Loading