From 90f1a9b479967c1290f25c49fe96542398f5422d Mon Sep 17 00:00:00 2001
From: Raunaq Morarka
Date: Fri, 17 Apr 2026 21:43:06 +0530
Subject: [PATCH] Return unknown equality estimate when NDV and range are unknown

On a column with unknown NDV and an unbounded range,
StatisticRange.overlapPercentWith falls back to the infinite-to-infinite
0.5 heuristic, which is meant for range overlap, not point equality. It
yielded 0.5 * non-null rows per equality, causing an IN list to saturate
at the full non-null row count and $not(IN) to subtract to 0.
---
 .../trino/cost/ComparisonStatsCalculator.java |  9 ++++++
 .../trino/cost/TestFilterStatsCalculator.java | 31 +++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/core/trino-main/src/main/java/io/trino/cost/ComparisonStatsCalculator.java b/core/trino-main/src/main/java/io/trino/cost/ComparisonStatsCalculator.java
index fa7b7b1efa02..baea12f972c1 100644
--- a/core/trino-main/src/main/java/io/trino/cost/ComparisonStatsCalculator.java
+++ b/core/trino-main/src/main/java/io/trino/cost/ComparisonStatsCalculator.java
@@ -67,6 +67,15 @@ private static PlanNodeStatsEstimate estimateExpressionEqualToLiteral(
             filterRange = new StatisticRange(literalValue.getAsDouble(), literalValue.getAsDouble(), 1);
         }
         else {
+            // When the literal cannot be represented as a double and the column has no NDV
+            // and no range, StatisticRange.overlapPercentWith falls back to the
+            // infinite-to-infinite 0.5 heuristic, which is meant for range overlap, not point
+            // equality. Treat the selectivity as unknown instead.
+            if (isNaN(expressionStatistics.getDistinctValuesCount())
+                    && !isFinite(expressionStatistics.getLowValue())
+                    && !isFinite(expressionStatistics.getHighValue())) {
+                return PlanNodeStatsEstimate.unknown();
+            }
             filterRange = new StatisticRange(NEGATIVE_INFINITY, POSITIVE_INFINITY, 1);
         }
         return estimateFilterRange(inputStatistics, expressionStatistics, expressionSymbol, filterRange);
diff --git a/core/trino-main/src/test/java/io/trino/cost/TestFilterStatsCalculator.java b/core/trino-main/src/test/java/io/trino/cost/TestFilterStatsCalculator.java
index d340d969e5f1..39fc7b4d93ed 100644
--- a/core/trino-main/src/test/java/io/trino/cost/TestFilterStatsCalculator.java
+++ b/core/trino-main/src/test/java/io/trino/cost/TestFilterStatsCalculator.java
@@ -864,6 +864,37 @@ public void testSparseColumnInPredicateOverlap()
                                 .nullsFraction(0.0));
     }
 
+    @Test
+    public void testNotInOnColumnWithUnknownNdvAndRange()
+    {
+        // Regression: on a varchar column with unknown NDV and unbounded range,
+        // `c NOT IN ('a', 'b')` used to collapse to 0 rows. Each per-value equality
+        // returned a 0.5 heuristic selectivity, the IN sum saturated at the full
+        // non-null row count, and $not(IN) subtracted to 0.
+
+        VarcharType type = createVarcharType(16);
+        Symbol column = new Symbol(type, "c");
+        Reference ref = new Reference(type, "c");
+
+        SymbolStatsEstimate columnStats = SymbolStatsEstimate.builder()
+                .setAverageRowSize(NaN)
+                .setDistinctValuesCount(NaN)
+                .setLowValue(NEGATIVE_INFINITY)
+                .setHighValue(POSITIVE_INFINITY)
+                .setNullsFraction(0)
+                .build();
+        PlanNodeStatsEstimate input = PlanNodeStatsEstimate.builder()
+                .addSymbolStatistics(column, columnStats)
+                .setOutputRowCount(1000)
+                .build();
+
+        Constant a = new Constant(type, Slices.utf8Slice("a"));
+        Constant b = new Constant(type, Slices.utf8Slice("b"));
+
+        // NOT IN on an unknown column yields an unknown estimate rather than a fabricated row count.
+        assertExpression(not(new In(ref, ImmutableList.of(a, b))), input).outputRowsCountUnknown();
+    }
+
     private PlanNodeStatsAssertion assertExpression(Expression expression)
     {
         return assertExpression(expression, session);