Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/144942.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
area: ES|QL
issues:
- 144914
pr: 144942
summary: Correctly manage NULL data type for SUM
type: bug
230 changes: 230 additions & 0 deletions x-pack/plugin/esql/qa/testFixtures/src/main/resources/null.csv-spec
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

++, thank you!

Original file line number Diff line number Diff line change
Expand Up @@ -584,3 +584,233 @@ FROM employees | WHERE emp_no IN (10001, null) AND first_name IS NOT NULL | SORT
emp_no:integer | first_name:keyword
10001 | Georgi
;

sumOfNullWithEval
required_capability: fix_sum_of_null_optimization

ROW x = 1
| STATS s = SUM(null)
| EVAL r = s + 1
;

s:null | r:integer
null | null
;

sumOfNullPlusNullWithEval
required_capability: fix_sum_of_null_optimization

ROW x = 1
| STATS s = SUM(null + null)
| EVAL r = s + 1
;

s:null | r:integer
null | null
;

sumOfNullPlusOneWithEval

ROW x = 1
| STATS s = SUM(null + 1)
| EVAL r = s + 1
;

s:long | r:long
null | null
Comment on lines +619 to +620
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Subtle, but looks right!

;

sumOfEvalNullWithEval
required_capability: fix_sum_of_null_optimization

ROW x = 1
| EVAL y = null
| STATS s = SUM(y)
| EVAL r = s + 1
;

s:null | r:integer
null | null
;

sumOfNullifiedUnmappedWithEval
required_capability: fix_sum_of_null_optimization
required_capability: optional_fields_nullify_tech_preview

SET unmapped_fields="nullify"\;
ROW x = 1
| STATS s = SUM(foo)
| EVAL r = s + 1
;

s:null | r:integer
null | null
;

avgOfNullWithEval

ROW x = 1
| STATS a = AVG(null)
| EVAL r = a + 1
;

a:double | r:double
null | null
;

avgOfNullPlusNullWithEval

ROW x = 1
| STATS a = AVG(null + null)
| EVAL r = a + 1
;

a:double | r:double
null | null
;

avgOfEvalNullWithEval

ROW x = 1
| EVAL y = null
| STATS a = AVG(y)
| EVAL r = a + 1
;

a:double | r:double
null | null
;

avgOfNullifiedUnmappedWithEval
required_capability: optional_fields_nullify_tech_preview

SET unmapped_fields="nullify"\;
ROW x = 1
| STATS a = AVG(foo)
| EVAL r = a + 1
;

a:double | r:double
null | null
;

minOfNullWithEval

ROW x = 1
| STATS m = MIN(null)
| EVAL r = m + 1
;

m:null | r:integer
null | null
;

minOfEvalNullWithEval
required_capability: fix_sum_of_null_optimization

ROW x = 1
| EVAL y = null
| STATS m = MIN(y)
| EVAL r = m + 1
;

m:null | r:integer
null | null
;

minOfNullifiedUnmappedWithEval
required_capability: optional_fields_nullify_tech_preview

SET unmapped_fields="nullify"\;
ROW x = 1
| STATS m = MIN(foo)
| EVAL r = m + 1
;

m:null | r:integer
null | null
;

maxOfNullWithEval

ROW x = 1
| STATS m = MAX(null)
| EVAL r = m + 1
;

m:null | r:integer
null | null
;

maxOfEvalNullWithEval
required_capability: fix_sum_of_null_optimization

ROW x = 1
| EVAL y = null
| STATS m = MAX(y)
| EVAL r = m + 1
;

m:null | r:integer
null | null
;

maxOfNullifiedUnmappedWithEval
required_capability: optional_fields_nullify_tech_preview

SET unmapped_fields="nullify"\;
ROW x = 1
| STATS m = MAX(foo)
| EVAL r = m + 1
;

m:null | r:integer
null | null
;

multipleAggsOverNullExpressions
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests starting with this one were inspired by my previous unmerged PR #112392.

required_capability: fix_sum_of_null_optimization

ROW x=1, y=2
| STATS c=count(null + 1),
c_d=count_distinct(1 + null),
a=avg(null + x),
mi=min(y + null),
ma=max(y + x * null),
s=sum(null)
;

c:long | c_d:long | a:double | mi:integer | ma:integer | s:null
0 |0 |null |null |null |null
;

countOfNullWithGrouping

ROW a = [1,2,3], c = 5 | EVAL c = null + c | STATS COUNT(c), COUNT(null), COUNT(null - 1) BY a
;

COUNT(c):long |COUNT(null):long|COUNT(null - 1):long|a:integer
0 |0 |0 |1
0 |0 |0 |2
0 |0 |0 |3
;

countOfNullGroupedByFoldableNull

ROW a = 1+null, c = 5 | EVAL c = null + c | STATS COUNT(c), COUNT(null), COUNT(null - 1) BY a
;

COUNT(c):long |COUNT(null):long|COUNT(null - 1):long|a:integer
0 |0 |0 |null
;

countDistinctOfNullWithGrouping

ROW a = [1,2,3], c = 5 | EVAL c = null + c | STATS COUNT_DISTINCT(c), COUNT_DISTINCT(null), COUNT_DISTINCT(null - 1) BY a
;

COUNT_DISTINCT(c):long|COUNT_DISTINCT(null):long|COUNT_DISTINCT(null - 1):long|a:integer
0 |0 |0 |1
0 |0 |0 |2
0 |0 |0 |3
;
Original file line number Diff line number Diff line change
Expand Up @@ -2011,11 +2011,21 @@ s2point1:l | s_mv:l | s_null:l | s_expr:l | s_expr_null:l | rows:l | languages:i

sumOfConst#[skip:-8.13.99,reason:supported in 8.14]
FROM employees
| STATS s1 = sum(1), s2point1 = sum(2.1), s_mv = sum([-1, 0, 3]) * 3, s_null = sum(null), s_expr = sum(1+1), s_expr_null = sum(1+null), rows = count(*)
| STATS s1 = sum(1), s2point1 = sum(2.1), s_mv = sum([-1, 0, 3]) * 3, s_expr = sum(1+1), s_expr_null = sum(1+null), rows = count(*)
;

s1:l | s2point1:d | s_mv:l | s_null:d | s_expr:l | s_expr_null:l | rows:l
100 | 210.0 | 600 | null | 200 | null | 100
s1:l | s2point1:d | s_mv:l | s_expr:l | s_expr_null:l | rows:l
100 | 210.0 | 600 | 200 | null | 100
;

sumOfNullConst
required_capability: fix_sum_of_null_optimization
FROM employees
| STATS s_null = sum(null)
;

s_null:null
null
;

sumOfConstGrouped#[skip:-8.13.99,reason:supported in 8.14]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -291,13 +291,14 @@ foo:integer

statsAggs
required_capability: optional_fields_nullify_tech_preview
required_capability: fix_sum_of_null_optimization

SET unmapped_fields="nullify"\;
ROW x = 1
| STATS s = SUM(foo)
;

s:double
s:null
null
;

Expand All @@ -315,13 +316,14 @@ null

statsAggsGrouped
required_capability: optional_fields_nullify_tech_preview
required_capability: fix_sum_of_null_optimization

SET unmapped_fields="nullify"\;
ROW x = 1
| STATS s = SUM(foo) BY bar
;

s:double | bar:null
s:null | bar:null
null | null
;

Expand Down Expand Up @@ -583,13 +585,14 @@ null | 1985 | null | abc | fork3

inlinestatsCount
required_capability: optional_fields_nullify_tech_preview
required_capability: fix_sum_of_null_optimization

SET unmapped_fields="nullify"\;
ROW x = 1
| INLINE STATS c = COUNT(*), s = SUM(does_not_exist) BY d = does_not_exist
;

x:integer | does_not_exist:null | c:long | s:double | d:null
x:integer | does_not_exist:null | c:long | s:null | d:null
1 | null | 1 | null | null
;

Expand Down Expand Up @@ -818,14 +821,15 @@ null | 2024-05-10T00:00:00.000Z
maxOverTimeOfDoubleNoGroupingUnmappedNullify
required_capability: optional_fields_nullify_tech_preview
required_capability: ts_command_v0
required_capability: fix_sum_of_null_optimization

SET unmapped_fields="nullify"\;
TS k8s_unmapped
| STATS cost=sum(max_over_time(network.cost)) BY time_bucket = bucket(@timestamp,1minute)
| SORT time_bucket | LIMIT 10
;

cost:double | time_bucket:datetime
cost:null | time_bucket:datetime
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

++!

null | 2024-05-10T00:00:00.000Z
null | 2024-05-10T00:01:00.000Z
null | 2024-05-10T00:02:00.000Z
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2384,6 +2384,12 @@ public enum Cap {
*/
UNMAPPED_FIELDS_DEFAULT_SETTING_RENAME,

/**
* Fix for {@code SUM(null)} producing a type mismatch after surrogate expansion.
* See https://github.com/elastic/elasticsearch/issues/144914
*/
FIX_SUM_OF_NULL_OPTIMIZATION,

// Last capability should still have a comma for fewer merge conflicts when adding new ones :)
// This comment prevents the semicolon from being on the previous capability when Spotless formats the file.
;
Expand Down
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fix looks right to me.

Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
import static org.elasticsearch.xpack.esql.core.type.DataType.DOUBLE;
import static org.elasticsearch.xpack.esql.core.type.DataType.EXPONENTIAL_HISTOGRAM;
import static org.elasticsearch.xpack.esql.core.type.DataType.LONG;
import static org.elasticsearch.xpack.esql.core.type.DataType.NULL;
import static org.elasticsearch.xpack.esql.core.type.DataType.UNSIGNED_LONG;

/**
Expand Down Expand Up @@ -121,6 +122,9 @@ public Sum withFilter(Expression filter) {
@Override
public DataType dataType() {
DataType dt = field().dataType();
if (dt == DataType.NULL) {
return DataType.NULL;
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMHO, a NULL data type should produce a NULL result. If it provides a LONG, it could very well provide a DOUBLE, I see no difference. Plus that a NULL result is consistent with other (most) of the functions we have now.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about this in #142657.

Generally, I think a NULL input should produce an output that is compatible with ("narrower" than) any of the outputs created if the NULL input was replaced by an input with a proper data type. The NULL type is (or should be) a bottom type in ESQL, so it should always be acceptable as output for a NULL input.

For SUM specifically, the output being either long or double, having a long output may also be acceptable. I think NULL is better (and more consistent with SQL!), but long is also acceptable because it is "narrower" (can be used in more places) than double; for instance, CASE(predicate, 1::long, x) requires x to be long, whereas for CASE(predicate, 1::double, x) both a double and a long are okay because CASE implicitly does a little bit of auto-casting.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The practical implication is that, for SUM(foo) used in a nullify query, some expressions depending on SUM(foo) start breaking if foo goes mapped->unmapped as long as SUM(null) is double. They shouldn't start breaking if SUM(null) is long or NULL.

}
if (dt == DataType.DENSE_VECTOR) {
return DataType.DENSE_VECTOR;
}
Expand Down Expand Up @@ -211,9 +215,14 @@ public Expression surrogate() {
);
}

// SUM(const) is equivalent to MV_SUM(const)*COUNT(*).
return field.foldable()
? new Mul(s, new MvSum(s, field), new Count(s, Literal.keyword(s, StringUtils.WILDCARD), filter(), window()))
: null;
if (field.foldable()) {
if (field().dataType() == NULL) {
return new Literal(s, null, NULL);
}
// SUM(const) is equivalent to MV_SUM(const)*COUNT(*).
return new Mul(s, new MvSum(s, field), new Count(s, Literal.keyword(s, StringUtils.WILDCARD), filter(), window()));
} else {
return null;
}
}
}
Loading
Loading