diff --git a/docs/changelog/111367.yaml b/docs/changelog/111367.yaml new file mode 100644 index 0000000000000..89e6c1d3b4da4 --- /dev/null +++ b/docs/changelog/111367.yaml @@ -0,0 +1,5 @@ +pr: 111367 +summary: "ESQL: Add Values aggregation tests, fix `ConstantBytesRefBlock` memory handling" +area: ES|QL +type: bug +issues: [] diff --git a/docs/reference/esql/functions/aggregation-functions.asciidoc b/docs/reference/esql/functions/aggregation-functions.asciidoc index 821b109741a0a..518aee563e952 100644 --- a/docs/reference/esql/functions/aggregation-functions.asciidoc +++ b/docs/reference/esql/functions/aggregation-functions.asciidoc @@ -9,8 +9,8 @@ The <> command supports these aggregate functions: // tag::agg_list[] * <> -* <> -* <> +* <> +* <> * <> * <> * <> @@ -19,13 +19,13 @@ The <> command supports these aggregate functions: * experimental:[] <> * <> * <> -* <> +* <> * experimental:[] <> // end::agg_list[] -include::count.asciidoc[] -include::count-distinct.asciidoc[] include::layout/avg.asciidoc[] +include::layout/count.asciidoc[] +include::layout/count_distinct.asciidoc[] include::layout/max.asciidoc[] include::layout/median.asciidoc[] include::layout/median_absolute_deviation.asciidoc[] @@ -34,5 +34,5 @@ include::layout/percentile.asciidoc[] include::layout/st_centroid_agg.asciidoc[] include::layout/sum.asciidoc[] include::layout/top.asciidoc[] -include::values.asciidoc[] +include::layout/values.asciidoc[] include::weighted-avg.asciidoc[] diff --git a/docs/reference/esql/functions/appendix/count_distinct.asciidoc b/docs/reference/esql/functions/appendix/count_distinct.asciidoc new file mode 100644 index 0000000000000..065065cf34e06 --- /dev/null +++ b/docs/reference/esql/functions/appendix/count_distinct.asciidoc @@ -0,0 +1,25 @@ +// This is generated by ESQL's AbstractFunctionTestCase. Do no edit it. See ../README.md for how to regenerate it. 
+ +[discrete] +[[esql-agg-count-distinct-approximate]] +==== Counts are approximate + +Computing exact counts requires loading values into a set and returning its +size. This doesn't scale when working on high-cardinality sets and/or large +values as the required memory usage and the need to communicate those +per-shard sets between nodes would utilize too many resources of the cluster. + +This `COUNT_DISTINCT` function is based on the +https://static.googleusercontent.com/media/research.google.com/fr//pubs/archive/40671.pdf[HyperLogLog++] +algorithm, which counts based on the hashes of the values with some interesting +properties: + +include::../../../aggregations/metrics/cardinality-aggregation.asciidoc[tag=explanation] + +The `COUNT_DISTINCT` function takes an optional second parameter to configure +the precision threshold. The `precision_threshold` option lets you trade memory +for accuracy, and defines a unique count below which counts are expected to be +close to accurate. Above this value, counts might become a bit more fuzzy. The +maximum supported value is 40000; thresholds above this number will have the +same effect as a threshold of 40000. The default value is `3000`. + diff --git a/docs/reference/esql/functions/appendix/values.asciidoc b/docs/reference/esql/functions/appendix/values.asciidoc new file mode 100644 index 0000000000000..ec3cfff2db6a6 --- /dev/null +++ b/docs/reference/esql/functions/appendix/values.asciidoc @@ -0,0 +1,10 @@ +// This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. + +[WARNING] +==== +This can use a significant amount of memory and ES|QL doesn't yet +grow aggregations beyond memory. So this aggregation will work until +it is used to collect more values than can fit into memory. Once it +collects too many values it will fail the query with +a <<circuit-breaker-errors, Circuit Breaker Error>>. 
+==== diff --git a/docs/reference/esql/functions/count-distinct.asciidoc b/docs/reference/esql/functions/count-distinct.asciidoc deleted file mode 100644 index a9f30d24e0e83..0000000000000 --- a/docs/reference/esql/functions/count-distinct.asciidoc +++ /dev/null @@ -1,85 +0,0 @@ -[discrete] -[[esql-agg-count-distinct]] -=== `COUNT_DISTINCT` - -*Syntax* - -[source,esql] ----- -COUNT_DISTINCT(expression[, precision_threshold]) ----- - -*Parameters* - -`expression`:: -Expression that outputs the values on which to perform a distinct count. - -`precision_threshold`:: -Precision threshold. Refer to <<esql-agg-count-distinct-approximate>>. The -maximum supported value is 40000. Thresholds above this number will have the -same effect as a threshold of 40000. The default value is 3000. - -*Description* - -Returns the approximate number of distinct values. - -*Supported types* - -Can take any field type as input. - -*Examples* - -[source.merge.styled,esql] ----- -include::{esql-specs}/stats_count_distinct.csv-spec[tag=count-distinct] ----- -[%header.monospaced.styled,format=dsv,separator=|] -|=== -include::{esql-specs}/stats_count_distinct.csv-spec[tag=count-distinct-result] -|=== - -With the optional second parameter to configure the precision threshold: - -[source.merge.styled,esql] ----- -include::{esql-specs}/stats_count_distinct.csv-spec[tag=count-distinct-precision] ----- -[%header.monospaced.styled,format=dsv,separator=|] -|=== -include::{esql-specs}/stats_count_distinct.csv-spec[tag=count-distinct-precision-result] -|=== - -The expression can use inline functions. 
This example splits a string into -multiple values using the `SPLIT` function and counts the unique values: - -[source.merge.styled,esql] ----- -include::{esql-specs}/stats_count_distinct.csv-spec[tag=docsCountDistinctWithExpression] ----- -[%header.monospaced.styled,format=dsv,separator=|] -|=== -include::{esql-specs}/stats_count_distinct.csv-spec[tag=docsCountDistinctWithExpression-result] -|=== - -[discrete] -[[esql-agg-count-distinct-approximate]] -==== Counts are approximate - -Computing exact counts requires loading values into a set and returning its -size. This doesn't scale when working on high-cardinality sets and/or large -values as the required memory usage and the need to communicate those -per-shard sets between nodes would utilize too many resources of the cluster. - -This `COUNT_DISTINCT` function is based on the -https://static.googleusercontent.com/media/research.google.com/fr//pubs/archive/40671.pdf[HyperLogLog++] -algorithm, which counts based on the hashes of the values with some interesting -properties: - -include::../../aggregations/metrics/cardinality-aggregation.asciidoc[tag=explanation] - -The `COUNT_DISTINCT` function takes an optional second parameter to configure -the precision threshold. The precision_threshold options allows to trade memory -for accuracy, and defines a unique count below which counts are expected to be -close to accurate. Above this value, counts might become a bit more fuzzy. The -maximum supported value is 40000, thresholds above this number will have the -same effect as a threshold of 40000. The default value is `3000`. \ No newline at end of file diff --git a/docs/reference/esql/functions/description/count.asciidoc b/docs/reference/esql/functions/description/count.asciidoc new file mode 100644 index 0000000000000..ee806d65a8ea3 --- /dev/null +++ b/docs/reference/esql/functions/description/count.asciidoc @@ -0,0 +1,5 @@ +// This is generated by ESQL's AbstractFunctionTestCase. Do no edit it. 
See ../README.md for how to regenerate it. + +*Description* + +Returns the total number (count) of input values. diff --git a/docs/reference/esql/functions/description/count_distinct.asciidoc b/docs/reference/esql/functions/description/count_distinct.asciidoc new file mode 100644 index 0000000000000..d10825bb991f5 --- /dev/null +++ b/docs/reference/esql/functions/description/count_distinct.asciidoc @@ -0,0 +1,5 @@ +// This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. + +*Description* + +Returns the approximate number of distinct values. diff --git a/docs/reference/esql/functions/description/values.asciidoc b/docs/reference/esql/functions/description/values.asciidoc new file mode 100644 index 0000000000000..b3cebcce955f0 --- /dev/null +++ b/docs/reference/esql/functions/description/values.asciidoc @@ -0,0 +1,5 @@ +// This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. + +*Description* + +Returns all values in a group as a multivalued field. The order of the returned values isn't guaranteed. If you need the values returned in order use <<esql-mv_sort>>. diff --git a/docs/reference/esql/functions/count.asciidoc b/docs/reference/esql/functions/examples/count.asciidoc similarity index 63% rename from docs/reference/esql/functions/count.asciidoc rename to docs/reference/esql/functions/examples/count.asciidoc index 66cfe76350cdd..fb696b51e054c 100644 --- a/docs/reference/esql/functions/count.asciidoc +++ b/docs/reference/esql/functions/examples/count.asciidoc @@ -1,27 +1,4 @@ -[discrete] -[[esql-agg-count]] -=== `COUNT` - -*Syntax* - -[source,esql] ----- -COUNT([expression]) ----- - -*Parameters* - -`expression`:: -Expression that outputs values to be counted. -If omitted, equivalent to `COUNT(*)` (the number of rows). - -*Description* - -Returns the total number (count) of input values. - -*Supported types* - -Can take any field type as input. 
+// This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. *Examples* @@ -33,9 +10,7 @@ include::{esql-specs}/stats.csv-spec[tag=count] |=== include::{esql-specs}/stats.csv-spec[tag=count-result] |=== - -To count the number of rows, use `COUNT()` or `COUNT(*)`: - +To count the number of rows, use `COUNT()` or `COUNT(*)` [source.merge.styled,esql] ---- include::{esql-specs}/docs.csv-spec[tag=countAll] @@ -44,10 +19,7 @@ include::{esql-specs}/docs.csv-spec[tag=countAll] |=== include::{esql-specs}/docs.csv-spec[tag=countAll-result] |=== - -The expression can use inline functions. This example splits a string into -multiple values using the `SPLIT` function and counts the values: - +The expression can use inline functions. This example splits a string into multiple values using the `SPLIT` function and counts the values [source.merge.styled,esql] ---- include::{esql-specs}/stats.csv-spec[tag=docsCountWithExpression] @@ -56,11 +28,7 @@ include::{esql-specs}/stats.csv-spec[tag=docsCountWithExpression] |=== include::{esql-specs}/stats.csv-spec[tag=docsCountWithExpression-result] |=== - -[[esql-agg-count-or-null]] -To count the number of times an expression returns `TRUE` use -a <<esql-where>> command to remove rows that shouldn't be included: - +To count the number of times an expression returns `TRUE` use a <<esql-where>> command to remove rows that shouldn't be included [source.merge.styled,esql] ---- include::{esql-specs}/stats.csv-spec[tag=count-where] @@ -69,10 +37,7 @@ include::{esql-specs}/stats.csv-spec[tag=count-where] |=== include::{esql-specs}/stats.csv-spec[tag=count-where-result] |=== - -To count the same stream of data based on two different expressions -use the pattern `COUNT(<expression> OR NULL)`: - +To count the same stream of data based on two different expressions use the pattern `COUNT(<expression> OR NULL)` [source.merge.styled,esql] ---- include::{esql-specs}/stats.csv-spec[tag=count-or-null] @@ -81,3 +46,4 @@ 
include::{esql-specs}/stats.csv-spec[tag=count-or-null] |=== include::{esql-specs}/stats.csv-spec[tag=count-or-null-result] |=== + diff --git a/docs/reference/esql/functions/examples/count_distinct.asciidoc b/docs/reference/esql/functions/examples/count_distinct.asciidoc new file mode 100644 index 0000000000000..44968c0652ec0 --- /dev/null +++ b/docs/reference/esql/functions/examples/count_distinct.asciidoc @@ -0,0 +1,31 @@ +// This is generated by ESQL's AbstractFunctionTestCase. Do no edit it. See ../README.md for how to regenerate it. + +*Examples* + +[source.merge.styled,esql] +---- +include::{esql-specs}/stats_count_distinct.csv-spec[tag=count-distinct] +---- +[%header.monospaced.styled,format=dsv,separator=|] +|=== +include::{esql-specs}/stats_count_distinct.csv-spec[tag=count-distinct-result] +|=== +With the optional second parameter to configure the precision threshold +[source.merge.styled,esql] +---- +include::{esql-specs}/stats_count_distinct.csv-spec[tag=count-distinct-precision] +---- +[%header.monospaced.styled,format=dsv,separator=|] +|=== +include::{esql-specs}/stats_count_distinct.csv-spec[tag=count-distinct-precision-result] +|=== +The expression can use inline functions. This example splits a string into multiple values using the `SPLIT` function and counts the unique values +[source.merge.styled,esql] +---- +include::{esql-specs}/stats_count_distinct.csv-spec[tag=docsCountDistinctWithExpression] +---- +[%header.monospaced.styled,format=dsv,separator=|] +|=== +include::{esql-specs}/stats_count_distinct.csv-spec[tag=docsCountDistinctWithExpression-result] +|=== + diff --git a/docs/reference/esql/functions/examples/values.asciidoc b/docs/reference/esql/functions/examples/values.asciidoc new file mode 100644 index 0000000000000..c013fc39d92ca --- /dev/null +++ b/docs/reference/esql/functions/examples/values.asciidoc @@ -0,0 +1,13 @@ +// This is generated by ESQL's AbstractFunctionTestCase. Do no edit it. See ../README.md for how to regenerate it. 
+ +*Example* + +[source.merge.styled,esql] +---- +include::{esql-specs}/string.csv-spec[tag=values-grouped] +---- +[%header.monospaced.styled,format=dsv,separator=|] +|=== +include::{esql-specs}/string.csv-spec[tag=values-grouped-result] +|=== + diff --git a/docs/reference/esql/functions/kibana/definition/count.json b/docs/reference/esql/functions/kibana/definition/count.json new file mode 100644 index 0000000000000..e05ebc6789816 --- /dev/null +++ b/docs/reference/esql/functions/kibana/definition/count.json @@ -0,0 +1,159 @@ +{ + "comment" : "This is generated by ESQL's AbstractFunctionTestCase. Do no edit it. See ../README.md for how to regenerate it.", + "type" : "agg", + "name" : "count", + "description" : "Returns the total number (count) of input values.", + "signatures" : [ + { + "params" : [ + { + "name" : "field", + "type" : "boolean", + "optional" : true, + "description" : "Expression that outputs values to be counted. If omitted, equivalent to `COUNT(*)` (the number of rows)." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "cartesian_point", + "optional" : true, + "description" : "Expression that outputs values to be counted. If omitted, equivalent to `COUNT(*)` (the number of rows)." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "datetime", + "optional" : true, + "description" : "Expression that outputs values to be counted. If omitted, equivalent to `COUNT(*)` (the number of rows)." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "double", + "optional" : true, + "description" : "Expression that outputs values to be counted. If omitted, equivalent to `COUNT(*)` (the number of rows)." 
+ } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "geo_point", + "optional" : true, + "description" : "Expression that outputs values to be counted. If omitted, equivalent to `COUNT(*)` (the number of rows)." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "integer", + "optional" : true, + "description" : "Expression that outputs values to be counted. If omitted, equivalent to `COUNT(*)` (the number of rows)." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "ip", + "optional" : true, + "description" : "Expression that outputs values to be counted. If omitted, equivalent to `COUNT(*)` (the number of rows)." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "keyword", + "optional" : true, + "description" : "Expression that outputs values to be counted. If omitted, equivalent to `COUNT(*)` (the number of rows)." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "long", + "optional" : true, + "description" : "Expression that outputs values to be counted. If omitted, equivalent to `COUNT(*)` (the number of rows)." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "text", + "optional" : true, + "description" : "Expression that outputs values to be counted. If omitted, equivalent to `COUNT(*)` (the number of rows)." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "unsigned_long", + "optional" : true, + "description" : "Expression that outputs values to be counted. If omitted, equivalent to `COUNT(*)` (the number of rows)." 
+ } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "version", + "optional" : true, + "description" : "Expression that outputs values to be counted. If omitted, equivalent to `COUNT(*)` (the number of rows)." + } + ], + "variadic" : false, + "returnType" : "long" + } + ], + "examples" : [ + "FROM employees\n| STATS COUNT(height)", + "FROM employees \n| STATS count = COUNT(*) BY languages \n| SORT languages DESC", + "ROW words=\"foo;bar;baz;qux;quux;foo\"\n| STATS word_count = COUNT(SPLIT(words, \";\"))", + "ROW n=1\n| WHERE n < 0\n| STATS COUNT(n)", + "ROW n=1\n| STATS COUNT(n > 0 OR NULL), COUNT(n < 0 OR NULL)" + ] +} diff --git a/docs/reference/esql/functions/kibana/definition/count_distinct.json b/docs/reference/esql/functions/kibana/definition/count_distinct.json new file mode 100644 index 0000000000000..801bd26f7d022 --- /dev/null +++ b/docs/reference/esql/functions/kibana/definition/count_distinct.json @@ -0,0 +1,607 @@ +{ + "comment" : "This is generated by ESQL's AbstractFunctionTestCase. Do no edit it. See ../README.md for how to regenerate it.", + "type" : "agg", + "name" : "count_distinct", + "description" : "Returns the approximate number of distinct values.", + "signatures" : [ + { + "params" : [ + { + "name" : "field", + "type" : "boolean", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "boolean", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "integer", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." 
+ } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "boolean", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "boolean", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "unsigned_long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "datetime", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "datetime", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "integer", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." 
+ } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "datetime", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "datetime", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "unsigned_long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "double", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "double", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "integer", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." 
+ } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "double", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "double", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "unsigned_long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "integer", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "integer", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "integer", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." 
+ } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "integer", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "integer", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "unsigned_long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "ip", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "ip", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "integer", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "ip", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." 
+ }, + { + "name" : "precision", + "type" : "long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "ip", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "unsigned_long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "keyword", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "keyword", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "integer", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "keyword", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. 
Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "keyword", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "unsigned_long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "long", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "long", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "integer", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "long", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." 
+ } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "long", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "unsigned_long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "text", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "text", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "integer", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "text", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "text", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." 
+ }, + { + "name" : "precision", + "type" : "unsigned_long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "version", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "version", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "integer", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "version", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "version", + "optional" : false, + "description" : "Column or literal for which to count the number of distinct values." + }, + { + "name" : "precision", + "type" : "unsigned_long", + "optional" : true, + "description" : "Precision threshold. Refer to <>. The maximum supported value is 40000. 
Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000." + } + ], + "variadic" : false, + "returnType" : "long" + } + ], + "examples" : [ + "FROM hosts\n| STATS COUNT_DISTINCT(ip0), COUNT_DISTINCT(ip1)", + "FROM hosts\n| STATS COUNT_DISTINCT(ip0, 80000), COUNT_DISTINCT(ip1, 5)", + "ROW words=\"foo;bar;baz;qux;quux;foo\"\n| STATS distinct_word_count = COUNT_DISTINCT(SPLIT(words, \";\"))" + ] +} diff --git a/docs/reference/esql/functions/kibana/definition/values.json b/docs/reference/esql/functions/kibana/definition/values.json new file mode 100644 index 0000000000000..3e0036c4d25b6 --- /dev/null +++ b/docs/reference/esql/functions/kibana/definition/values.json @@ -0,0 +1,119 @@ +{ + "comment" : "This is generated by ESQL's AbstractFunctionTestCase. Do no edit it. See ../README.md for how to regenerate it.", + "type" : "agg", + "name" : "values", + "description" : "Returns all values in a group as a multivalued field. The order of the returned values isn't guaranteed. 
If you need the values returned in order use <<esql-mv_sort>>.", + "signatures" : [ + { + "params" : [ + { + "name" : "field", + "type" : "boolean", + "optional" : false, + "description" : "" + } + ], + "variadic" : false, + "returnType" : "boolean" + }, + { + "params" : [ + { + "name" : "field", + "type" : "datetime", + "optional" : false, + "description" : "" + } + ], + "variadic" : false, + "returnType" : "datetime" + }, + { + "params" : [ + { + "name" : "field", + "type" : "double", + "optional" : false, + "description" : "" + } + ], + "variadic" : false, + "returnType" : "double" + }, + { + "params" : [ + { + "name" : "field", + "type" : "integer", + "optional" : false, + "description" : "" + } + ], + "variadic" : false, + "returnType" : "integer" + }, + { + "params" : [ + { + "name" : "field", + "type" : "ip", + "optional" : false, + "description" : "" + } + ], + "variadic" : false, + "returnType" : "ip" + }, + { + "params" : [ + { + "name" : "field", + "type" : "keyword", + "optional" : false, + "description" : "" + } + ], + "variadic" : false, + "returnType" : "keyword" + }, + { + "params" : [ + { + "name" : "field", + "type" : "long", + "optional" : false, + "description" : "" + } + ], + "variadic" : false, + "returnType" : "long" + }, + { + "params" : [ + { + "name" : "field", + "type" : "text", + "optional" : false, + "description" : "" + } + ], + "variadic" : false, + "returnType" : "text" + }, + { + "params" : [ + { + "name" : "field", + "type" : "version", + "optional" : false, + "description" : "" + } + ], + "variadic" : false, + "returnType" : "version" + } + ], + "examples" : [ + " FROM employees\n| EVAL first_letter = SUBSTRING(first_name, 0, 1)\n| STATS first_name=MV_SORT(VALUES(first_name)) BY first_letter\n| SORT first_letter" + ] +} diff --git a/docs/reference/esql/functions/kibana/docs/count.md b/docs/reference/esql/functions/kibana/docs/count.md new file mode 100644 index 0000000000000..dc9c356a847ed --- /dev/null +++ 
b/docs/reference/esql/functions/kibana/docs/count.md @@ -0,0 +1,11 @@ + + +### COUNT +Returns the total number (count) of input values. + +``` +FROM employees +| STATS COUNT(height) +``` diff --git a/docs/reference/esql/functions/kibana/docs/count_distinct.md b/docs/reference/esql/functions/kibana/docs/count_distinct.md new file mode 100644 index 0000000000000..a6b451bf9d38d --- /dev/null +++ b/docs/reference/esql/functions/kibana/docs/count_distinct.md @@ -0,0 +1,11 @@ + + +### COUNT_DISTINCT +Returns the approximate number of distinct values. + +``` +FROM hosts +| STATS COUNT_DISTINCT(ip0), COUNT_DISTINCT(ip1) +``` diff --git a/docs/reference/esql/functions/kibana/docs/values.md b/docs/reference/esql/functions/kibana/docs/values.md new file mode 100644 index 0000000000000..cba62fc27255e --- /dev/null +++ b/docs/reference/esql/functions/kibana/docs/values.md @@ -0,0 +1,13 @@ + + +### VALUES +Returns all values in a group as a multivalued field. The order of the returned values isn't guaranteed. If you need the values returned in order use <<esql-mv_sort>>. + +``` + FROM employees +| EVAL first_letter = SUBSTRING(first_name, 0, 1) +| STATS first_name=MV_SORT(VALUES(first_name)) BY first_letter +| SORT first_letter +``` diff --git a/docs/reference/esql/functions/layout/count.asciidoc b/docs/reference/esql/functions/layout/count.asciidoc new file mode 100644 index 0000000000000..8c16d74cde9a7 --- /dev/null +++ b/docs/reference/esql/functions/layout/count.asciidoc @@ -0,0 +1,15 @@ +// This is generated by ESQL's AbstractFunctionTestCase. Do no edit it. See ../README.md for how to regenerate it. 
+ +[discrete] +[[esql-count]] +=== `COUNT` + +*Syntax* + +[.text-center] +image::esql/functions/signature/count.svg[Embedded,opts=inline] + +include::../parameters/count.asciidoc[] +include::../description/count.asciidoc[] +include::../types/count.asciidoc[] +include::../examples/count.asciidoc[] diff --git a/docs/reference/esql/functions/layout/count_distinct.asciidoc b/docs/reference/esql/functions/layout/count_distinct.asciidoc new file mode 100644 index 0000000000000..2c9848186e806 --- /dev/null +++ b/docs/reference/esql/functions/layout/count_distinct.asciidoc @@ -0,0 +1,16 @@ +// This is generated by ESQL's AbstractFunctionTestCase. Do no edit it. See ../README.md for how to regenerate it. + +[discrete] +[[esql-count_distinct]] +=== `COUNT_DISTINCT` + +*Syntax* + +[.text-center] +image::esql/functions/signature/count_distinct.svg[Embedded,opts=inline] + +include::../parameters/count_distinct.asciidoc[] +include::../description/count_distinct.asciidoc[] +include::../types/count_distinct.asciidoc[] +include::../examples/count_distinct.asciidoc[] +include::../appendix/count_distinct.asciidoc[] diff --git a/docs/reference/esql/functions/layout/values.asciidoc b/docs/reference/esql/functions/layout/values.asciidoc new file mode 100644 index 0000000000000..7d90d4314699a --- /dev/null +++ b/docs/reference/esql/functions/layout/values.asciidoc @@ -0,0 +1,18 @@ +// This is generated by ESQL's AbstractFunctionTestCase. Do no edit it. See ../README.md for how to regenerate it. + +[discrete] +[[esql-values]] +=== `VALUES` + +preview::["Do not use `VALUES` on production environments. This functionality is in technical preview and may be changed or removed in a future release. 
Elastic will work to fix any issues, but features in technical preview are not subject to the support SLA of official GA features."] + +*Syntax* + +[.text-center] +image::esql/functions/signature/values.svg[Embedded,opts=inline] + +include::../parameters/values.asciidoc[] +include::../description/values.asciidoc[] +include::../types/values.asciidoc[] +include::../examples/values.asciidoc[] +include::../appendix/values.asciidoc[] diff --git a/docs/reference/esql/functions/parameters/count.asciidoc b/docs/reference/esql/functions/parameters/count.asciidoc new file mode 100644 index 0000000000000..d470061a83e2e --- /dev/null +++ b/docs/reference/esql/functions/parameters/count.asciidoc @@ -0,0 +1,6 @@ +// This is generated by ESQL's AbstractFunctionTestCase. Do no edit it. See ../README.md for how to regenerate it. + +*Parameters* + +`field`:: +Expression that outputs values to be counted. If omitted, equivalent to `COUNT(*)` (the number of rows). diff --git a/docs/reference/esql/functions/parameters/count_distinct.asciidoc b/docs/reference/esql/functions/parameters/count_distinct.asciidoc new file mode 100644 index 0000000000000..f84cf27c3e075 --- /dev/null +++ b/docs/reference/esql/functions/parameters/count_distinct.asciidoc @@ -0,0 +1,9 @@ +// This is generated by ESQL's AbstractFunctionTestCase. Do no edit it. See ../README.md for how to regenerate it. + +*Parameters* + +`field`:: +Column or literal for which to count the number of distinct values. + +`precision`:: +Precision threshold. Refer to <<esql-agg-count-distinct-approximate>>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000. 
diff --git a/docs/reference/esql/functions/parameters/values.asciidoc b/docs/reference/esql/functions/parameters/values.asciidoc new file mode 100644 index 0000000000000..8903aa1a472a3 --- /dev/null +++ b/docs/reference/esql/functions/parameters/values.asciidoc @@ -0,0 +1,6 @@ +// This is generated by ESQL's AbstractFunctionTestCase. Do no edit it. See ../README.md for how to regenerate it. + +*Parameters* + +`field`:: + diff --git a/docs/reference/esql/functions/signature/count.svg b/docs/reference/esql/functions/signature/count.svg new file mode 100644 index 0000000000000..9b19652b98788 --- /dev/null +++ b/docs/reference/esql/functions/signature/count.svg @@ -0,0 +1 @@ +COUNT(field) \ No newline at end of file diff --git a/docs/reference/esql/functions/signature/count_distinct.svg b/docs/reference/esql/functions/signature/count_distinct.svg new file mode 100644 index 0000000000000..a5b77da7c555a --- /dev/null +++ b/docs/reference/esql/functions/signature/count_distinct.svg @@ -0,0 +1 @@ +COUNT_DISTINCT(field,precision) \ No newline at end of file diff --git a/docs/reference/esql/functions/signature/values.svg b/docs/reference/esql/functions/signature/values.svg new file mode 100644 index 0000000000000..0fa116ce1eb14 --- /dev/null +++ b/docs/reference/esql/functions/signature/values.svg @@ -0,0 +1 @@ +VALUES(field) \ No newline at end of file diff --git a/docs/reference/esql/functions/types/count.asciidoc b/docs/reference/esql/functions/types/count.asciidoc new file mode 100644 index 0000000000000..70e79d4899605 --- /dev/null +++ b/docs/reference/esql/functions/types/count.asciidoc @@ -0,0 +1,20 @@ +// This is generated by ESQL's AbstractFunctionTestCase. Do no edit it. See ../README.md for how to regenerate it. 
+ +*Supported types* + +[%header.monospaced.styled,format=dsv,separator=|] +|=== +field | result +boolean | long +cartesian_point | long +datetime | long +double | long +geo_point | long +integer | long +ip | long +keyword | long +long | long +text | long +unsigned_long | long +version | long +|=== diff --git a/docs/reference/esql/functions/types/count_distinct.asciidoc b/docs/reference/esql/functions/types/count_distinct.asciidoc new file mode 100644 index 0000000000000..4b201d45732f1 --- /dev/null +++ b/docs/reference/esql/functions/types/count_distinct.asciidoc @@ -0,0 +1,44 @@ +// This is generated by ESQL's AbstractFunctionTestCase. Do no edit it. See ../README.md for how to regenerate it. + +*Supported types* + +[%header.monospaced.styled,format=dsv,separator=|] +|=== +field | precision | result +boolean | integer | long +boolean | long | long +boolean | unsigned_long | long +boolean | | long +datetime | integer | long +datetime | long | long +datetime | unsigned_long | long +datetime | | long +double | integer | long +double | long | long +double | unsigned_long | long +double | | long +integer | integer | long +integer | long | long +integer | unsigned_long | long +integer | | long +ip | integer | long +ip | long | long +ip | unsigned_long | long +ip | | long +keyword | integer | long +keyword | long | long +keyword | unsigned_long | long +keyword | | long +long | integer | long +long | long | long +long | unsigned_long | long +long | | long +text | integer | long +text | long | long +text | unsigned_long | long +text | | long +version | integer | long +version | long | long +version | unsigned_long | long +version | | long +|=== diff --git a/docs/reference/esql/functions/types/values.asciidoc b/docs/reference/esql/functions/types/values.asciidoc new file mode 100644 index 0000000000000..705745d76dbab --- /dev/null +++ b/docs/reference/esql/functions/types/values.asciidoc @@ -0,0 +1,17 @@ +// This is generated by ESQL's AbstractFunctionTestCase. 
Do no edit it. See ../README.md for how to regenerate it. + +*Supported types* + +[%header.monospaced.styled,format=dsv,separator=|] +|=== +field | result +boolean | boolean +datetime | datetime +double | double +integer | integer +ip | ip +keyword | keyword +long | long +text | text +version | version +|=== diff --git a/x-pack/plugin/esql/compute/src/main/generated-src/org/elasticsearch/compute/aggregation/ValuesBytesRefAggregator.java b/x-pack/plugin/esql/compute/src/main/generated-src/org/elasticsearch/compute/aggregation/ValuesBytesRefAggregator.java index 736b320a9dde8..602fd29433193 100644 --- a/x-pack/plugin/esql/compute/src/main/generated-src/org/elasticsearch/compute/aggregation/ValuesBytesRefAggregator.java +++ b/x-pack/plugin/esql/compute/src/main/generated-src/org/elasticsearch/compute/aggregation/ValuesBytesRefAggregator.java @@ -100,7 +100,7 @@ Block toBlock(BlockFactory blockFactory) { } BytesRef scratch = new BytesRef(); if (values.size() == 1) { - return blockFactory.newConstantBytesRefBlockWith(values.get(0, scratch), 1); + return blockFactory.newConstantBytesRefBlockWith(BytesRef.deepCopyOf(values.get(0, scratch)), 1); } try (BytesRefBlock.Builder builder = blockFactory.newBytesRefBlockBuilder((int) values.size())) { builder.beginPositionEntry(); diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/X-ValuesAggregator.java.st b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/X-ValuesAggregator.java.st index ea62dcf295825..a8884c58116f3 100644 --- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/X-ValuesAggregator.java.st +++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/X-ValuesAggregator.java.st @@ -192,7 +192,7 @@ $elseif(double)$ $elseif(int)$ return blockFactory.newConstantIntBlockWith((int) values.get(0), 1); $elseif(BytesRef)$ - return blockFactory.newConstantBytesRefBlockWith(values.get(0, scratch), 1); + 
return blockFactory.newConstantBytesRefBlockWith(BytesRef.deepCopyOf(values.get(0, scratch)), 1); $endif$ } try ($Type$Block.Builder builder = blockFactory.new$Type$BlockBuilder((int) values.size())) { diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/meta.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/meta.csv-spec index c036e04bc8ba3..7b5941b88988d 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/meta.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/meta.csv-spec @@ -19,7 +19,7 @@ synopsis:keyword "double cos(angle:double|integer|long|unsigned_long)" "double cosh(angle:double|integer|long|unsigned_long)" "long count(?field:boolean|cartesian_point|date|double|geo_point|integer|ip|keyword|long|text|unsigned_long|version)" -"long count_distinct(field:boolean|cartesian_point|date|double|geo_point|integer|ip|keyword|long|text|version, ?precision:integer)" +"long count_distinct(field:boolean|date|double|integer|ip|keyword|long|text|version, ?precision:integer|long|unsigned_long)" "integer date_diff(unit:keyword|text, startTimestamp:date, endTimestamp:date)" "long date_extract(datePart:keyword|text, date:date)" "keyword date_format(?dateFormat:keyword|text, date:date)" @@ -139,8 +139,8 @@ coalesce |first |"boolean|cartesian_point|car concat |[string1, string2] |["keyword|text", "keyword|text"] |[Strings to concatenate., Strings to concatenate.] cos |angle |"double|integer|long|unsigned_long" |An angle, in radians. If `null`, the function returns `null`. cosh |angle |"double|integer|long|unsigned_long" |An angle, in radians. If `null`, the function returns `null`. -count |field |"boolean|cartesian_point|date|double|geo_point|integer|ip|keyword|long|text|unsigned_long|version" |Column or literal for which to count the number of values. 
-count_distinct|[field, precision] |["boolean|cartesian_point|date|double|geo_point|integer|ip|keyword|long|text|version", integer] |[Column or literal for which to count the number of distinct values., ] +count |field |"boolean|cartesian_point|date|double|geo_point|integer|ip|keyword|long|text|unsigned_long|version" |Expression that outputs values to be counted. If omitted, equivalent to `COUNT(*)` (the number of rows). +count_distinct|[field, precision] |["boolean|date|double|integer|ip|keyword|long|text|version", "integer|long|unsigned_long"] |[Column or literal for which to count the number of distinct values., Precision threshold. Refer to <<esql-agg-count-distinct-approximate>>. The maximum supported value is 40000. Thresholds above this number will have the same effect as a threshold of 40000. The default value is 3000.] date_diff |[unit, startTimestamp, endTimestamp]|["keyword|text", date, date] |[Time difference unit, A string representing a start timestamp, A string representing an end timestamp] date_extract |[datePart, date] |["keyword|text", date] |[Part of the date to extract. Can be: `aligned_day_of_week_in_month`\, `aligned_day_of_week_in_year`\, `aligned_week_of_month`\, `aligned_week_of_year`\, `ampm_of_day`\, `clock_hour_of_ampm`\, `clock_hour_of_day`\, `day_of_month`\, `day_of_week`\, `day_of_year`\, `epoch_day`\, `era`\, `hour_of_ampm`\, `hour_of_day`\, `instant_seconds`\, `micro_of_day`\, `micro_of_second`\, `milli_of_day`\, `milli_of_second`\, `minute_of_day`\, `minute_of_hour`\, `month_of_year`\, `nano_of_day`\, `nano_of_second`\, `offset_seconds`\, `proleptic_month`\, `second_of_day`\, `second_of_minute`\, `year`\, or `year_of_era`. Refer to https://docs.oracle.com/javase/8/docs/api/java/time/temporal/ChronoField.html[java.time.temporal.ChronoField] for a description of these values. If `null`\, the function returns `null`., Date expression. If `null`\, the function returns `null`.] date_format |[dateFormat, date] |["keyword|text", date] |[Date format (optional). 
If no format is specified\, the `yyyy-MM-dd'T'HH:mm:ss.SSSZ` format is used. If `null`\, the function returns `null`., Date expression. If `null`\, the function returns `null`.] @@ -356,7 +356,7 @@ to_ver |Converts an input string to a version value. to_version |Converts an input string to a version value. top |Collects the top values for a field. Includes repeated values. trim |Removes leading and trailing whitespaces from a string. -values |Collect values for a field. +values |Returns all values in a group as a multivalued field. The order of the returned values isn't guaranteed. If you need the values returned in order use <<esql-mv_sort>>. weighted_avg |The weighted average of a numeric field. ; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/FunctionInfo.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/FunctionInfo.java index 94e3aa4e1dd68..f275496c6787a 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/FunctionInfo.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/FunctionInfo.java @@ -23,6 +23,11 @@ */ String[] returnType(); + /** + * Whether this function is a preview (Not ready for production environments) or not. + */ + boolean preview() default false; + /** * The description of the function rendered in {@code META FUNCTIONS} * and the docs. These should be complete sentences. 
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/Count.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/Count.java index 52e053f843e14..9b6190408dbd4 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/Count.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/Count.java @@ -19,6 +19,7 @@ import org.elasticsearch.xpack.esql.core.type.DataType; import org.elasticsearch.xpack.esql.core.util.StringUtils; import org.elasticsearch.xpack.esql.expression.SurrogateExpression; +import org.elasticsearch.xpack.esql.expression.function.Example; import org.elasticsearch.xpack.esql.expression.function.FunctionInfo; import org.elasticsearch.xpack.esql.expression.function.Param; import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvCount; @@ -35,7 +36,32 @@ public class Count extends AggregateFunction implements EnclosedAgg, ToAggregator, SurrogateExpression { public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "Count", Count::new); - @FunctionInfo(returnType = "long", description = "Returns the total number (count) of input values.", isAggregation = true) + @FunctionInfo( + returnType = "long", + description = "Returns the total number (count) of input values.", + isAggregation = true, + examples = { + @Example(file = "stats", tag = "count"), + @Example(description = "To count the number of rows, use `COUNT()` or `COUNT(*)`", file = "docs", tag = "countAll"), + @Example( + description = "The expression can use inline functions. 
This example splits a string into " + + "multiple values using the `SPLIT` function and counts the values", + file = "stats", + tag = "docsCountWithExpression" + ), + @Example( + description = "To count the number of times an expression returns `TRUE` use " + + "a <<esql-where>> command to remove rows that shouldn't be included", + file = "stats", + tag = "count-where" + ), + @Example( + description = "To count the same stream of data based on two different expressions " + + "use the pattern `COUNT(<expression> OR NULL)`", + file = "stats", + tag = "count-or-null" + ) } + ) public Count( Source source, @Param( @@ -54,7 +80,7 @@ public Count( "text", "unsigned_long", "version" }, - description = "Column or literal for which to count the number of values." + description = "Expression that outputs values to be counted. If omitted, equivalent to `COUNT(*)` (the number of rows)." ) Expression field ) { super(source, field); diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/CountDistinct.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/CountDistinct.java index 7686d10a03d9e..858c6e659449c 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/CountDistinct.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/CountDistinct.java @@ -24,6 +24,7 @@ import org.elasticsearch.xpack.esql.core.type.DataType; import org.elasticsearch.xpack.esql.expression.EsqlTypeResolutions; import org.elasticsearch.xpack.esql.expression.SurrogateExpression; +import org.elasticsearch.xpack.esql.expression.function.Example; import org.elasticsearch.xpack.esql.expression.function.FunctionInfo; import org.elasticsearch.xpack.esql.expression.function.OptionalArgument; import org.elasticsearch.xpack.esql.expression.function.Param; @@ -53,15 +54,63 @@ public class CountDistinct extends AggregateFunction implements 
OptionalArgument private static final int DEFAULT_PRECISION = 3000; private final Expression precision; - @FunctionInfo(returnType = "long", description = "Returns the approximate number of distinct values.", isAggregation = true) + @FunctionInfo( + returnType = "long", + description = "Returns the approximate number of distinct values.", + appendix = """ + [discrete] + [[esql-agg-count-distinct-approximate]] + ==== Counts are approximate + + Computing exact counts requires loading values into a set and returning its + size. This doesn't scale when working on high-cardinality sets and/or large + values as the required memory usage and the need to communicate those + per-shard sets between nodes would utilize too many resources of the cluster. + + This `COUNT_DISTINCT` function is based on the + https://static.googleusercontent.com/media/research.google.com/fr//pubs/archive/40671.pdf[HyperLogLog++] + algorithm, which counts based on the hashes of the values with some interesting + properties: + + include::../../../aggregations/metrics/cardinality-aggregation.asciidoc[tag=explanation] + + The `COUNT_DISTINCT` function takes an optional second parameter to configure + the precision threshold. The precision_threshold options allows to trade memory + for accuracy, and defines a unique count below which counts are expected to be + close to accurate. Above this value, counts might become a bit more fuzzy. The + maximum supported value is 40000, thresholds above this number will have the + same effect as a threshold of 40000. The default value is `3000`. + """, + isAggregation = true, + examples = { + @Example(file = "stats_count_distinct", tag = "count-distinct"), + @Example( + description = "With the optional second parameter to configure the precision threshold", + file = "stats_count_distinct", + tag = "count-distinct-precision" + ), + @Example( + description = "The expression can use inline functions. 
This example splits a string into " + + "multiple values using the `SPLIT` function and counts the unique values", + file = "stats_count_distinct", + tag = "docsCountDistinctWithExpression" + ) } + ) public CountDistinct( Source source, @Param( name = "field", - type = { "boolean", "cartesian_point", "date", "double", "geo_point", "integer", "ip", "keyword", "long", "text", "version" }, + type = { "boolean", "date", "double", "integer", "ip", "keyword", "long", "text", "version" }, description = "Column or literal for which to count the number of distinct values." ) Expression field, - @Param(optional = true, name = "precision", type = { "integer" }) Expression precision + @Param( + optional = true, + name = "precision", + type = { "integer", "long", "unsigned_long" }, + description = "Precision threshold. Refer to <<esql-agg-count-distinct-approximate>>. " + + "The maximum supported value is 40000. Thresholds above this number will have the " + + "same effect as a threshold of 40000. The default value is 3000." + ) Expression precision ) { super(source, field, precision != null ? 
List.of(precision) : List.of()); this.precision = precision; @@ -108,19 +157,17 @@ protected TypeResolution resolveType() { return new TypeResolution("Unresolved children"); } - TypeResolution resolution = EsqlTypeResolutions.isExact(field(), sourceText(), DEFAULT); - if (resolution.unresolved()) { - return resolution; - } + TypeResolution resolution = EsqlTypeResolutions.isExact(field(), sourceText(), DEFAULT) + .and( + isType( + field(), + dt -> dt != DataType.UNSIGNED_LONG && dt != DataType.SOURCE, + sourceText(), + DEFAULT, + "any exact type except unsigned_long, _source, or counter types" + ) + ); - boolean resolved = resolution.resolved(); - resolution = isType( - field(), - dt -> resolved && dt != DataType.UNSIGNED_LONG && dt != DataType.SOURCE, - sourceText(), - DEFAULT, - "any exact type except unsigned_long, _source, or counter types" - ); if (resolution.unresolved() || precision == null) { return resolution; } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/Values.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/Values.java index 7d2fbcddb113b..79276b26be6d5 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/Values.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/Values.java @@ -21,6 +21,7 @@ import org.elasticsearch.xpack.esql.core.tree.Source; import org.elasticsearch.xpack.esql.core.type.DataType; import org.elasticsearch.xpack.esql.expression.EsqlTypeResolutions; +import org.elasticsearch.xpack.esql.expression.function.Example; import org.elasticsearch.xpack.esql.expression.function.FunctionInfo; import org.elasticsearch.xpack.esql.expression.function.Param; import org.elasticsearch.xpack.esql.planner.ToAggregator; @@ -34,13 +35,25 @@ public class Values extends AggregateFunction implements ToAggregator { public static final 
NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "Values", Values::new); @FunctionInfo( - returnType = { "boolean|date|double|integer|ip|keyword|long|text|version" }, - description = "Collect values for a field.", - isAggregation = true + returnType = { "boolean", "date", "double", "integer", "ip", "keyword", "long", "text", "version" }, + preview = true, + description = "Returns all values in a group as a multivalued field. The order of the returned values isn't guaranteed. " + + "If you need the values returned in order use <<esql-mv_sort>>.", + appendix = """ + [WARNING] + ==== + This can use a significant amount of memory and ES|QL doesn't yet + grow aggregations beyond memory. So this aggregation will work until + it is used to collect more values than can fit into memory. Once it + collects too many values it will fail the query with + a <>. + ====""", + isAggregation = true, + examples = @Example(file = "string", tag = "values-grouped") ) public Values( Source source, - @Param(name = "field", type = { "boolean|date|double|integer|ip|keyword|long|text|version" }) Expression v + @Param(name = "field", type = { "boolean", "date", "double", "integer", "ip", "keyword", "long", "text", "version" }) Expression v ) { super(source, v); } diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/AbstractFunctionTestCase.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/AbstractFunctionTestCase.java index 0ec0a29dc530b..64c72b46c303b 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/AbstractFunctionTestCase.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/AbstractFunctionTestCase.java @@ -77,6 +77,7 @@ import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; +import java.math.BigInteger; import java.nio.file.Files; 
import java.nio.file.Path; import java.time.Duration; @@ -298,7 +299,12 @@ protected final List rows(List multirowFields) ) { var multiRowData = field.multiRowData(); for (int row = initialRow; row < initialRow + pageSize; row++) { - wrapper.accept(multiRowData.get(row)); + var data = multiRowData.get(row); + if (data instanceof BigInteger bigIntegerData) { + wrapper.accept(NumericUtils.asLongUnsigned(bigIntegerData)); + } else { + wrapper.accept(data); + } } blocks[i] = wrapper.builder().build(); @@ -545,7 +551,7 @@ public static void renderDocs() throws IOException { renderDescription(description.description(), info.detailedDescription(), info.note()); boolean hasExamples = renderExamples(info); boolean hasAppendix = renderAppendix(info.appendix()); - renderFullLayout(name, hasExamples, hasAppendix); + renderFullLayout(name, info.preview(), hasExamples, hasAppendix); renderKibanaInlineDocs(name, info); List args = description.args(); if (name.equals("case")) { @@ -571,6 +577,11 @@ public static void renderDocs() throws IOException { private static final String DOCS_WARNING = "// This is generated by ESQL's AbstractFunctionTestCase. Do no edit it. See ../README.md for how to regenerate it.\n\n"; + private static final String PREVIEW_CALLOUT = + "\npreview::[\"Do not use `VALUES` on production environments. This functionality is in technical preview and " + + "may be changed or removed in a future release. 
Elastic will work to fix any issues, but features in technical preview " + + "are not subject to the support SLA of official GA features.\"]\n"; + private static void renderTypes(List argNames) throws IOException { StringBuilder header = new StringBuilder(); for (String arg : argNames) { @@ -686,12 +697,12 @@ private static boolean renderAppendix(String appendix) throws IOException { return true; } - private static void renderFullLayout(String name, boolean hasExamples, boolean hasAppendix) throws IOException { + private static void renderFullLayout(String name, boolean preview, boolean hasExamples, boolean hasAppendix) throws IOException { String rendered = DOCS_WARNING + """ [discrete] [[esql-$NAME$]] === `$UPPER_NAME$` - + $PREVIEW_CALLOUT$ *Syntax* [.text-center] @@ -700,7 +711,9 @@ private static void renderFullLayout(String name, boolean hasExamples, boolean h include::../parameters/$NAME$.asciidoc[] include::../description/$NAME$.asciidoc[] include::../types/$NAME$.asciidoc[] - """.replace("$NAME$", name).replace("$UPPER_NAME$", name.toUpperCase(Locale.ROOT)); + """.replace("$NAME$", name) + .replace("$UPPER_NAME$", name.toUpperCase(Locale.ROOT)) + .replace("$PREVIEW_CALLOUT$", preview ? 
PREVIEW_CALLOUT : ""); if (hasExamples) { rendered += "include::../examples/" + name + ".asciidoc[]\n"; } diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/MultiRowTestCaseSupplier.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/MultiRowTestCaseSupplier.java index 2896dec814a71..973249e4a743c 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/MultiRowTestCaseSupplier.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/MultiRowTestCaseSupplier.java @@ -14,7 +14,9 @@ import org.elasticsearch.geo.ShapeTestUtils; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.xpack.esql.core.type.DataType; +import org.elasticsearch.xpack.versionfield.Version; +import java.math.BigInteger; import java.util.ArrayList; import java.util.List; @@ -149,6 +151,55 @@ public static List longCases(int minRows, int maxRows, long m return cases; } + public static List ulongCases(int minRows, int maxRows, BigInteger min, BigInteger max, boolean includeZero) { + List cases = new ArrayList<>(); + + // Zero + if (BigInteger.ZERO.compareTo(max) <= 0 && BigInteger.ZERO.compareTo(min) >= 0 && includeZero) { + cases.add( + new TypedDataSupplier( + "<0 unsigned longs>", + () -> randomList(minRows, maxRows, () -> BigInteger.ZERO), + DataType.UNSIGNED_LONG, + false, + true + ) + ); + } + + // Small values, less than Long.MAX_VALUE + BigInteger lower1 = min.max(BigInteger.ONE); + BigInteger upper1 = max.min(BigInteger.valueOf(Long.MAX_VALUE)); + if (lower1.compareTo(upper1) < 0) { + cases.add( + new TypedDataSupplier( + "", + () -> randomList(minRows, maxRows, () -> ESTestCase.randomUnsignedLongBetween(lower1, upper1)), + DataType.UNSIGNED_LONG, + false, + true + ) + ); + } + + // Big values, greater than Long.MAX_VALUE + BigInteger lower2 = min.max(BigInteger.valueOf(Long.MAX_VALUE).add(BigInteger.ONE)); + BigInteger 
upper2 = max.min(ESTestCase.UNSIGNED_LONG_MAX); + if (lower2.compareTo(upper2) < 0) { + cases.add( + new TypedDataSupplier( + "", + () -> randomList(minRows, maxRows, () -> ESTestCase.randomUnsignedLongBetween(lower2, upper2)), + DataType.UNSIGNED_LONG, + false, + true + ) + ); + } + + return cases; + } + public static List doubleCases(int minRows, int maxRows, double min, double max, boolean includeZero) { List cases = new ArrayList<>(); @@ -326,6 +377,41 @@ public static List ipCases(int minRows, int maxRows) { ); } + public static List versionCases(int minRows, int maxRows) { + return List.of( + new TypedDataSupplier( + "", + () -> randomList(minRows, maxRows, () -> new Version(Integer.toString(ESTestCase.between(0, 100))).toBytesRef()), + DataType.VERSION, + false, + true + ), + new TypedDataSupplier( + "", + () -> randomList( + minRows, + maxRows, + () -> new Version(ESTestCase.between(0, 100) + "." + ESTestCase.between(0, 100)).toBytesRef() + ), + DataType.VERSION, + false, + true + ), + new TypedDataSupplier( + "", + () -> randomList( + minRows, + maxRows, + () -> new Version(ESTestCase.between(0, 100) + "." + ESTestCase.between(0, 100) + "." 
+ ESTestCase.between(0, 100)) + .toBytesRef() + ), + DataType.VERSION, + false, + true + ) + ); + } + public static List geoPointCases(int minRows, int maxRows, boolean withAltitude) { List cases = new ArrayList<>(); @@ -343,7 +429,7 @@ public static List geoPointCases(int minRows, int maxRows, bo cases.add( new TypedDataSupplier( "", - () -> randomList(minRows, maxRows, () -> GEO.asWkb(GeometryTestUtils.randomPoint(true))), + () -> randomList(minRows, maxRows, () -> GEO.asWkb(GeometryTestUtils.randomPoint(false))), DataType.GEO_POINT, false, true @@ -381,4 +467,38 @@ public static List cartesianPointCases(int minRows, int maxRo return cases; } + + public static List stringCases(int minRows, int maxRows, DataType type) { + return List.of( + new TypedDataSupplier("", () -> randomList(minRows, maxRows, () -> new BytesRef("")), type, false, true), + new TypedDataSupplier( + "", + () -> randomList(minRows, maxRows, () -> new BytesRef(ESTestCase.randomAlphaOfLengthBetween(1, 30))), + type, + false, + true + ), + new TypedDataSupplier( + "", + () -> randomList(minRows, maxRows, () -> new BytesRef(ESTestCase.randomAlphaOfLengthBetween(300, 3000))), + type, + false, + true + ), + new TypedDataSupplier( + "", + () -> randomList(minRows, maxRows, () -> new BytesRef(ESTestCase.randomRealisticUnicodeOfLengthBetween(1, 30))), + type, + false, + true + ), + new TypedDataSupplier( + "", + () -> randomList(minRows, maxRows, () -> new BytesRef(ESTestCase.randomRealisticUnicodeOfLengthBetween(300, 3000))), + type, + false, + true + ) + ); + } } diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/TestCaseSupplier.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/TestCaseSupplier.java index 3c9c1795ff210..cd375b8c53595 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/TestCaseSupplier.java +++ 
b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/TestCaseSupplier.java @@ -880,6 +880,12 @@ public static List longCases(long min, long max, boolean incl return cases; } + /** + * Generate cases for {@link DataType#UNSIGNED_LONG}. + *

<p> + * For multi-row parameters, see {@link MultiRowTestCaseSupplier#ulongCases}. + * </p>
+ */ public static List ulongCases(BigInteger min, BigInteger max, boolean includeZero) { List cases = new ArrayList<>(); @@ -1142,6 +1148,12 @@ public static List ipCases() { ); } + /** + * Generate cases for String DataTypes. + *

<p> + * For multi-row parameters, see {@link MultiRowTestCaseSupplier#stringCases}. + * </p>
+ */ public static List stringCases(DataType type) { List result = new ArrayList<>(); result.add(new TypedDataSupplier("", () -> new BytesRef(""), type)); @@ -1170,6 +1182,9 @@ public static List stringCases(DataType type) { /** * Supplier test case data for {@link Version} fields. + *

<p> + * For multi-row parameters, see {@link MultiRowTestCaseSupplier#versionCases}. + * </p>
*/ public static List versionCases(String prefix) { return List.of( diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/aggregate/CountDistinctTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/aggregate/CountDistinctTests.java new file mode 100644 index 0000000000000..c2638e8da9196 --- /dev/null +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/aggregate/CountDistinctTests.java @@ -0,0 +1,176 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.esql.expression.function.aggregate; + +import com.carrotsearch.randomizedtesting.annotations.Name; +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; + +import org.apache.lucene.internal.hppc.BitMixer; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.hash.MurmurHash3; +import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.common.util.MockBigArrays; +import org.elasticsearch.common.util.PageCacheRecycler; +import org.elasticsearch.search.aggregations.metrics.HyperLogLogPlusPlus; +import org.elasticsearch.xpack.esql.core.expression.Expression; +import org.elasticsearch.xpack.esql.core.tree.Source; +import org.elasticsearch.xpack.esql.core.type.DataType; +import org.elasticsearch.xpack.esql.expression.function.AbstractAggregationTestCase; +import org.elasticsearch.xpack.esql.expression.function.MultiRowTestCaseSupplier; +import org.elasticsearch.xpack.esql.expression.function.TestCaseSupplier; + +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.List; +import java.util.function.Supplier; +import java.util.stream.Stream; + +import static org.hamcrest.Matchers.equalTo; + 
+public class CountDistinctTests extends AbstractAggregationTestCase { + public CountDistinctTests(@Name("TestCase") Supplier testCaseSupplier) { + this.testCase = testCaseSupplier.get(); + } + + @ParametersFactory + public static Iterable parameters() { + var suppliers = new ArrayList(); + + var precisionSuppliers = Stream.of( + TestCaseSupplier.intCases(0, 100_000, true), + TestCaseSupplier.longCases(0L, 100_000L, true), + TestCaseSupplier.ulongCases(BigInteger.ZERO, BigInteger.valueOf(100_000L), true) + ).flatMap(List::stream).toList(); + + Stream.of( + MultiRowTestCaseSupplier.intCases(1, 1000, Integer.MIN_VALUE, Integer.MAX_VALUE, true), + MultiRowTestCaseSupplier.longCases(1, 1000, Long.MIN_VALUE, Long.MAX_VALUE, true), + MultiRowTestCaseSupplier.doubleCases(1, 1000, -Double.MAX_VALUE, Double.MAX_VALUE, true), + MultiRowTestCaseSupplier.dateCases(1, 1000), + MultiRowTestCaseSupplier.booleanCases(1, 1000), + MultiRowTestCaseSupplier.ipCases(1, 1000), + MultiRowTestCaseSupplier.versionCases(1, 1000), + // Lower values for strings, as they take more space and may trigger the circuit breaker + MultiRowTestCaseSupplier.stringCases(1, 100, DataType.KEYWORD), + MultiRowTestCaseSupplier.stringCases(1, 100, DataType.TEXT) + ).flatMap(List::stream).forEach(fieldCaseSupplier -> { + // With precision + for (var precisionCaseSupplier : precisionSuppliers) { + suppliers.add(makeSupplier(fieldCaseSupplier, precisionCaseSupplier)); + } + + // Without precision + suppliers.add(makeSupplier(fieldCaseSupplier)); + }); + + // No rows + for (var dataType : List.of( + DataType.INTEGER, + DataType.LONG, + DataType.DOUBLE, + DataType.DATETIME, + DataType.BOOLEAN, + DataType.IP, + DataType.VERSION, + DataType.KEYWORD, + DataType.TEXT + )) { + var emptyFieldSupplier = new TestCaseSupplier.TypedDataSupplier("No rows (" + dataType + ")", List::of, dataType, false, true); + + // With precision + for (var precisionCaseSupplier : precisionSuppliers) { + 
suppliers.add(makeSupplier(emptyFieldSupplier, precisionCaseSupplier)); + } + + // Without precision + suppliers.add(makeSupplier(emptyFieldSupplier)); + } + + // "No rows" expects 0 here instead of null + // return parameterSuppliersFromTypedDataWithDefaultChecks(suppliers); + return parameterSuppliersFromTypedData(randomizeBytesRefsOffset(suppliers)); + } + + @Override + protected Expression build(Source source, List args) { + return new CountDistinct(source, args.get(0), args.size() > 1 ? args.get(1) : null); + } + + private static TestCaseSupplier makeSupplier( + TestCaseSupplier.TypedDataSupplier fieldSupplier, + TestCaseSupplier.TypedDataSupplier precisionSupplier + ) { + return new TestCaseSupplier(fieldSupplier.name(), List.of(fieldSupplier.type(), precisionSupplier.type()), () -> { + var fieldTypedData = fieldSupplier.get(); + var precisionTypedData = precisionSupplier.get().forceLiteral(); + var values = fieldTypedData.multiRowData(); + var precision = ((Number) precisionTypedData.data()).intValue(); + + long result; + + if (fieldTypedData.type() == DataType.BOOLEAN) { + result = values.stream().distinct().count(); + } else { + result = calculateExpectedResult(values, precision); + } + + return new TestCaseSupplier.TestCase( + List.of(fieldTypedData, precisionTypedData), + "CountDistinct[field=Attribute[channel=0],precision=Attribute[channel=1]]", + DataType.LONG, + equalTo(result) + ); + }); + } + + private static TestCaseSupplier makeSupplier(TestCaseSupplier.TypedDataSupplier fieldSupplier) { + return new TestCaseSupplier(fieldSupplier.name() + ", no precision", List.of(fieldSupplier.type()), () -> { + var fieldTypedData = fieldSupplier.get(); + var values = fieldTypedData.multiRowData(); + + long result; + + if (fieldTypedData.type() == DataType.BOOLEAN) { + result = values.stream().distinct().count(); + } else { + result = calculateExpectedResult(values, 3000); + } + + return new TestCaseSupplier.TestCase( + List.of(fieldTypedData), + 
"CountDistinct[field=Attribute[channel=0]]", + DataType.LONG, + equalTo(result) + ); + }); + } + + private static long calculateExpectedResult(List values, int precision) { + // Can't use driverContext().bigArrays() from a static context + var bigArrays = new MockBigArrays(PageCacheRecycler.NON_RECYCLING_INSTANCE, ByteSizeValue.ofMb(256)).withCircuitBreaking(); + try (var hll = new HyperLogLogPlusPlus(HyperLogLogPlusPlus.precisionFromThreshold(precision), bigArrays, 1)) { + var hash = new MurmurHash3.Hash128(); + for (var value : values) { + if (value instanceof Integer casted) { + hll.collect(0, BitMixer.mix64(casted)); + } else if (value instanceof Long casted) { + hll.collect(0, BitMixer.mix64(casted)); + } else if (value instanceof Double casted) { + hll.collect(0, BitMixer.mix64(Double.doubleToLongBits(casted))); + } else if (value instanceof BytesRef casted) { + MurmurHash3.hash128(casted.bytes, casted.offset, casted.length, 0, hash); + hll.collect(0, BitMixer.mix64(hash.h1)); + } else { + throw new IllegalArgumentException("Unsupported data type: " + value.getClass()); + } + } + + return hll.cardinality(0); + } + } +} diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/aggregate/CountTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/aggregate/CountTests.java new file mode 100644 index 0000000000000..09076f2d70fd9 --- /dev/null +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/aggregate/CountTests.java @@ -0,0 +1,106 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.esql.expression.function.aggregate; + +import com.carrotsearch.randomizedtesting.annotations.Name; +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; + +import org.elasticsearch.xpack.esql.core.expression.Expression; +import org.elasticsearch.xpack.esql.core.tree.Source; +import org.elasticsearch.xpack.esql.core.type.DataType; +import org.elasticsearch.xpack.esql.expression.function.AbstractAggregationTestCase; +import org.elasticsearch.xpack.esql.expression.function.MultiRowTestCaseSupplier; +import org.elasticsearch.xpack.esql.expression.function.TestCaseSupplier; + +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.List; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.hamcrest.Matchers.equalTo; + +public class CountTests extends AbstractAggregationTestCase { + public CountTests(@Name("TestCase") Supplier testCaseSupplier) { + this.testCase = testCaseSupplier.get(); + } + + @ParametersFactory + public static Iterable parameters() { + var suppliers = new ArrayList(); + + Stream.of( + MultiRowTestCaseSupplier.intCases(1, 1000, Integer.MIN_VALUE, Integer.MAX_VALUE, true), + MultiRowTestCaseSupplier.longCases(1, 1000, Long.MIN_VALUE, Long.MAX_VALUE, true), + MultiRowTestCaseSupplier.ulongCases(1, 1000, BigInteger.ZERO, UNSIGNED_LONG_MAX, true), + MultiRowTestCaseSupplier.doubleCases(1, 1000, -Double.MAX_VALUE, Double.MAX_VALUE, true), + MultiRowTestCaseSupplier.dateCases(1, 1000), + MultiRowTestCaseSupplier.booleanCases(1, 1000), + MultiRowTestCaseSupplier.ipCases(1, 1000), + MultiRowTestCaseSupplier.versionCases(1, 1000), + MultiRowTestCaseSupplier.geoPointCases(1, 1000, true), + MultiRowTestCaseSupplier.cartesianPointCases(1, 1000, true), + // Lower values for strings, as they take more space and may trigger the circuit breaker + MultiRowTestCaseSupplier.stringCases(1, 100, DataType.KEYWORD), + 
MultiRowTestCaseSupplier.stringCases(1, 100, DataType.TEXT) + ).flatMap(List::stream).map(CountTests::makeSupplier).collect(Collectors.toCollection(() -> suppliers)); + + // No rows + for (var dataType : List.of( + DataType.INTEGER, + DataType.LONG, + DataType.DOUBLE, + DataType.DATETIME, + DataType.BOOLEAN, + DataType.IP, + DataType.VERSION, + DataType.KEYWORD, + DataType.TEXT, + DataType.GEO_POINT, + DataType.CARTESIAN_POINT, + DataType.UNSIGNED_LONG + )) { + suppliers.add( + new TestCaseSupplier( + "No rows (" + dataType + ")", + List.of(dataType), + () -> new TestCaseSupplier.TestCase( + List.of(TestCaseSupplier.TypedData.multiRow(List.of(), dataType, "field")), + "Count[field=Attribute[channel=0]]", + DataType.LONG, + equalTo(0L) + ) + ) + ); + } + + // "No rows" expects 0 here instead of null + // return parameterSuppliersFromTypedDataWithDefaultChecks(suppliers); + return parameterSuppliersFromTypedData(randomizeBytesRefsOffset(suppliers)); + } + + @Override + protected Expression build(Source source, List args) { + return new Count(source, args.get(0)); + } + + private static TestCaseSupplier makeSupplier(TestCaseSupplier.TypedDataSupplier fieldSupplier) { + return new TestCaseSupplier(fieldSupplier.name(), List.of(fieldSupplier.type()), () -> { + var fieldTypedData = fieldSupplier.get(); + var rowCount = fieldTypedData.multiRowData().size(); + + return new TestCaseSupplier.TestCase( + List.of(fieldTypedData), + "Count[field=Attribute[channel=0]]", + DataType.LONG, + equalTo((long) rowCount) + ); + }); + } +} diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/aggregate/ValuesTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/aggregate/ValuesTests.java new file mode 100644 index 0000000000000..704bd3ab204a3 --- /dev/null +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/aggregate/ValuesTests.java @@ -0,0 +1,110 @@ +/* + * Copyright 
Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.esql.expression.function.aggregate; + +import com.carrotsearch.randomizedtesting.annotations.Name; +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; + +import org.elasticsearch.xpack.esql.core.expression.Expression; +import org.elasticsearch.xpack.esql.core.tree.Source; +import org.elasticsearch.xpack.esql.core.type.DataType; +import org.elasticsearch.xpack.esql.expression.function.AbstractAggregationTestCase; +import org.elasticsearch.xpack.esql.expression.function.MultiRowTestCaseSupplier; +import org.elasticsearch.xpack.esql.expression.function.TestCaseSupplier; +import org.hamcrest.BaseMatcher; +import org.hamcrest.Description; +import org.hamcrest.Matcher; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.hamcrest.Matchers.containsInAnyOrder; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.nullValue; + +public class ValuesTests extends AbstractAggregationTestCase { + public ValuesTests(@Name("TestCase") Supplier testCaseSupplier) { + this.testCase = testCaseSupplier.get(); + } + + @ParametersFactory + public static Iterable parameters() { + var suppliers = new ArrayList(); + + Stream.of( + MultiRowTestCaseSupplier.intCases(1, 1000, Integer.MIN_VALUE, Integer.MAX_VALUE, true), + MultiRowTestCaseSupplier.longCases(1, 1000, Long.MIN_VALUE, Long.MAX_VALUE, true), + MultiRowTestCaseSupplier.doubleCases(1, 1000, -Double.MAX_VALUE, Double.MAX_VALUE, true), + MultiRowTestCaseSupplier.dateCases(1, 1000), + MultiRowTestCaseSupplier.booleanCases(1, 1000), + 
MultiRowTestCaseSupplier.ipCases(1, 1000), + MultiRowTestCaseSupplier.versionCases(1, 1000), + // Lower values for strings, as they take more space and may trigger the circuit breaker + MultiRowTestCaseSupplier.stringCases(1, 100, DataType.KEYWORD), + MultiRowTestCaseSupplier.stringCases(1, 100, DataType.TEXT) + ).flatMap(List::stream).map(ValuesTests::makeSupplier).collect(Collectors.toCollection(() -> suppliers)); + + return parameterSuppliersFromTypedDataWithDefaultChecks(suppliers); + } + + @Override + protected Expression build(Source source, List args) { + return new Values(source, args.get(0)); + } + + @SuppressWarnings("unchecked") + private static TestCaseSupplier makeSupplier(TestCaseSupplier.TypedDataSupplier fieldSupplier) { + return new TestCaseSupplier(fieldSupplier.name(), List.of(fieldSupplier.type()), () -> { + var fieldTypedData = fieldSupplier.get(); + + var expected = fieldTypedData.multiRowData() + .stream() + .map(v -> (Comparable>) v) + .collect(Collectors.toSet()); + + return new TestCaseSupplier.TestCase( + List.of(fieldTypedData), + "Values[field=Attribute[channel=0]]", + fieldSupplier.type(), + expected.isEmpty() ? nullValue() : valuesInAnyOrder(expected) + ); + }); + } + + private static Matcher valuesInAnyOrder(Collection data) { + if (data == null) { + return nullValue(); + } + if (data.size() == 1) { + return equalTo(data.iterator().next()); + } + var matcher = containsInAnyOrder(data.toArray()); + // New Matcher, as `containsInAnyOrder` returns Matcher> instead of Matcher + return new BaseMatcher<>() { + @Override + public void describeTo(Description description) { + matcher.describeTo(description); + } + + @Override + public boolean matches(Object item) { + if (item instanceof Iterable == false) { + return false; + } + + var castedItem = (Iterable) item; + + return matcher.matches(castedItem); + } + }; + } +}