-
Notifications
You must be signed in to change notification settings - Fork 181
Add values stats function with UDAF
#4276
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
aefe658
5cac526
c43c1bc
8424136
50a4bf8
cb5dcdf
03dbfdb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,51 @@ | ||
| /* | ||
| * Copyright OpenSearch Contributors | ||
| * SPDX-License-Identifier: Apache-2.0 | ||
| */ | ||
|
|
||
| package org.opensearch.sql.calcite.udf.udaf; | ||
|
|
||
| import org.opensearch.sql.common.setting.Settings; | ||
|
|
||
| /** | ||
| * Holder class to provide static access to Settings for UDAF functions. Since Calcite UDAF | ||
| * functions are instantiated via reflection with default constructor, we need this static holder to | ||
| * access settings. | ||
| */ | ||
| public class SettingsHolder { | ||
| private static volatile Settings settings; | ||
|
|
||
| /** Private constructor to prevent instantiation */ | ||
| private SettingsHolder() {} | ||
|
|
||
| /** | ||
| * Set the settings instance. This should be called during plugin initialization. | ||
| * | ||
| * @param s Settings instance | ||
| */ | ||
| public static void setSettings(Settings s) { | ||
| settings = s; | ||
| } | ||
|
|
||
| /** | ||
| * Get the settings instance. | ||
| * | ||
| * @return Settings instance or null if not initialized | ||
| */ | ||
| public static Settings getSettings() { | ||
| return settings; | ||
| } | ||
|
|
||
| /** | ||
| * Get the maximum limit for VALUES aggregate function. | ||
| * | ||
| * @return Maximum limit (0 means unlimited) | ||
| */ | ||
| public static int getValuesMaxLimit() { | ||
| if (settings != null) { | ||
| Integer limit = settings.getSettingValue(Settings.Key.PPL_VALUES_MAX_LIMIT); | ||
| return limit != null ? limit : 0; | ||
| } | ||
| return 0; // Default when settings not available | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,100 @@ | ||
| /* | ||
| * Copyright OpenSearch Contributors | ||
| * SPDX-License-Identifier: Apache-2.0 | ||
| */ | ||
|
|
||
| package org.opensearch.sql.calcite.udf.udaf; | ||
|
|
||
| import java.util.ArrayList; | ||
| import java.util.Set; | ||
| import java.util.TreeSet; | ||
| import org.opensearch.sql.calcite.udf.UserDefinedAggFunction; | ||
|
|
||
| /** | ||
| * VALUES aggregate function implementation. Returns distinct values from a field in lexicographical | ||
| * order as a multivalue field. | ||
| * | ||
| * <p>Behavior: | ||
| * | ||
| * <ul> | ||
| * <li>Returns unique values only (no duplicates) | ||
| * <li>Values are sorted in lexicographical order | ||
| * <li>Processes field values as strings (casts all inputs to strings) | ||
| * <li>Configurable limit via plugins.ppl.values.max.limit setting (0 = unlimited) | ||
| * <li>Supports only scalar data types (rejects STRUCT/ARRAY types) | ||
| * <li>Implementation uses TreeSet for automatic sorting and deduplication | ||
| * </ul> | ||
| */ | ||
| public class ValuesAggFunction | ||
| implements UserDefinedAggFunction<ValuesAggFunction.ValuesAccumulator> { | ||
|
|
||
| @Override | ||
| public ValuesAccumulator init() { | ||
| return new ValuesAccumulator(); | ||
| } | ||
|
|
||
| @Override | ||
| public Object result(ValuesAccumulator accumulator) { | ||
| return accumulator.value(); | ||
| } | ||
|
|
||
| @Override | ||
| public ValuesAccumulator add(ValuesAccumulator acc, Object... values) { | ||
| // Handle case where no values are passed | ||
| if (values == null || values.length == 0) { | ||
| return acc; | ||
| } | ||
|
|
||
| Object value = values[0]; | ||
|
|
||
| // Filter out null values and check limit | ||
| int limit = getMaxValuesLimit(); | ||
| if (value != null && (limit == 0 || acc.size() < limit)) { | ||
| // Convert value to string, handling all types safely | ||
| String stringValue = convertToString(value); | ||
| acc.add(stringValue, limit); | ||
| } | ||
|
|
||
| return acc; | ||
| } | ||
|
|
||
| /** Converts any value to its string representation. */ | ||
| private String convertToString(Object value) { | ||
|
||
| if (value == null) { | ||
| return null; | ||
| } | ||
| return String.valueOf(value); | ||
| } | ||
|
|
||
| public static class ValuesAccumulator implements Accumulator { | ||
| private final Set<String> values; | ||
|
|
||
| public ValuesAccumulator() { | ||
| this.values = new TreeSet<>(); // TreeSet maintains sorted order and uniqueness | ||
| } | ||
|
|
||
| @Override | ||
| public Object value(Object... argList) { | ||
| return new ArrayList<>(values); // Return List<String> to match expected type | ||
| } | ||
|
|
||
| public void add(String value, int limit) { | ||
| if (limit == 0 || values.size() < limit) { | ||
| values.add(value); | ||
| } | ||
| } | ||
|
|
||
| public int size() { | ||
| return values.size(); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Get the maximum limit for values from settings. | ||
| * | ||
| * @return Maximum limit (0 means unlimited) | ||
| */ | ||
| private int getMaxValuesLimit() { | ||
| return SettingsHolder.getValuesMaxLimit(); | ||
| } | ||
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,6 +34,8 @@ The following table dataSources the aggregation functions and also indicates how | |
| +----------+-------------+-------------+ | ||
| | LIST | Ignore | Ignore | | ||
| +----------+-------------+-------------+ | ||
| | VALUES | Ignore | Ignore | | ||
| +----------+-------------+-------------+ | ||
|
|
||
|
|
||
| Syntax | ||
|
|
@@ -577,6 +579,56 @@ Example with result field rename:: | |
| | ["Amber","Hattie","Nanette","Dale"] | | ||
| +-------------------------------------+ | ||
|
|
||
| VALUES | ||
| ------ | ||
|
|
||
| Description | ||
| >>>>>>>>>>> | ||
|
|
||
| Version: 3.3.0 (Calcite engine only) | ||
|
|
||
| Usage: VALUES(expr). Collects all unique values from the specified expression into a sorted array. Values are converted to strings, nulls are filtered, and duplicates are removed. | ||
|
|
||
| The maximum number of unique values returned is controlled by the ``plugins.ppl.values.max.limit`` setting: | ||
| * Default value is 0, which means unlimited values are returned | ||
| * Can be configured to any positive integer to limit the number of unique values | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated here: 50a4bf8 |
||
| * See the `PPL Settings <../admin/settings.rst#plugins-ppl-values-max-limit>`_ documentation for more details | ||
|
|
||
| * expr: The field expression to collect unique values from. | ||
| * This aggregation function doesn't support Array, Struct, Object field types. | ||
| * Returns distinct values only (no duplicates) | ||
|
||
| * Values are sorted in lexicographical order | ||
|
|
||
| Example with string fields:: | ||
|
|
||
| PPL> source=accounts | stats values(firstname); | ||
| fetched rows / total rows = 1/1 | ||
| +-------------------------------------+ | ||
| | values(firstname) | | ||
| |-------------------------------------| | ||
| | ["Amber","Dale","Hattie","Nanette"] | | ||
| +-------------------------------------+ | ||
|
|
||
| Example with numeric fields (sorted as strings):: | ||
|
|
||
| PPL> source=accounts | stats values(age); | ||
| fetched rows / total rows = 1/1 | ||
| +---------------------------+ | ||
| | values(age) | | ||
| |---------------------------| | ||
| | ["28","32","33","36","39"] | | ||
| +---------------------------+ | ||
|
|
||
| Example with result field rename:: | ||
|
|
||
| PPL> source=accounts | stats values(firstname) as unique_names; | ||
| fetched rows / total rows = 1/1 | ||
| +-------------------------------------+ | ||
| | unique_names | | ||
| |-------------------------------------| | ||
| | ["Amber","Dale","Hattie","Nanette"] | | ||
| +-------------------------------------+ | ||
|
|
||
| Example 1: Calculate the count of events | ||
| ======================================== | ||
|
|
||
|
|
@@ -833,3 +885,17 @@ PPL query:: | |
| | 1 | [email protected] | | ||
| +-----+-----------------------+ | ||
|
|
||
| Example 16: Collect unique values in a field using VALUES | ||
| ========================================================== | ||
|
|
||
| The example shows how to collect all unique firstname values, sorted lexicographically with duplicates removed. | ||
|
|
||
| PPL query:: | ||
|
|
||
| PPL> source=accounts | stats values(firstname); | ||
| fetched rows / total rows = 1/1 | ||
| +-------------------------------------+ | ||
| | values(firstname) | | ||
| |-------------------------------------| | ||
| | ["Amber","Dale","Hattie","Nanette"] | | ||
| +-------------------------------------+ | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
there is null check on line 52, but convertToString(value); can also return null, which one is correct?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Simplified this to
String stringValue = String.valueOf(value);There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
String.valueOf(value); will return "null", is it expected?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@penghuo For cases where
value ==> "null"(string value), we'll include "null" in output. I feel this should be fine. As these are not actual null values. These are null strings ingested by the users.For cases where
value ==> nullis handled in line 51.