diff --git a/DEVELOPER_GUIDE.rst b/DEVELOPER_GUIDE.rst index 85282d67d19..7482d0675d8 100644 --- a/DEVELOPER_GUIDE.rst +++ b/DEVELOPER_GUIDE.rst @@ -173,6 +173,7 @@ Here are other files and sub-folders that you are likely to touch: - ``build.gradle``: Gradle build script. - ``docs``: documentation for developers and reference manual for users. - ``doc-test``: code that run .rst docs in ``docs`` folder by Python doctest library. +- ``language-grammar``: centralized package for ANTLR grammar files. See `Language Grammar Package`_ for details. Note that other related project code has already merged into this single repository together: @@ -441,3 +442,29 @@ with an appropriate label `backport ` is merged to main wi PR. For example, if a PR on main needs to be backported to `1.x` branch, add a label `backport 1.x` to the PR and make sure the backport workflow runs on the PR along with other checks. Once this PR is merged to main, the workflow will create a backport PR to the `1.x` branch. + +Language Grammar Package +======================== + +The ``language-grammar`` package serves as a centralized repository for all ANTLR grammar files used throughout the OpenSearch SQL project. This package contains the definitive versions of grammar files for: + +- SQL parsing (``OpenSearchSQLParser.g4``, ``OpenSearchSQLLexer.g4``) +- PPL parsing (``OpenSearchPPLParser.g4``, ``OpenSearchPPLLexer.g4``) +- Legacy SQL parsing (``OpenSearchLegacySqlParser.g4``, ``OpenSearchLegacySqlLexer.g4``) +- Spark SQL extensions (``SparkSqlBase.g4``, ``FlintSparkSqlExtensions.g4``, ``SqlBaseParser.g4``, ``SqlBaseLexer.g4``) + +Purpose +------- + +The language-grammar package enables sharing of grammar files between the main SQL repository and the Spark repository, ensuring consistency and reducing duplication. Once updated, the package automatically triggers CI to upload the new version to Maven Central for consumption by other projects. 
+ +Updating Grammar Files +---------------------- + +When grammar files are modified in their respective modules (``sql/``, ``ppl/``, ``legacy/``, ``async-query-core/``), they must be manually copied to the ``language-grammar/src/main/antlr4/`` directory. + +**Workflow:** + +1. Modify grammar files in their source locations (e.g., ``sql/src/main/antlr/``) +2. Copy updated files to ``language-grammar/src/main/antlr4/`` +3. Commit changes to trigger automatic Maven publication via CI diff --git a/language-grammar/src/main/antlr4/OpenSearchPPLLexer.g4 b/language-grammar/src/main/antlr4/OpenSearchPPLLexer.g4 index b7dc4b7286d..4c37be2f318 100644 --- a/language-grammar/src/main/antlr4/OpenSearchPPLLexer.g4 +++ b/language-grammar/src/main/antlr4/OpenSearchPPLLexer.g4 @@ -13,9 +13,12 @@ options { caseInsensitive = true; } SEARCH: 'SEARCH'; DESCRIBE: 'DESCRIBE'; SHOW: 'SHOW'; +EXPLAIN: 'EXPLAIN'; FROM: 'FROM'; WHERE: 'WHERE'; FIELDS: 'FIELDS'; +FIELD: 'FIELD'; +TABLE: 'TABLE'; // Alias for FIELDS command RENAME: 'RENAME'; STATS: 'STATS'; EVENTSTATS: 'EVENTSTATS'; @@ -23,13 +26,14 @@ DEDUP: 'DEDUP'; SORT: 'SORT'; EVAL: 'EVAL'; HEAD: 'HEAD'; -TOP_APPROX: 'TOP_APPROX'; +BIN: 'BIN'; TOP: 'TOP'; -RARE_APPROX: 'RARE_APPROX'; RARE: 'RARE'; PARSE: 'PARSE'; -METHOD: 'METHOD'; +SPATH: 'SPATH'; REGEX: 'REGEX'; +REX: 'REX'; +SED: 'SED'; PUNCT: 'PUNCT'; GROK: 'GROK'; PATTERN: 'PATTERN'; @@ -39,10 +43,22 @@ KMEANS: 'KMEANS'; AD: 'AD'; ML: 'ML'; FILLNULL: 'FILLNULL'; -EXPAND: 'EXPAND'; FLATTEN: 'FLATTEN'; TRENDLINE: 'TRENDLINE'; +TIMECHART: 'TIMECHART'; APPENDCOL: 'APPENDCOL'; +EXPAND: 'EXPAND'; +SIMPLE_PATTERN: 'SIMPLE_PATTERN'; +BRAIN: 'BRAIN'; +VARIABLE_COUNT_THRESHOLD: 'VARIABLE_COUNT_THRESHOLD'; +FREQUENCY_THRESHOLD_PERCENTAGE: 'FREQUENCY_THRESHOLD_PERCENTAGE'; +METHOD: 'METHOD'; +MAX_SAMPLE_COUNT: 'MAX_SAMPLE_COUNT'; +MAX_MATCH: 'MAX_MATCH'; +OFFSET_FIELD: 'OFFSET_FIELD'; +BUFFER_LIMIT: 'BUFFER_LIMIT'; +LABEL: 'LABEL'; +AGGREGATION: 'AGGREGATION'; //Native JOIN KEYWORDS JOIN: 
'JOIN'; @@ -56,52 +72,35 @@ CROSS: 'CROSS'; LEFT_HINT: 'HINT.LEFT'; RIGHT_HINT: 'HINT.RIGHT'; -//CORRELATION KEYWORDS -CORRELATE: 'CORRELATE'; -SELF: 'SELF'; -EXACT: 'EXACT'; -APPROXIMATE: 'APPROXIMATE'; -SCOPE: 'SCOPE'; -MAPPING: 'MAPPING'; - -//EXPLAIN KEYWORDS -EXPLAIN: 'EXPLAIN'; -FORMATTED: 'FORMATTED'; -COST: 'COST'; -CODEGEN: 'CODEGEN'; -EXTENDED: 'EXTENDED'; -SIMPLE: 'SIMPLE'; - // COMMAND ASSIST KEYWORDS AS: 'AS'; BY: 'BY'; SOURCE: 'SOURCE'; INDEX: 'INDEX'; +A: 'A'; +ASC: 'ASC'; D: 'D'; DESC: 'DESC'; DATASOURCES: 'DATASOURCES'; USING: 'USING'; WITH: 'WITH'; +SIMPLE: 'SIMPLE'; +STANDARD: 'STANDARD'; +COST: 'COST'; +EXTENDED: 'EXTENDED'; +OVERRIDE: 'OVERRIDE'; +OVERWRITE: 'OVERWRITE'; // SORT FIELD KEYWORDS -// TODO #963: Implement 'num', 'str', and 'ip' sort syntax +// TODO #3180: Fix broken sort functionality AUTO: 'AUTO'; STR: 'STR'; -IP: 'IP'; NUM: 'NUM'; -// FIELDSUMMARY keywords -FIELDSUMMARY: 'FIELDSUMMARY'; -INCLUDEFIELDS: 'INCLUDEFIELDS'; -NULLS: 'NULLS'; - -//TRENDLINE KEYWORDS +// TRENDLINE KEYWORDS SMA: 'SMA'; WMA: 'WMA'; -// APPENDCOL options -OVERRIDE: 'OVERRIDE'; - // ARGUMENT KEYWORDS KEEPEMPTY: 'KEEPEMPTY'; CONSECUTIVE: 'CONSECUTIVE'; @@ -109,6 +108,7 @@ DEDUP_SPLITVALUES: 'DEDUP_SPLITVALUES'; PARTITIONS: 'PARTITIONS'; ALLNUM: 'ALLNUM'; DELIM: 'DELIM'; +BUCKET_NULLABLE: 'BUCKET_NULLABLE'; CENTROIDS: 'CENTROIDS'; ITERATIONS: 'ITERATIONS'; DISTANCE_TYPE: 'DISTANCE_TYPE'; @@ -124,6 +124,13 @@ TIME_ZONE: 'TIME_ZONE'; TRAINING_DATA_SIZE: 'TRAINING_DATA_SIZE'; ANOMALY_SCORE_THRESHOLD: 'ANOMALY_SCORE_THRESHOLD'; APPEND: 'APPEND'; +COUNTFIELD: 'COUNTFIELD'; +SHOWCOUNT: 'SHOWCOUNT'; +LIMIT: 'LIMIT'; +USEOTHER: 'USEOTHER'; +INPUT: 'INPUT'; +OUTPUT: 'OUTPUT'; +PATH: 'PATH'; // COMPARISON FUNCTION KEYWORDS CASE: 'CASE'; @@ -131,6 +138,9 @@ ELSE: 'ELSE'; IN: 'IN'; EXISTS: 'EXISTS'; +// Geo IP eval function +GEOIP: 'GEOIP'; + // LOGICAL KEYWORDS NOT: 'NOT'; OR: 'OR'; @@ -139,6 +149,7 @@ XOR: 'XOR'; TRUE: 'TRUE'; FALSE: 'FALSE'; REGEXP: 'REGEXP'; 
+REGEX_MATCH: 'REGEX_MATCH'; // DATETIME, INTERVAL AND UNIT KEYWORDS CONVERT_TZ: 'CONVERT_TZ'; @@ -186,12 +197,14 @@ LONG: 'LONG'; FLOAT: 'FLOAT'; STRING: 'STRING'; BOOLEAN: 'BOOLEAN'; +IP: 'IP'; // SPECIAL CHARACTERS AND OPERATORS PIPE: '|'; COMMA: ','; DOT: '.'; EQUAL: '='; +DOUBLE_EQUAL: '=='; GREATER: '>'; LESS: '<'; NOT_GREATER: '<' '='; @@ -208,6 +221,8 @@ LT_PRTHS: '('; RT_PRTHS: ')'; LT_SQR_PRTHS: '['; RT_SQR_PRTHS: ']'; +LT_CURLY: '{'; +RT_CURLY: '}'; SINGLE_QUOTE: '\''; DOUBLE_QUOTE: '"'; BACKTICK: '`'; @@ -240,11 +255,12 @@ VAR_SAMP: 'VAR_SAMP'; VAR_POP: 'VAR_POP'; STDDEV_SAMP: 'STDDEV_SAMP'; STDDEV_POP: 'STDDEV_POP'; +PERC: 'PERC'; PERCENTILE: 'PERCENTILE'; PERCENTILE_APPROX: 'PERCENTILE_APPROX'; +EARLIEST: 'EARLIEST'; +LATEST: 'LATEST'; TAKE: 'TAKE'; -FIRST: 'FIRST'; -LAST: 'LAST'; LIST: 'LIST'; VALUES: 'VALUES'; PER_DAY: 'PER_DAY'; @@ -256,7 +272,22 @@ SPARKLINE: 'SPARKLINE'; C: 'C'; DC: 'DC'; +// SCALAR WINDOW FUNCTIONS +ROW_NUMBER: 'ROW_NUMBER'; +RANK: 'RANK'; +DENSE_RANK: 'DENSE_RANK'; +PERCENT_RANK: 'PERCENT_RANK'; +CUME_DIST: 'CUME_DIST'; +FIRST: 'FIRST'; +LAST: 'LAST'; +NTH: 'NTH'; +NTILE: 'NTILE'; + // BASIC FUNCTIONS +PLUS_FUCTION: 'ADD'; +MINUS_FUCTION: 'SUBTRACT'; +STAR_FUNCTION: 'MULTIPLY'; +DIVIDE_FUNCTION: 'DIVIDE'; ABS: 'ABS'; CBRT: 'CBRT'; CEIL: 'CEIL'; @@ -265,12 +296,13 @@ CONV: 'CONV'; CRC32: 'CRC32'; E: 'E'; EXP: 'EXP'; +EXPM1: 'EXPM1'; FLOOR: 'FLOOR'; LN: 'LN'; LOG: 'LOG'; -LOG10: 'LOG10'; -LOG2: 'LOG2'; +LOG_WITH_BASE: ([0-9]+ ('.' [0-9]+)?)? ('LOG' | 'log') [0-9]+ ('.' 
[0-9]+)?; MOD: 'MOD'; +MODULUS: 'MODULUS'; PI: 'PI'; POSITION: 'POSITION'; POW: 'POW'; @@ -278,9 +310,10 @@ POWER: 'POWER'; RAND: 'RAND'; ROUND: 'ROUND'; SIGN: 'SIGN'; -SIGNUM: 'SIGNUM'; SQRT: 'SQRT'; TRUNCATE: 'TRUNCATE'; +RINT: 'RINT'; +SIGNUM: 'SIGNUM'; // TRIGONOMETRIC FUNCTIONS ACOS: 'ACOS'; @@ -288,10 +321,12 @@ ASIN: 'ASIN'; ATAN: 'ATAN'; ATAN2: 'ATAN2'; COS: 'COS'; +COSH: 'COSH'; COT: 'COT'; DEGREES: 'DEGREES'; RADIANS: 'RADIANS'; SIN: 'SIN'; +SINH: 'SINH'; TAN: 'TAN'; // CRYPTOGRAPHIC FUNCTIONS @@ -306,7 +341,6 @@ CURDATE: 'CURDATE'; CURRENT_DATE: 'CURRENT_DATE'; CURRENT_TIME: 'CURRENT_TIME'; CURRENT_TIMESTAMP: 'CURRENT_TIMESTAMP'; -CURRENT_TIMEZONE: 'CURRENT_TIMEZONE'; CURTIME: 'CURTIME'; DATE: 'DATE'; DATEDIFF: 'DATEDIFF'; @@ -319,7 +353,6 @@ DAYOFWEEK: 'DAYOFWEEK'; DAYOFYEAR: 'DAYOFYEAR'; DAY_OF_MONTH: 'DAY_OF_MONTH'; DAY_OF_WEEK: 'DAY_OF_WEEK'; -DURATION: 'DURATION'; EXTRACT: 'EXTRACT'; FROM_DAYS: 'FROM_DAYS'; FROM_UNIXTIME: 'FROM_UNIXTIME'; @@ -328,7 +361,6 @@ LAST_DAY: 'LAST_DAY'; LOCALTIME: 'LOCALTIME'; LOCALTIMESTAMP: 'LOCALTIMESTAMP'; MAKEDATE: 'MAKEDATE'; -MAKE_DATE: 'MAKE_DATE'; MAKETIME: 'MAKETIME'; MONTHNAME: 'MONTHNAME'; NOW: 'NOW'; @@ -355,11 +387,6 @@ UTC_TIMESTAMP: 'UTC_TIMESTAMP'; WEEKDAY: 'WEEKDAY'; YEARWEEK: 'YEARWEEK'; -// RELATIVE TIME FUNCTIONS -RELATIVE_TIMESTAMP: 'RELATIVE_TIMESTAMP'; -EARLIEST: 'EARLIEST'; -LATEST: 'LATEST'; - // TEXT FUNCTIONS SUBSTR: 'SUBSTR'; SUBSTRING: 'SUBSTRING'; @@ -381,67 +408,45 @@ REPLACE: 'REPLACE'; REVERSE: 'REVERSE'; CAST: 'CAST'; -// JSON TEXT FUNCTIONS -JSON: 'JSON'; -JSON_OBJECT: 'JSON_OBJECT'; -JSON_ARRAY: 'JSON_ARRAY'; -JSON_ARRAY_LENGTH: 'JSON_ARRAY_LENGTH'; -TO_JSON_STRING: 'TO_JSON_STRING'; -JSON_EXTRACT: 'JSON_EXTRACT'; -JSON_DELETE : 'JSON_DELETE'; -JSON_KEYS: 'JSON_KEYS'; -JSON_VALID: 'JSON_VALID'; -JSON_APPEND: 'JSON_APPEND'; -JSON_EXTEND : 'JSON_EXTEND'; -JSON_SET: 'JSON_SET'; -//JSON_ARRAY_ALL_MATCH: 'JSON_ARRAY_ALL_MATCH'; -//JSON_ARRAY_ANY_MATCH: 'JSON_ARRAY_ANY_MATCH'; 
-//JSON_ARRAY_FILTER: 'JSON_ARRAY_FILTER'; -//JSON_ARRAY_MAP: 'JSON_ARRAY_MAP'; -//JSON_ARRAY_REDUCE: 'JSON_ARRAY_REDUCE'; +// BOOL FUNCTIONS +LIKE: 'LIKE'; +ISNULL: 'ISNULL'; +ISNOTNULL: 'ISNOTNULL'; +CIDRMATCH: 'CIDRMATCH'; +BETWEEN: 'BETWEEN'; +ISPRESENT: 'ISPRESENT'; +ISEMPTY: 'ISEMPTY'; +ISBLANK: 'ISBLANK'; // COLLECTION FUNCTIONS ARRAY: 'ARRAY'; ARRAY_LENGTH: 'ARRAY_LENGTH'; - -// LAMBDA FUNCTIONS -//EXISTS: 'EXISTS'; +MVJOIN: 'MVJOIN'; FORALL: 'FORALL'; FILTER: 'FILTER'; TRANSFORM: 'TRANSFORM'; REDUCE: 'REDUCE'; -// BOOL FUNCTIONS -LIKE: 'LIKE'; -ISNULL: 'ISNULL'; -ISNOTNULL: 'ISNOTNULL'; -BETWEEN: 'BETWEEN'; -CIDRMATCH: 'CIDRMATCH'; -ISPRESENT: 'ISPRESENT'; -ISEMPTY: 'ISEMPTY'; -ISBLANK: 'ISBLANK'; +// JSON FUNCTIONS +JSON_VALID: 'JSON_VALID'; +JSON: 'JSON'; +JSON_OBJECT: 'JSON_OBJECT'; +JSON_ARRAY: 'JSON_ARRAY'; +JSON_ARRAY_LENGTH: 'JSON_ARRAY_LENGTH'; +JSON_EXTRACT: 'JSON_EXTRACT'; +JSON_KEYS: 'JSON_KEYS'; +JSON_SET: 'JSON_SET'; +JSON_DELETE: 'JSON_DELETE'; +JSON_APPEND: 'JSON_APPEND'; +JSON_EXTEND: 'JSON_EXTEND'; // FLOWCONTROL FUNCTIONS IFNULL: 'IFNULL'; NULLIF: 'NULLIF'; IF: 'IF'; TYPEOF: 'TYPEOF'; - -//OTHER CONDITIONAL EXPRESSIONS COALESCE: 'COALESCE'; -//GEOLOCATION FUNCTIONS -GEOIP: 'GEOIP'; - -//GEOLOCATION PROPERTIES -COUNTRY_ISO_CODE: 'COUNTRY_ISO_CODE'; -COUNTRY_NAME: 'COUNTRY_NAME'; -CONTINENT_NAME: 'CONTINENT_NAME'; -REGION_ISO_CODE: 'REGION_ISO_CODE'; -REGION_NAME: 'REGION_NAME'; -CITY_NAME: 'CITY_NAME'; -LOCATION: 'LOCATION'; - // RELEVANCE FUNCTIONS AND PARAMETERS MATCH: 'MATCH'; MATCH_PHRASE: 'MATCH_PHRASE'; @@ -485,6 +490,11 @@ ZERO_TERMS_QUERY: 'ZERO_TERMS_QUERY'; // SPAN KEYWORDS SPAN: 'SPAN'; +BINS: 'BINS'; +MINSPAN: 'MINSPAN'; +START: 'START'; +END: 'END'; +ALIGNTIME: 'ALIGNTIME'; MS: 'MS'; S: 'S'; M: 'M'; @@ -493,6 +503,26 @@ W: 'W'; Q: 'Q'; Y: 'Y'; +// Extended timescale units +SEC: 'SEC'; +SECS: 'SECS'; +SECONDS: 'SECONDS'; +MINS: 'MINS'; +MINUTES: 'MINUTES'; +HR: 'HR'; +HRS: 'HRS'; +HOURS: 'HOURS'; +DAYS: 'DAYS'; +MON: 'MON'; 
+MONTHS: 'MONTHS'; +US: 'US'; +CS: 'CS'; +DS: 'DS'; + + +// PERCENTILE SHORTCUT FUNCTIONS +// Must precede ID to avoid conflicts with identifier matching +PERCENTILE_SHORTCUT: PERC(INTEGER_LITERAL | DECIMAL_LITERAL) | 'P'(INTEGER_LITERAL | DECIMAL_LITERAL); // LITERALS AND VALUES //STRING_LITERAL: DQUOTA_STRING | SQUOTA_STRING | BQUOTA_STRING; @@ -500,9 +530,10 @@ ID: ID_LITERAL; CLUSTER: CLUSTER_PREFIX_LITERAL; INTEGER_LITERAL: DEC_DIGIT+; DECIMAL_LITERAL: (DEC_DIGIT+)? '.' DEC_DIGIT+; +FLOAT_LITERAL: (DEC_DIGIT+)? '.' DEC_DIGIT+ 'F'; +DOUBLE_LITERAL: (DEC_DIGIT+)? '.' DEC_DIGIT+ 'D'; fragment DATE_SUFFIX: ([\-.][*0-9]+)+; -fragment ID_LITERAL: [@*A-Z]+?[*A-Z_\-0-9]*; fragment CLUSTER_PREFIX_LITERAL: [*A-Z]+?[*A-Z_\-0-9]* COLON; ID_DATE_SUFFIX: CLUSTER_PREFIX_LITERAL? ID_LITERAL DATE_SUFFIX; DQUOTA_STRING: '"' ( '\\'. | '""' | ~('"'| '\\') )* '"'; @@ -510,6 +541,10 @@ SQUOTA_STRING: '\'' ('\\'. | '\'\'' | ~('\'' | '\\'))* '\'' BQUOTA_STRING: '`' ( '\\'. | '``' | ~('`'|'\\'))* '`'; fragment DEC_DIGIT: [0-9]; +// Identifiers cannot start with a single '_' since this is an OpenSearch reserved +// metadata field. Two underscores (or more) is acceptable, such as '__field'. +fragment ID_LITERAL: ([@*A-Z_])+?[*A-Z_\-0-9]*; + LINE_COMMENT: '//' ('\\\n' | ~[\r\n])* '\r'? '\n'? -> channel(HIDDEN); BLOCK_COMMENT: '/*' .*? '*/' -> channel(HIDDEN); diff --git a/language-grammar/src/main/antlr4/OpenSearchPPLParser.g4 b/language-grammar/src/main/antlr4/OpenSearchPPLParser.g4 index cae57b53181..d5cb4e3452b 100644 --- a/language-grammar/src/main/antlr4/OpenSearchPPLParser.g4 +++ b/language-grammar/src/main/antlr4/OpenSearchPPLParser.g4 @@ -3,168 +3,233 @@ * SPDX-License-Identifier: Apache-2.0 */ + parser grammar OpenSearchPPLParser; options { tokenVocab = OpenSearchPPLLexer; } + root : pplStatement? EOF ; // statement pplStatement - : dmlStatement - ; - -dmlStatement - : (explainCommand PIPE)? 
queryStatement + : explainStatement + | queryStatement ; queryStatement : pplCommands (PIPE commands)* ; +explainStatement + : EXPLAIN (explainMode)? queryStatement + ; + +explainMode + : SIMPLE + | STANDARD + | COST + | EXTENDED + ; + subSearch : searchCommand (PIPE commands)* ; // commands pplCommands - : searchCommand - | describeCommand + : describeCommand + | showDataSourcesCommand + | searchCommand ; commands : whereCommand - | correlateCommand - | joinCommand | fieldsCommand + | tableCommand + | joinCommand + | renameCommand | statsCommand + | eventstatsCommand | dedupCommand | sortCommand + | evalCommand | headCommand + | binCommand | topCommand | rareCommand - | evalCommand | grokCommand | parseCommand + | spathCommand | patternsCommand | lookupCommand - | renameCommand + | kmeansCommand + | adCommand + | mlCommand | fillnullCommand - | fieldsummaryCommand - | flattenCommand - | expandCommand | trendlineCommand | appendcolCommand + | appendCommand + | expandCommand + | flattenCommand + | reverseCommand + | regexCommand + | timechartCommand + | rexCommand ; commandName : SEARCH | DESCRIBE | SHOW - | AD - | ML - | KMEANS | WHERE - | CORRELATE - | JOIN | FIELDS + | TABLE + | JOIN + | RENAME | STATS | EVENTSTATS | DEDUP - | EXPLAIN | SORT + | EVAL | HEAD + | BIN | TOP - | TOP_APPROX | RARE - | RARE_APPROX - | EVAL | GROK | PARSE | PATTERNS | LOOKUP - | RENAME - | EXPAND + | KMEANS + | AD + | ML | FILLNULL - | FIELDSUMMARY + | EXPAND | FLATTEN | TRENDLINE - | APPENDCOL + | TIMECHART + | EXPLAIN + | REVERSE + | REGEX + | APPEND + | REX ; searchCommand - : (SEARCH)? fromClause # searchFrom - | (SEARCH)? fromClause logicalExpression # searchFromFilter - | (SEARCH)? logicalExpression fromClause # searchFilterFrom + : (SEARCH)? 
(searchExpression)* fromClause (searchExpression)* # searchFrom + ; + +searchExpression + : LT_PRTHS searchExpression RT_PRTHS # groupedExpression + | NOT searchExpression # notExpression + | searchExpression OR searchExpression # orExpression + | searchExpression AND searchExpression # andExpression + | searchTerm # termExpression + ; + +searchTerm + : searchFieldComparison # searchComparisonTerm + | searchFieldInList # searchInListTerm + | searchLiteral # searchLiteralTerm + ; + +// Unified search literal for both free text and field comparisons +searchLiteral + : numericLiteral + | booleanLiteral + | ID + | stringLiteral + | searchableKeyWord ; -fieldsummaryCommand - : FIELDSUMMARY (fieldsummaryParameter)* - ; +searchFieldComparison + : fieldExpression searchComparisonOperator searchLiteral # searchFieldCompare + ; + +searchFieldInList + : fieldExpression IN LT_PRTHS searchLiteralList RT_PRTHS # searchFieldInValues + ; + +searchLiteralList + : searchLiteral (COMMA searchLiteral)* # searchLiterals + ; + +searchComparisonOperator + : EQUAL # equals + | NOT_EQUAL # notEquals + | LESS # lessThan + | NOT_GREATER # lessOrEqual + | GREATER # greaterThan + | NOT_LESS # greaterOrEqual + ; -fieldsummaryParameter - : INCLUDEFIELDS EQUAL fieldList # fieldsummaryIncludeFields - | NULLS EQUAL booleanLiteral # fieldsummaryNulls - ; describeCommand : DESCRIBE tableSourceClause ; -explainCommand - : EXPLAIN explainMode - ; - -explainMode - : FORMATTED - | COST - | CODEGEN - | EXTENDED - | SIMPLE - ; - showDataSourcesCommand - : SHOW DATASOURCES - ; + : SHOW DATASOURCES + ; whereCommand - : WHERE logicalExpression - ; - -correlateCommand - : CORRELATE correlationType FIELDS LT_PRTHS fieldList RT_PRTHS (scopeClause)? mappingList - ; - -correlationType - : SELF - | EXACT - | APPROXIMATE - ; + : WHERE logicalExpression + ; -scopeClause - : SCOPE LT_PRTHS fieldExpression COMMA value = literalValue (unit = timespanUnit)? 
RT_PRTHS - ; +fieldsCommand + : FIELDS fieldsCommandBody + ; -mappingList - : MAPPING LT_PRTHS ( mappingClause (COMMA mappingClause)* ) RT_PRTHS - ; +// Table command - alias for fields command +tableCommand + : TABLE fieldsCommandBody + ; -mappingClause - : left = qualifiedName comparisonOperator right = qualifiedName # mappingCompareExpr - ; +fieldsCommandBody + : (PLUS | MINUS)? wcFieldList + ; -fieldsCommand - : FIELDS (PLUS | MINUS)? fieldList +// Wildcard field list supporting both comma-separated and space-separated fields +wcFieldList + : selectFieldExpression (COMMA? selectFieldExpression)* ; renameCommand - : RENAME renameClasue (COMMA renameClasue)* + : RENAME renameClasue (COMMA? renameClasue)* ; statsCommand - : (STATS | EVENTSTATS) (PARTITIONS EQUAL partitions = integerLiteral)? (ALLNUM EQUAL allnum = booleanLiteral)? (DELIM EQUAL delim = stringLiteral)? statsAggTerm (COMMA statsAggTerm)* (statsByClause)? (DEDUP_SPLITVALUES EQUAL dedupsplit = booleanLiteral)? + : STATS statsArgs statsAggTerm (COMMA statsAggTerm)* (statsByClause)? (dedupSplitArg)? + ; + +statsArgs + : (partitionsArg | allnumArg | delimArg | bucketNullableArg)* + ; + +partitionsArg + : PARTITIONS EQUAL partitions = integerLiteral + ; + +allnumArg + : ALLNUM EQUAL allnum = booleanLiteral + ; + +delimArg + : DELIM EQUAL delim = stringLiteral + ; + +bucketNullableArg + : BUCKET_NULLABLE EQUAL bucket_nullable = booleanLiteral + ; + +dedupSplitArg + : DEDUP_SPLITVALUES EQUAL dedupsplit = booleanLiteral + ; + +eventstatsCommand + : EVENTSTATS eventstatsAggTerm (COMMA eventstatsAggTerm)* (statsByClause)? ; dedupCommand @@ -172,7 +237,30 @@ dedupCommand ; sortCommand - : SORT sortbyClause + : SORT (count = integerLiteral)? sortbyClause (ASC | A | DESC | D)? + ; + +reverseCommand + : REVERSE + ; + +timechartCommand + : TIMECHART timechartParameter* statsFunction (BY fieldExpression)? 
+ ; + +timechartParameter + : (spanClause | SPAN EQUAL spanLiteral) + | timechartArg + ; + +timechartArg + : LIMIT EQUAL integerLiteral + | USEOTHER EQUAL (booleanLiteral | ident) + ; + +spanLiteral + : integerLiteral timespanUnit + | stringLiteral ; evalCommand @@ -183,12 +271,42 @@ headCommand : HEAD (number = integerLiteral)? (FROM from = integerLiteral)? ; +binCommand + : BIN fieldExpression binOption* (AS alias = qualifiedName)? + ; + +binOption + : SPAN EQUAL span = spanValue + | BINS EQUAL bins = integerLiteral + | MINSPAN EQUAL minspan = literalValue (minspanUnit = timespanUnit)? + | ALIGNTIME EQUAL aligntime = aligntimeValue + | START EQUAL start = numericLiteral + | END EQUAL end = numericLiteral + ; + +aligntimeValue + : EARLIEST + | LATEST + | literalValue + ; + +spanValue + : literalValue (timespanUnit)? # numericSpanValue + | logSpanValue # logBasedSpanValue + | ident timespanUnit # extendedTimeSpanValue + | ident # identifierSpanValue + ; + +logSpanValue + : LOG_WITH_BASE # logWithBaseSpan + ; + topCommand - : (TOP | TOP_APPROX) (number = integerLiteral)? fieldList (byClause)? + : TOP (number = integerLiteral)? (COUNTFIELD EQUAL countfield = stringLiteral)? (SHOWCOUNT EQUAL showcount = booleanLiteral)? fieldList (byClause)? ; rareCommand - : (RARE | RARE_APPROX) (number = integerLiteral)? fieldList (byClause)? + : RARE (number = integerLiteral)? (COUNTFIELD EQUAL countfield = stringLiteral)? (SHOWCOUNT EQUAL showcount = booleanLiteral)? fieldList (byClause)? ; grokCommand @@ -199,20 +317,73 @@ parseCommand : PARSE (source_field = expression) (pattern = stringLiteral) ; -patternsCommand - : PATTERNS (patternsParameter)* (source_field = expression) +spathCommand + : SPATH spathParameter* ; -patternsParameter - : (NEW_FIELD EQUAL new_field = stringLiteral) - | (PATTERN EQUAL pattern = stringLiteral) +spathParameter + : (INPUT EQUAL input = expression) + | (OUTPUT EQUAL output = expression) + | ((PATH EQUAL)? 
path = indexablePath) ; +indexablePath + : pathElement (DOT pathElement)* + ; + +pathElement + : ident pathArrayAccess? + ; + +pathArrayAccess + : LT_CURLY (INTEGER_LITERAL)? RT_CURLY + ; +regexCommand + : REGEX regexExpr + ; + +regexExpr + : field=qualifiedName operator=(EQUAL | NOT_EQUAL) pattern=stringLiteral + ; + +rexCommand + : REX rexExpr + ; + +rexExpr + : FIELD EQUAL field=qualifiedName (rexOption)* pattern=stringLiteral (rexOption)* + ; + +rexOption + : MAX_MATCH EQUAL maxMatch=integerLiteral + | MODE EQUAL (EXTRACT | SED) + | OFFSET_FIELD EQUAL offsetField=qualifiedName + ; patternsMethod : PUNCT | REGEX ; +patternsCommand + : PATTERNS (source_field = expression) (statsByClause)? (METHOD EQUAL method = patternMethod)? (MODE EQUAL pattern_mode = patternMode)? (MAX_SAMPLE_COUNT EQUAL max_sample_count = integerLiteral)? (BUFFER_LIMIT EQUAL buffer_limit = integerLiteral)? (NEW_FIELD EQUAL new_field = stringLiteral)? (patternsParameter)* + ; + +patternsParameter + : (PATTERN EQUAL pattern = stringLiteral) + | (VARIABLE_COUNT_THRESHOLD EQUAL variable_count_threshold = integerLiteral) + | (FREQUENCY_THRESHOLD_PERCENTAGE EQUAL frequency_threshold_percentage = decimalLiteral) + ; + +patternMethod + : SIMPLE_PATTERN + | BRAIN + ; + +patternMode + : LABEL + | AGGREGATION + ; + // lookup lookupCommand : LOOKUP tableSource lookupMappingList ((APPEND | REPLACE) outputCandidateList)? @@ -235,36 +406,28 @@ lookupPair ; fillnullCommand - : FILLNULL (fillNullWithTheSameValue - | fillNullWithFieldVariousValues) + : FILLNULL fillNullWith + | FILLNULL fillNullUsing ; -fillNullWithTheSameValue - : WITH nullReplacement = valueExpression IN nullableFieldList = fieldList +fillNullWith + : WITH replacement = valueExpression (IN fieldList)? 
; -fillNullWithFieldVariousValues - : USING nullableReplacementExpression (COMMA nullableReplacementExpression)* +fillNullUsing + : USING replacementPair (COMMA replacementPair)* ; -nullableReplacementExpression - : nullableField = fieldExpression EQUAL nullableReplacement = valueExpression +replacementPair + : fieldExpression EQUAL replacement = valueExpression ; -expandCommand - : EXPAND fieldExpression (AS alias = qualifiedName)? - ; - -flattenCommand - : FLATTEN fieldExpression (AS alias = identifierSeq)? - ; - trendlineCommand : TRENDLINE (SORT sortField)? trendlineClause (trendlineClause)* ; trendlineClause - : trendlineType LT_PRTHS numberOfDataPoints = INTEGER_LITERAL COMMA field = fieldExpression RT_PRTHS (AS alias = qualifiedName)? + : trendlineType LT_PRTHS numberOfDataPoints = integerLiteral COMMA field = fieldExpression RT_PRTHS (AS alias = qualifiedName)? ; trendlineType @@ -272,10 +435,22 @@ trendlineType | WMA ; +expandCommand + : EXPAND fieldExpression (AS alias = qualifiedName)? + ; + +flattenCommand + : FLATTEN fieldExpression (AS aliases = identifierSeq)? + ; + appendcolCommand : APPENDCOL (OVERRIDE EQUAL override = booleanLiteral)? LT_SQR_PRTHS commands (PIPE commands)* RT_SQR_PRTHS ; +appendCommand + : APPEND LT_SQR_PRTHS searchCommand? (PIPE commands)* RT_SQR_PRTHS + ; + kmeansCommand : KMEANS (kmeansParameter)* ; @@ -317,6 +492,10 @@ mlArg fromClause : SOURCE EQUAL tableOrSubqueryClause | INDEX EQUAL tableOrSubqueryClause + | SOURCE EQUAL tableFunction + | INDEX EQUAL tableFunction + | SOURCE EQUAL dynamicSourceClause + | INDEX EQUAL dynamicSourceClause ; tableOrSubqueryClause @@ -324,36 +503,64 @@ tableOrSubqueryClause | tableSourceClause ; -// One tableSourceClause will generate one Relation node with/without one alias -// even if the relation contains more than one table sources. -// These table sources in one relation will be readed one by one in OpenSearch. -// But it may have different behaivours in different execution backends. 
-// For example, a Spark UnresovledRelation node only accepts one data source. tableSourceClause : tableSource (COMMA tableSource)* (AS alias = qualifiedName)? ; +dynamicSourceClause + : LT_SQR_PRTHS sourceReferences (COMMA sourceFilterArgs)? RT_SQR_PRTHS + ; + +sourceReferences + : sourceReference (COMMA sourceReference)* + ; + +sourceReference + : (CLUSTER)? wcQualifiedName + ; + +sourceFilterArgs + : sourceFilterArg (COMMA sourceFilterArg)* + ; + +sourceFilterArg + : ident EQUAL literalValue + | ident IN valueList + ; + // join joinCommand - : (joinType) JOIN sideAlias joinHintList? joinCriteria? right = tableOrSubqueryClause + : JOIN (joinOption)* (fieldList)? right = tableOrSubqueryClause + | sqlLikeJoinType? JOIN (joinOption)* sideAlias joinHintList? joinCriteria right = tableOrSubqueryClause ; -joinType - : INNER? +sqlLikeJoinType + : INNER | CROSS - | LEFT OUTER? + | (LEFT OUTER? | OUTER) | RIGHT OUTER? | FULL OUTER? | LEFT? SEMI | LEFT? ANTI ; +joinType + : INNER + | CROSS + | OUTER + | LEFT + | RIGHT + | FULL + | SEMI + | ANTI + ; + sideAlias : (LEFT EQUAL leftAlias = qualifiedName)? COMMA? (RIGHT EQUAL rightAlias = qualifiedName)? 
; joinCriteria - : ON logicalExpression + : (ON | WHERE) logicalExpression ; joinHintList @@ -365,8 +572,14 @@ hintPair | rightHintKey = RIGHT_HINT DOT ID EQUAL rightHintValue = ident #rightHint ; +joinOption + : OVERWRITE EQUAL booleanLiteral # overwriteOption + | TYPE EQUAL joinType # typeOption + | MAX EQUAL integerLiteral # maxOption + ; + renameClasue - : orignalField = wcFieldExpression AS renamedField = wcFieldExpression + : orignalField = renameFieldExpression AS renamedField = renameFieldExpression ; byClause @@ -377,6 +590,7 @@ statsByClause : BY fieldList | BY bySpanClause | BY bySpanClause COMMA fieldList + | BY fieldList COMMA bySpanClause ; bySpanClause @@ -392,12 +606,34 @@ sortbyClause ; evalClause - : fieldExpression EQUAL expression - | geoipCommand + : fieldExpression EQUAL logicalExpression ; -geoipCommand - : fieldExpression EQUAL GEOIP LT_PRTHS ipAddress = functionArg (COMMA properties = geoIpPropertyList)? RT_PRTHS +eventstatsAggTerm + : windowFunction (AS alias = wcFieldExpression)? + ; + +windowFunction + : windowFunctionName LT_PRTHS functionArgs RT_PRTHS + ; + +windowFunctionName + : statsFunctionName + | scalarWindowFunctionName + ; + +scalarWindowFunctionName + : ROW_NUMBER + | RANK + | DENSE_RANK + | PERCENT_RANK + | CUME_DIST + | FIRST + | LAST + | NTH + | NTILE + | DISTINCT_COUNT + | DC ; // aggregation terms @@ -407,10 +643,13 @@ statsAggTerm // aggregation functions statsFunction - : statsFunctionName LT_PRTHS valueExpression RT_PRTHS # statsFunctionCall - | COUNT LT_PRTHS RT_PRTHS # countAllFunctionCall - | (DISTINCT_COUNT | DC | DISTINCT_COUNT_APPROX) LT_PRTHS valueExpression RT_PRTHS # distinctCountFunctionCall - | percentileFunctionName = (PERCENTILE | PERCENTILE_APPROX) LT_PRTHS valueExpression COMMA percent = integerLiteral RT_PRTHS # percentileFunctionCall + : (COUNT | C) LT_PRTHS evalExpression RT_PRTHS # countEvalFunctionCall + | (COUNT | C) (LT_PRTHS RT_PRTHS)? 
# countAllFunctionCall + | PERCENTILE_SHORTCUT LT_PRTHS valueExpression RT_PRTHS # percentileShortcutFunctionCall + | (DISTINCT_COUNT | DC | DISTINCT_COUNT_APPROX) LT_PRTHS valueExpression RT_PRTHS # distinctCountFunctionCall + | takeAggFunction # takeAggFunctionCall + | percentileApproxFunction # percentileApproxFunctionCall + | statsFunctionName LT_PRTHS functionArgs RT_PRTHS # statsFunctionCall ; statsFunctionName @@ -419,72 +658,89 @@ statsFunctionName | SUM | MIN | MAX + | VAR_SAMP + | VAR_POP | STDDEV_SAMP | STDDEV_POP + | PERCENTILE + | PERCENTILE_APPROX + | MEDIAN + | LIST + | FIRST + | EARLIEST + | LATEST + | LAST ; -// expressions -expression - : logicalExpression - | valueExpression +takeAggFunction + : TAKE LT_PRTHS fieldExpression (COMMA size = integerLiteral)? RT_PRTHS + ; + +percentileApproxFunction + : (PERCENTILE | PERCENTILE_APPROX) LT_PRTHS aggField = valueExpression + COMMA percent = numericLiteral (COMMA compression = numericLiteral)? RT_PRTHS ; +numericLiteral + : integerLiteral + | decimalLiteral + | doubleLiteral + | floatLiteral + ; + +// predicates logicalExpression : NOT logicalExpression # logicalNot - | LT_PRTHS logicalExpression RT_PRTHS # parentheticLogicalExpr - | comparisonExpression # comparsion - | left = logicalExpression (AND)? right = logicalExpression # logicalAnd - | left = logicalExpression OR right = logicalExpression # logicalOr + | left = logicalExpression AND right = logicalExpression # logicalAnd | left = logicalExpression XOR right = logicalExpression # logicalXor - | booleanExpression # booleanExpr - ; - -comparisonExpression - : left = valueExpression comparisonOperator right = valueExpression # compareExpr - | valueExpression NOT? IN valueList # inExpr - | expr1 = functionArg NOT? 
BETWEEN expr2 = functionArg AND expr3 = functionArg # between + | left = logicalExpression OR right = logicalExpression # logicalOr + | expression # logicalExpr ; -valueExpressionList - : valueExpression - | LT_PRTHS valueExpression (COMMA valueExpression)* RT_PRTHS +expression + : valueExpression # valueExpr + | relevanceExpression # relevanceExpr + | left = expression comparisonOperator right = expression # compareExpr + | expression NOT? IN valueList # inExpr + | expression NOT? BETWEEN expression AND expression # between ; valueExpression - : left = valueExpression binaryOperator = (STAR | DIVIDE | MODULE) right = valueExpression # binaryArithmetic - | left = valueExpression binaryOperator = (PLUS | MINUS) right = valueExpression # binaryArithmetic - | primaryExpression # valueExpressionDefault - | positionFunction # positionFunctionCall - | caseFunction # caseExpr - | timestampFunction # timestampFunctionCall - | LT_PRTHS valueExpression RT_PRTHS # parentheticValueExpr - | LT_SQR_PRTHS subSearch RT_SQR_PRTHS # scalarSubqueryExpr - | ident ARROW expression # lambda - | LT_PRTHS ident (COMMA ident)+ RT_PRTHS ARROW expression # lambda - ; - -primaryExpression + : left = valueExpression binaryOperator = (STAR | DIVIDE | MODULE) right = valueExpression # binaryArithmetic + | left = valueExpression binaryOperator = (PLUS | MINUS) right = valueExpression # binaryArithmetic + | literalValue # literalValueExpr + | functionCall # functionCallExpr + | lambda # lambdaExpr + | LT_SQR_PRTHS subSearch RT_SQR_PRTHS # scalarSubqueryExpr + | valueExpression NOT? IN LT_SQR_PRTHS subSearch RT_SQR_PRTHS # inSubqueryExpr + | LT_PRTHS valueExpression (COMMA valueExpression)* RT_PRTHS NOT? 
IN LT_SQR_PRTHS subSearch RT_SQR_PRTHS # inSubqueryExpr + | EXISTS LT_SQR_PRTHS subSearch RT_SQR_PRTHS # existsSubqueryExpr + | fieldExpression # fieldExpr + | LT_PRTHS logicalExpression RT_PRTHS # nestedValueExpr + ; + +evalExpression + : EVAL LT_PRTHS logicalExpression RT_PRTHS + ; + +functionCall : evalFunctionCall - | fieldExpression - | literalValue | dataTypeFunctionCall + | positionFunctionCall + | caseFunctionCall + | timestampFunctionCall + | extractFunctionCall + | getFormatFunctionCall ; -positionFunction +positionFunctionCall : positionFunctionName LT_PRTHS functionArg IN functionArg RT_PRTHS ; -booleanExpression - : booleanFunctionCall # booleanFunctionCallExpr - | valueExpressionList NOT? IN LT_SQR_PRTHS subSearch RT_SQR_PRTHS # inSubqueryExpr - | EXISTS LT_SQR_PRTHS subSearch RT_SQR_PRTHS # existsSubqueryExpr - | cidrMatchFunctionCall # cidrFunctionCallExpr +caseFunctionCall + : CASE LT_PRTHS logicalExpression COMMA valueExpression (COMMA logicalExpression COMMA valueExpression)* (ELSE valueExpression)? RT_PRTHS ; - caseFunction - : CASE LT_PRTHS logicalExpression COMMA valueExpression (COMMA logicalExpression COMMA valueExpression)* (ELSE valueExpression)? RT_PRTHS - ; - relevanceExpression : singleFieldRelevanceFunction | multiFieldRelevanceFunction @@ -497,7 +753,7 @@ singleFieldRelevanceFunction // Field is a list of columns multiFieldRelevanceFunction - : multiFieldRelevanceFunctionName LT_PRTHS LT_SQR_PRTHS field = relevanceFieldAndWeight (COMMA field = relevanceFieldAndWeight)* RT_SQR_PRTHS COMMA query = relevanceQuery (COMMA relevanceArg)* RT_PRTHS + : multiFieldRelevanceFunctionName LT_PRTHS (LT_SQR_PRTHS field = relevanceFieldAndWeight (COMMA field = relevanceFieldAndWeight)* RT_SQR_PRTHS COMMA)? 
query = relevanceQuery (COMMA relevanceArg)* RT_PRTHS ; // tables @@ -507,16 +763,12 @@ tableSource ; tableFunction - : qualifiedName LT_PRTHS functionArgs RT_PRTHS + : qualifiedName LT_PRTHS namedFunctionArgs RT_PRTHS ; // fields fieldList - : fieldExpression (COMMA fieldExpression)* - ; - -wcFieldList - : wcFieldExpression (COMMA wcFieldExpression)* + : fieldExpression ((COMMA)? fieldExpression)* ; sortField @@ -525,8 +777,6 @@ sortField sortFieldExpression : fieldExpression - - // TODO #963: Implement 'num', 'str', and 'ip' sort syntax | AUTO LT_PRTHS fieldExpression RT_PRTHS | STR LT_PRTHS fieldExpression RT_PRTHS | IP LT_PRTHS fieldExpression RT_PRTHS @@ -541,6 +791,16 @@ wcFieldExpression : wcQualifiedName ; +selectFieldExpression + : wcQualifiedName + | STAR + ; + +renameFieldExpression + : wcQualifiedName + | STAR + ; + // functions evalFunctionCall : evalFunctionName LT_PRTHS functionArgs RT_PRTHS @@ -548,16 +808,7 @@ evalFunctionCall // cast function dataTypeFunctionCall - : CAST LT_PRTHS expression AS convertedDataType RT_PRTHS - ; - -// boolean functions -booleanFunctionCall - : conditionFunctionBase LT_PRTHS functionArgs RT_PRTHS - ; - -cidrMatchFunctionCall - : CIDRMATCH LT_PRTHS ipAddress = functionArg COMMA cidrBlock = functionArg RT_PRTHS + : CAST LT_PRTHS logicalExpression AS convertedDataType RT_PRTHS ; convertedDataType @@ -571,28 +822,48 @@ convertedDataType | typeName = FLOAT | typeName = STRING | typeName = BOOLEAN + | typeName = IP + | typeName = JSON ; evalFunctionName : mathematicalFunctionName | dateTimeFunctionName | textFunctionName - | conditionFunctionBase + | conditionFunctionName + | flowControlFunctionName | systemFunctionName | positionFunctionName - | coalesceFunctionName | cryptographicFunctionName | jsonFunctionName + | geoipFunctionName | collectionFunctionName - | lambdaFunctionName ; functionArgs : (functionArg (COMMA functionArg)*)? ; +namedFunctionArgs + : (namedFunctionArg (COMMA namedFunctionArg)*)? 
+ ; + functionArg - : (ident EQUAL)? valueExpression + : functionArgExpression + ; + +namedFunctionArg + : (ident EQUAL)? functionArgExpression + ; + +functionArgExpression + : lambda + | logicalExpression + ; + +lambda + : ident ARROW logicalExpression + | LT_PRTHS ident (COMMA ident)+ RT_PRTHS ARROW logicalExpression ; relevanceArg @@ -644,6 +915,8 @@ relevanceFieldAndWeight relevanceFieldWeight : integerLiteral | decimalLiteral + | doubleLiteral + | floatLiteral ; relevanceField @@ -662,6 +935,10 @@ relevanceArgValue mathematicalFunctionName : ABS + | PLUS_FUCTION + | MINUS_FUCTION + | STAR_FUNCTION + | DIVIDE_FUNCTION | CBRT | CEIL | CEILING @@ -669,37 +946,72 @@ mathematicalFunctionName | CRC32 | E | EXP + | EXPM1 | FLOOR | LN | LOG - | LOG10 - | LOG2 + | LOG_WITH_BASE | MOD + | MODULUS | PI | POW | POWER | RAND | ROUND | SIGN - | SIGNUM | SQRT | TRUNCATE + | RINT + | SIGNUM + | SUM + | AVG | trigonometricFunctionName ; +geoipFunctionName + : GEOIP + ; + +collectionFunctionName + : ARRAY + | ARRAY_LENGTH + | MVJOIN + | FORALL + | EXISTS + | FILTER + | TRANSFORM + | REDUCE + ; + + trigonometricFunctionName : ACOS | ASIN | ATAN | ATAN2 | COS + | COSH | COT | DEGREES | RADIANS | SIN + | SINH | TAN ; +jsonFunctionName + : JSON + | JSON_OBJECT + | JSON_ARRAY + | JSON_ARRAY_LENGTH + | JSON_EXTRACT + | JSON_KEYS + | JSON_SET + | JSON_DELETE + | JSON_APPEND + | JSON_EXTEND + ; + cryptographicFunctionName : MD5 | SHA1 @@ -714,7 +1026,6 @@ dateTimeFunctionName | CURRENT_DATE | CURRENT_TIME | CURRENT_TIMESTAMP - | CURRENT_TIMEZONE | CURTIME | DATE | DATEDIFF @@ -738,7 +1049,6 @@ dateTimeFunctionName | LOCALTIME | LOCALTIMESTAMP | MAKEDATE - | MAKE_DATE | MAKETIME | MICROSECOND | MINUTE @@ -774,16 +1084,9 @@ dateTimeFunctionName | WEEK_OF_YEAR | YEAR | YEARWEEK - | relativeTimeFunctionName ; -relativeTimeFunctionName - : RELATIVE_TIMESTAMP - | EARLIEST - | LATEST - ; - -getFormatFunction +getFormatFunctionCall : GET_FORMAT LT_PRTHS getFormatType COMMA functionArg RT_PRTHS 
; @@ -794,7 +1097,7 @@ getFormatType | TIMESTAMP ; -extractFunction +extractFunctionCall : EXTRACT LT_PRTHS datetimePart FROM functionArg RT_PRTHS ; @@ -829,7 +1132,7 @@ datetimePart | complexDateTimePart ; -timestampFunction +timestampFunctionCall : timestampFunctionName LT_PRTHS simpleDateTimePart COMMA firstArg = functionArg COMMA secondArg = functionArg RT_PRTHS ; @@ -839,19 +1142,26 @@ timestampFunctionName ; // condition function return boolean value -conditionFunctionBase +conditionFunctionName : LIKE - | IF | ISNULL | ISNOTNULL - | IFNULL - | NULLIF - | ISPRESENT + | CIDRMATCH + | REGEX_MATCH | JSON_VALID - | EARLIEST - | LATEST + | ISPRESENT | ISEMPTY | ISBLANK + | EARLIEST + | LATEST + ; + +// flow control function return non-boolean value +flowControlFunctionName + : IF + | IFNULL + | NULLIF + | COALESCE ; systemFunctionName @@ -876,75 +1186,23 @@ textFunctionName | LOCATE | REPLACE | REVERSE - | ISEMPTY - | ISBLANK - ; - -jsonFunctionName - : JSON - | JSON_OBJECT - | JSON_ARRAY - | JSON_ARRAY_LENGTH - | TO_JSON_STRING - | JSON_EXTRACT - | JSON_DELETE - | JSON_APPEND - | JSON_KEYS - | JSON_VALID - | JSON_EXTEND - | JSON_SET -// | JSON_ARRAY_ALL_MATCH -// | JSON_ARRAY_ANY_MATCH -// | JSON_ARRAY_FILTER -// | JSON_ARRAY_MAP -// | JSON_ARRAY_REDUCE ; -collectionFunctionName - : ARRAY - | ARRAY_LENGTH - ; - -lambdaFunctionName - : FORALL - | EXISTS - | FILTER - | TRANSFORM - | REDUCE - ; - positionFunctionName : POSITION ; -coalesceFunctionName - : COALESCE - ; - -geoIpPropertyList - : geoIpProperty (COMMA geoIpProperty)* - ; - -geoIpProperty - : COUNTRY_ISO_CODE - | COUNTRY_NAME - | CONTINENT_NAME - | REGION_ISO_CODE - | REGION_NAME - | CITY_NAME - | TIME_ZONE - | LOCATION - ; - // operators comparisonOperator : EQUAL + | DOUBLE_EQUAL | NOT_EQUAL | LESS | NOT_LESS | GREATER | NOT_GREATER | REGEXP + | LIKE ; singleFieldRelevanceFunctionName @@ -962,12 +1220,14 @@ multiFieldRelevanceFunctionName // literals and values literalValue - : stringLiteral + : 
intervalLiteral + | stringLiteral | integerLiteral | decimalLiteral + | doubleLiteral + | floatLiteral | booleanLiteral | datetimeLiteral //#datetime - | intervalLiteral ; intervalLiteral @@ -987,6 +1247,14 @@ decimalLiteral : (PLUS | MINUS)? DECIMAL_LITERAL ; +doubleLiteral + : (PLUS | MINUS)? DOUBLE_LITERAL + ; + +floatLiteral + : (PLUS | MINUS)? FLOAT_LITERAL + ; + booleanLiteral : TRUE | FALSE @@ -1052,6 +1320,20 @@ timespanUnit | MONTH | QUARTER | YEAR + | SEC + | SECS + | SECONDS + | MINS + | MINUTES + | HR + | HRS + | HOURS + | DAYS + | MON + | MONTHS + | US + | CS + | DS ; valueList @@ -1062,11 +1344,6 @@ qualifiedName : ident (DOT ident)* # identsAsQualifiedName ; -identifierSeq - : qualifiedName (COMMA qualifiedName)* # identsAsQualifiedNameSeq - | LT_PRTHS qualifiedName (COMMA qualifiedName)* RT_PRTHS # identsAsQualifiedNameSeq - ; - tableQualifiedName : tableIdent (DOT ident)* # identsAsTableQualifiedName ; @@ -1075,6 +1352,11 @@ wcQualifiedName : wildcard (DOT wildcard)* # identsAsWildcardQualifiedName ; +identifierSeq + : qualifiedName (COMMA qualifiedName)* # identsAsQualifiedNameSeq + | LT_PRTHS qualifiedName (COMMA qualifiedName)* RT_PRTHS # identsAsQualifiedNameSeq + ; + ident : (DOT)? 
ID | BACKTICK ident BACKTICK @@ -1094,40 +1376,49 @@ wildcard ; keywordsCanBeId + : searchableKeyWord + | IN + ; + +searchableKeyWord : D // OD SQL and ODBC special | timespanUnit | SPAN | evalFunctionName + | jsonFunctionName | relevanceArgName | intervalUnit - | dateTimeFunctionName - | textFunctionName - | jsonFunctionName - | mathematicalFunctionName - | positionFunctionName - | cryptographicFunctionName + | trendlineType | singleFieldRelevanceFunctionName | multiFieldRelevanceFunctionName | commandName - | comparisonOperator + | collectionFunctionName + | REGEX | explainMode - | correlationType - | geoIpProperty + | REGEXP // commands assist keywords - | GEOIP - | OVERRIDE + | CASE + | ELSE | ARROW - | IN + | BETWEEN + | EXISTS | SOURCE | INDEX + | A + | ASC | DESC | DATASOURCES | FROM | PATTERN | NEW_FIELD - | SCOPE - | MAPPING + | METHOD + | VARIABLE_COUNT_THRESHOLD + | FREQUENCY_THRESHOLD_PERCENTAGE + | MAX_SAMPLE_COUNT + | BUFFER_LIMIT | WITH + | REGEX + | PUNCT | USING | CAST | GET_FORMAT @@ -1135,8 +1426,12 @@ keywordsCanBeId | INTERVAL | PLUS | MINUS - | INCLUDEFIELDS - | NULLS + | OVERRIDE + // SORT FIELD KEYWORDS + | AUTO + | STR + | IP + | NUM // ARGUMENT KEYWORDS | KEEPEMPTY | CONSECUTIVE @@ -1144,6 +1439,7 @@ keywordsCanBeId | PARTITIONS | ALLNUM | DELIM + | BUCKET_NULLABLE | CENTROIDS | ITERATIONS | DISTANCE_TYPE @@ -1158,12 +1454,17 @@ keywordsCanBeId | TIME_ZONE | TRAINING_DATA_SIZE | ANOMALY_SCORE_THRESHOLD - // AGGREGATIONS + | COUNTFIELD + | SHOWCOUNT + | PATH + | INPUT + | OUTPUT + + // AGGREGATIONS AND WINDOW | statsFunctionName + | windowFunctionName | DISTINCT_COUNT | DISTINCT_COUNT_APPROX - | PERCENTILE - | PERCENTILE_APPROX | ESTDC | ESTDC_ERROR | MEAN @@ -1176,8 +1477,6 @@ keywordsCanBeId | VAR_SAMP | VAR_POP | TAKE - | FIRST - | LAST | LIST | VALUES | PER_DAY @@ -1197,12 +1496,7 @@ keywordsCanBeId | FULL | SEMI | ANTI - | BETWEEN - | CIDRMATCH - | trendlineType - // SORT FIELD KEYWORDS - | AUTO - | STR - | IP - | NUM + | LEFT_HINT + | 
RIGHT_HINT + | PERCENTILE_SHORTCUT ;