From 4c30c61dc973c82ee82c70a29428772dc0b8f22d Mon Sep 17 00:00:00 2001 From: Louis Chu Date: Mon, 17 Nov 2025 09:35:02 -0800 Subject: [PATCH 01/16] Add frequent used queries Signed-off-by: Louis Chu --- .../big5/queries/dedup_metrics_size_field.ppl | 22 +++++++++ .../parse_regex_with_cast_transformation.ppl | 29 ++++++++++++ ...t_engine_like_pattern_with_aggregation.ppl | 47 +++++++++++++++++++ .../script_engine_like_pattern_with_sort.ppl | 37 +++++++++++++++ 4 files changed, 135 insertions(+) create mode 100644 integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl create mode 100644 integ-test/src/test/resources/big5/queries/parse_regex_with_cast_transformation.ppl create mode 100644 integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl create mode 100644 integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl diff --git a/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl b/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl new file mode 100644 index 00000000000..cd019f7d9f9 --- /dev/null +++ b/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl @@ -0,0 +1,22 @@ +/* +{ + "name": "dedup_metrics_size_field", + "operation-type": "search", + "index": "{{index_name | default('custom-big5')}}", + "body": { + "query": { + "exists": { + "field": "metrics.size", + "boost": 1.0 + } + }, + "_source": { + "includes": ["agent", "process", "log", "message", "tags", "cloud", "input", "@timestamp", "ecs", "data_stream", "meta", "host", "metrics", "metrics.size", "aws", "event"], + "excludes": [] + } + } +} +*/ +source = custom-big5 +| dedup `metrics.size` +| sort - `@timestamp` \ No newline at end of file diff --git a/integ-test/src/test/resources/big5/queries/parse_regex_with_cast_transformation.ppl b/integ-test/src/test/resources/big5/queries/parse_regex_with_cast_transformation.ppl new file mode 100644 index 00000000000..93771ae3fce 
--- /dev/null +++ b/integ-test/src/test/resources/big5/queries/parse_regex_with_cast_transformation.ppl @@ -0,0 +1,29 @@ +/* +{ + "name": "parse_field_with_cast_transformation", + "operation-type": "search", + "index": "{{index_name | default('custom-big5')}}", + "body": { + "query": { + "match_all": {} + }, + "_source": { + "includes": ["aws.cloudwatch.log_stream", "@timestamp"], + "excludes": [] + }, + "sort": [ + { + "@timestamp": { + "order": "desc", + "missing": "_last" + } + } + ] + } +} +*/ +source = custom-big5 +| parse `aws.cloudwatch.log_stream` "eni-(?<eniid>[0-9]+)" +| eval eniid_num = cast(eniid as int) +| fields `aws.cloudwatch.log_stream`, eniid, eniid_num, `@timestamp` +| sort - `@timestamp` \ No newline at end of file diff --git a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl new file mode 100644 index 00000000000..a23c4dbd94c --- /dev/null +++ b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl @@ -0,0 +1,47 @@ +/* +{ + "name": "script_engine_like_pattern_with_aggregation", + "operation-type": "search", + "index": "{{index_name | default('custom-big5')}}", + "body": { + "query": { + "script": { + "script": { + "source": "{\"langType\":\"calcite\",\"script\":\"...\"}", + "lang": "opensearch_compounded_script", + "params": { + "utcTimestamp": "{{current_timestamp}}" + } + }, + "boost": 1.0 + } + }, + "_source": { + "includes": ["message", "metrics.size"], + "excludes": [] + }, + "aggregations": { + "composite_buckets": { + "composite": { + "size": 10000, + "sources": [ + { + "metrics.size": { + "terms": { + "field": "metrics.size", + "missing_bucket": true, + "missing_order": "first", + "order": "asc" + } + } + } + ] + } + } + } + } +} +*/ +source = custom-big5 +| where like(`message`, '%sshd%') +| stats count() by `metrics.size` \ No newline at end of file diff --git 
a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl new file mode 100644 index 00000000000..405061cb570 --- /dev/null +++ b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl @@ -0,0 +1,37 @@ +/* +{ + "name": "script_engine_like_pattern_with_sort", + "operation-type": "search", + "index": "{{index_name | default('big5')}}", + "body": { + "query": { + "script": { + "script": { + "source": "{\"langType\":\"calcite\",\"script\":\"...\"}", + "lang": "opensearch_compounded_script", + "params": { + "utcTimestamp": "{{current_timestamp}}" + } + }, + "boost": 1.0 + } + }, + "_source": { + "includes": ["agent", "process", "log", "message", "tags", "cloud", "input", "@timestamp", "ecs", "data_stream", "meta", "host", "metrics", "aws", "event"], + "excludes": [] + }, + "sort": [ + { + "@timestamp": { + "order": "desc", + "missing": "_last" + } + } + ] + } +} +*/ +source = custom-big5 +| where like(`message`, '%sshd%') +| sort - `@timestamp` +| head 10 \ No newline at end of file From 1297fa24fe44a4503cfdadd8911cdd04cef8ebe3 Mon Sep 17 00:00:00 2001 From: Louis Chu Date: Mon, 24 Nov 2025 06:44:25 -0800 Subject: [PATCH 02/16] Add new queries to CalcitePPLBig5IT Signed-off-by: Louis Chu --- .../sql/calcite/big5/CalcitePPLBig5IT.java | 30 +++++++++++++++++++ .../src/test/resources/big5/data/big5.json | 4 +++ .../big5/queries/dedup_metrics_size_field.ppl | 2 +- .../parse_regex_with_cast_transformation.ppl | 14 ++++----- ...t_engine_like_pattern_with_aggregation.ppl | 2 +- .../script_engine_like_pattern_with_sort.ppl | 2 +- .../calcite/dedup_metrics_size_field.yaml | 16 ++++++++++ .../parse_regex_with_cast_transformation.yaml | 14 +++++++++ 8 files changed, 74 insertions(+), 10 deletions(-) create mode 100644 integ-test/src/test/resources/expectedOutput/calcite/dedup_metrics_size_field.yaml create mode 100644 
integ-test/src/test/resources/expectedOutput/calcite/parse_regex_with_cast_transformation.yaml diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java index 665b3f0a874..ad54d83c8be 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java @@ -5,6 +5,8 @@ package org.opensearch.sql.calcite.big5; +import static org.opensearch.sql.util.MatcherUtils.assertYamlEqualsIgnoreId; + import java.io.IOException; import org.junit.FixMethodOrder; import org.junit.Test; @@ -42,4 +44,32 @@ public void coalesce_nonexistent_field_fallback() throws IOException { String ppl = sanitize(loadExpectedQuery("coalesce_nonexistent_field_fallback.ppl")); timing(summary, "coalesce_nonexistent_field_fallback", ppl); } + + @Test + public void dedup_metrics_size_field() throws IOException { + String ppl = sanitize(loadExpectedQuery("dedup_metrics_size_field.ppl")); + timing(summary, "dedup_metrics_size_field", ppl); + String expected = loadExpectedPlan("dedup_metrics_size_field.yaml"); + assertYamlEqualsIgnoreId(expected, explainQueryYaml(ppl)); + } + + @Test + public void parse_regex_with_cast_transformation() throws IOException { + String ppl = sanitize(loadExpectedQuery("parse_regex_with_cast_transformation.ppl")); + timing(summary, "parse_regex_with_cast_transformation", ppl); + String expected = loadExpectedPlan("parse_regex_with_cast_transformation.yaml"); + assertYamlEqualsIgnoreId(expected, explainQueryYaml(ppl)); + } + + @Test + public void script_engine_like_pattern_with_aggregation() throws IOException { + String ppl = sanitize(loadExpectedQuery("script_engine_like_pattern_with_aggregation.ppl")); + timing(summary, "script_engine_like_pattern_with_aggregation", ppl); + } + + @Test + public void script_engine_like_pattern_with_sort() throws IOException { 
+ String ppl = sanitize(loadExpectedQuery("script_engine_like_pattern_with_sort.ppl")); + timing(summary, "script_engine_like_pattern_with_sort", ppl); + } } diff --git a/integ-test/src/test/resources/big5/data/big5.json b/integ-test/src/test/resources/big5/data/big5.json index 30a6a81ab8e..c9a0dc07a14 100644 --- a/integ-test/src/test/resources/big5/data/big5.json +++ b/integ-test/src/test/resources/big5/data/big5.json @@ -1,2 +1,6 @@ {"index":{}} {"message":"2023-04-30T21:48:56.160Z Apr 30 21:48:56 ip-66-221-134-40 journal: donkey glazer fly shark whip servant thornfalcon","process":{"name":"journal"},"aws.cloudwatch":{"ingestion_time":"2023-04-30T21:48:56.160Z","log_group":"/var/log/messages","log_stream":"luckcrafter"},"tags":["preserve_original_event"],"meta":{"file":"2023-01-02/1682891301-gotext.ndjson.gz"},"cloud":{"region":"eu-central-1"},"@timestamp":"2023-01-02T22:02:34.000Z","input":{"type":"aws-cloudwatch"},"metrics":{"tmin":849,"size":1981},"log.file.path":"/var/log/messages/luckcrafter","event":{"id":"sunsetmark","dataset":"generic","ingested":"2023-07-20T03:36:30.223806Z"},"agent":{"id":"c315dc22-3ea6-44dc-8d56-fd02f675367b","name":"fancydancer","ephemeral_id":"c315dc22-3ea6-44dc-8d56-fd02f675367b","type":"filebeat","version":"8.8.0"}} +{"index":{}} +{"message":"2024-04-11T18:00:10.965Z Apr 11 18:00:10 ip-32-11-43-93 sshd: cloak bolt thorn hugger rib jackal wolverine shaker boar fighter taker 
boulderfox","process":{"name":"sshd"},"aws.cloudwatch":{"log_stream":"mirrorlighter","ingestion_time":"2024-04-11T18:00:10.965Z","log_group":"/var/log/messages"},"tags":["preserve_original_event"],"meta":{"file":"2024-04-11/1712851210-sshd.ndjson.gz"},"cloud":{"region":"ap-southeast-3"},"@timestamp":"2023-05-01T21:59:58.000Z","input":{"type":"aws-cloudwatch"},"metrics":{"size":3166,"tmin":1},"log.file.path":"/var/log/messages/mirrorlighter","event":{"id":"patternantler","ingested":"2024-04-11T17:39:10.965818973Z","dataset":"generic"},"agent":{"id":"c79a289f-6c16-4de2-a6c8-8ee5c84473d5","name":"brindlehugger","type":"filebeat","version":"8.8.0","ephemeral_id":"c79a289f-6c16-4de2-a6c8-8ee5c84473d5"}} +{"index":{}} +{"message":"2024-04-11T10:15:01.628Z Apr 11 10:15:01 ip-95-21-51-112 kernel: kicker stinger slave dolphin sparkox","process":{"name":"kernel"},"aws.cloudwatch":{"log_stream":"plumebard","ingestion_time":"2024-04-11T10:15:01.628Z","log_group":"/var/log/messages"},"tags":["preserve_original_event"],"meta":{"file":"2024-04-11/1712826901-kernel.ndjson.gz"},"cloud":{"region":"ap-south-1"},"@timestamp":"2023-03-01T22:31:11.000Z","input":{"type":"aws-cloudwatch"},"metrics":{"size":3993,"tmin":1},"log.file.path":"/var/log/messages/plumebard","event":{"id":"chipgambler","ingested":"2024-04-11T10:09:29.628941177Z","dataset":"generic"},"agent":{"id":"5f25fa16-6a99-489f-b1c5-f27c0627a459","name":"lemongrabber","type":"filebeat","version":"8.8.0","ephemeral_id":"5f25fa16-6a99-489f-b1c5-f27c0627a459"}} diff --git a/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl b/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl index cd019f7d9f9..aca5106807c 100644 --- a/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl +++ b/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl @@ -17,6 +17,6 @@ } } */ -source = custom-big5 +source = big5 | dedup `metrics.size` | sort - `@timestamp` \ No newline at 
end of file diff --git a/integ-test/src/test/resources/big5/queries/parse_regex_with_cast_transformation.ppl b/integ-test/src/test/resources/big5/queries/parse_regex_with_cast_transformation.ppl index 93771ae3fce..d8ca87e8e5c 100644 --- a/integ-test/src/test/resources/big5/queries/parse_regex_with_cast_transformation.ppl +++ b/integ-test/src/test/resources/big5/queries/parse_regex_with_cast_transformation.ppl @@ -1,14 +1,14 @@ /* { - "name": "parse_field_with_cast_transformation", + "name": "parse_regex_with_cast_transformation", "operation-type": "search", - "index": "{{index_name | default('custom-big5')}}", + "index": "{{index_name | default('big5')}}", "body": { "query": { "match_all": {} }, "_source": { - "includes": ["aws.cloudwatch.log_stream", "@timestamp"], + "includes": ["log.file.path", "@timestamp"], "excludes": [] }, "sort": [ @@ -22,8 +22,8 @@ } } */ -source = custom-big5 -| parse `aws.cloudwatch.log_stream` "eni-(?<eniid>[0-9]+)" -| eval eniid_num = cast(eniid as int) -| fields `aws.cloudwatch.log_stream`, eniid, eniid_num, `@timestamp` +source = big5 +| parse `log.file.path` '/var/log/(?<logType>\\w+)/(?<filename>\\w+)' +| eval filename_len = length(filename) +| fields `log.file.path`, logType, filename, filename_len, `@timestamp` | sort - `@timestamp` \ No newline at end of file diff --git a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl index a23c4dbd94c..83003fc5602 100644 --- a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl +++ b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl @@ -42,6 +42,6 @@ } } */ -source = custom-big5 +source = big5 | where like(`message`, '%sshd%') | stats count() by `metrics.size` \ No newline at end of file diff --git a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl 
b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl index 405061cb570..81abe8f2b41 100644 --- a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl +++ b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl @@ -31,7 +31,7 @@ } } */ -source = custom-big5 +source = big5 | where like(`message`, '%sshd%') | sort - `@timestamp` | head 10 \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/dedup_metrics_size_field.yaml b/integ-test/src/test/resources/expectedOutput/calcite/dedup_metrics_size_field.yaml new file mode 100644 index 00000000000..3c948626ef1 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/dedup_metrics_size_field.yaml @@ -0,0 +1,16 @@ +calcite: + logical: | + LogicalSystemLimit(sort0=[$7], dir0=[DESC-nulls-last], fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(agent=[$0], process=[$6], log=[$8], message=[$11], tags=[$12], cloud=[$13], input=[$15], @timestamp=[$17], ecs=[$18], data_stream=[$20], meta=[$24], host=[$26], metrics=[$27], aws=[$30], event=[$35]) + LogicalSort(sort0=[$17], dir0=[DESC-nulls-last]) + LogicalProject(agent=[$0], agent.ephemeral_id=[$1], agent.id=[$2], agent.name=[$3], agent.type=[$4], agent.version=[$5], process=[$6], process.name=[$7], log=[$8], log.file=[$9], log.file.path=[$10], message=[$11], tags=[$12], cloud=[$13], cloud.region=[$14], input=[$15], input.type=[$16], @timestamp=[$17], ecs=[$18], ecs.version=[$19], data_stream=[$20], data_stream.dataset=[$21], data_stream.namespace=[$22], data_stream.type=[$23], meta=[$24], meta.file=[$25], host=[$26], metrics=[$27], metrics.size=[$28], metrics.tmin=[$29], aws=[$30], aws.cloudwatch=[$31], aws.cloudwatch.ingestion_time=[$32], aws.cloudwatch.log_group=[$33], aws.cloudwatch.log_stream=[$34], event=[$35], event.dataset=[$36], event.id=[$37], event.ingested=[$38], _id=[$39], _index=[$40], _score=[$41], _maxscore=[$42], 
_sort=[$43], _routing=[$44]) + LogicalFilter(condition=[<=($45, 1)]) + LogicalProject(agent=[$0], agent.ephemeral_id=[$1], agent.id=[$2], agent.name=[$3], agent.type=[$4], agent.version=[$5], process=[$6], process.name=[$7], log=[$8], log.file=[$9], log.file.path=[$10], message=[$11], tags=[$12], cloud=[$13], cloud.region=[$14], input=[$15], input.type=[$16], @timestamp=[$17], ecs=[$18], ecs.version=[$19], data_stream=[$20], data_stream.dataset=[$21], data_stream.namespace=[$22], data_stream.type=[$23], meta=[$24], meta.file=[$25], host=[$26], metrics=[$27], metrics.size=[$28], metrics.tmin=[$29], aws=[$30], aws.cloudwatch=[$31], aws.cloudwatch.ingestion_time=[$32], aws.cloudwatch.log_group=[$33], aws.cloudwatch.log_stream=[$34], event=[$35], event.dataset=[$36], event.id=[$37], event.ingested=[$38], _id=[$39], _index=[$40], _score=[$41], _maxscore=[$42], _sort=[$43], _routing=[$44], _row_number_dedup_=[ROW_NUMBER() OVER (PARTITION BY $28 ORDER BY $28)]) + LogicalFilter(condition=[IS NOT NULL($28)]) + CalciteLogicalIndexScan(table=[[OpenSearch, big5]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$7], dir0=[DESC-nulls-last]) + EnumerableCalc(expr#0..16=[{inputs}], expr#17=[1], expr#18=[<=($t16, $t17)], proj#0..12=[{exprs}], aws=[$t14], event=[$t15], $condition=[$t18]) + EnumerableWindow(window#0=[window(partition {13} order by [13] rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, big5]], PushDownContext=[[PROJECT->[agent, process, log, message, tags, cloud, input, @timestamp, ecs, data_stream, meta, host, metrics, metrics.size, aws, event], FILTER->IS NOT NULL($13)], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","query":{"exists":{"field":"metrics.size","boost":1.0}},"_source":{"includes":["agent","process","log","message","tags","cloud","input","@timestamp","ecs","data_stream","meta","host","metrics","metrics.size","aws","event"],"excludes":[]}}, 
requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) diff --git a/integ-test/src/test/resources/expectedOutput/calcite/parse_regex_with_cast_transformation.yaml b/integ-test/src/test/resources/expectedOutput/calcite/parse_regex_with_cast_transformation.yaml new file mode 100644 index 00000000000..dd3e8bc82f6 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/parse_regex_with_cast_transformation.yaml @@ -0,0 +1,14 @@ +calcite: + logical: | + LogicalSystemLimit(sort0=[$4], dir0=[DESC-nulls-last], fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalSort(sort0=[$4], dir0=[DESC-nulls-last]) + LogicalProject(log.file.path=[$10], logType=[ITEM(PARSE($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)':VARCHAR, 'regex':VARCHAR), 'logType':VARCHAR)], filename=[ITEM(PARSE($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)':VARCHAR, 'regex':VARCHAR), 'filename':VARCHAR)], filename_len=[CHAR_LENGTH(ITEM(PARSE($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)':VARCHAR, 'regex':VARCHAR), 'filename':VARCHAR))], @timestamp=[$17]) + CalciteLogicalIndexScan(table=[[OpenSearch, big5]]) + physical: | + EnumerableCalc(expr#0..1=[{inputs}], expr#2=['/var/log/(?<logType>\w+)/(?<filename>\w+)':VARCHAR], expr#3=['regex':VARCHAR], expr#4=[PARSE($t0, $t2, $t3)], expr#5=['logType':VARCHAR], expr#6=[ITEM($t4, $t5)], expr#7=['filename':VARCHAR], expr#8=[ITEM($t4, $t7)], expr#9=[CHAR_LENGTH($t8)], log.file.path=[$t0], $f1=[$t6], $f2=[$t8], $f3=[$t9], @timestamp=[$t1]) + CalciteEnumerableIndexScan(table=[[OpenSearch, big5]], PushDownContext=[[PROJECT->[log.file.path, @timestamp], SORT->[{ + "@timestamp" : { + "order" : "desc", + "missing" : "_last" + } + }], LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","_source":{"includes":["log.file.path","@timestamp"],"excludes":[]},"sort":[{"@timestamp":{"order":"desc","missing":"_last"}}]}, requestedTotalSize=10000, pageSize=null, startFrom=0)]) From 1f006db33e60d655ef202af9d75c7bf22b6a99d5 Mon Sep 17 00:00:00 2001 From: Aaron Alvarez Date: Thu, 18 Dec 2025 14:24:48 
-0800 Subject: [PATCH 03/16] Add frequently used Big5 PPL queries Signed-off-by: Aaron Alvarez --- .../org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java | 8 ++++---- .../resources/big5/queries/dedup_metrics_size_field.ppl | 4 ++-- ...st_transformation.ppl => rex_regex_transformation.ppl} | 8 ++++---- .../script_engine_like_pattern_with_aggregation.ppl | 2 +- .../big5/queries/script_engine_like_pattern_with_sort.ppl | 3 ++- 5 files changed, 13 insertions(+), 12 deletions(-) rename integ-test/src/test/resources/big5/queries/{parse_regex_with_cast_transformation.ppl => rex_regex_transformation.ppl} (66%) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java index ad54d83c8be..d06167574c4 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java @@ -54,10 +54,10 @@ public void dedup_metrics_size_field() throws IOException { } @Test - public void parse_regex_with_cast_transformation() throws IOException { - String ppl = sanitize(loadExpectedQuery("parse_regex_with_cast_transformation.ppl")); - timing(summary, "parse_regex_with_cast_transformation", ppl); - String expected = loadExpectedPlan("parse_regex_with_cast_transformation.yaml"); + public void rex_regex_transformation() throws IOException { + String ppl = sanitize(loadExpectedQuery("rex_regex_transformation.ppl")); + timing(summary, "rex_regex_transformation", ppl); + String expected = loadExpectedPlan("rex_regex_transformation.yaml"); assertYamlEqualsIgnoreId(expected, explainQueryYaml(ppl)); } diff --git a/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl b/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl index aca5106807c..f59638eafbc 100644 --- a/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl +++ 
b/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl @@ -18,5 +18,5 @@ } */ source = big5 -| dedup `metrics.size` -| sort - `@timestamp` \ No newline at end of file +| dedup metrics.size +| sort - @timestamp \ No newline at end of file diff --git a/integ-test/src/test/resources/big5/queries/parse_regex_with_cast_transformation.ppl b/integ-test/src/test/resources/big5/queries/rex_regex_transformation.ppl similarity index 66% rename from integ-test/src/test/resources/big5/queries/parse_regex_with_cast_transformation.ppl rename to integ-test/src/test/resources/big5/queries/rex_regex_transformation.ppl index d8ca87e8e5c..aac23a494ac 100644 --- a/integ-test/src/test/resources/big5/queries/parse_regex_with_cast_transformation.ppl +++ b/integ-test/src/test/resources/big5/queries/rex_regex_transformation.ppl @@ -1,6 +1,6 @@ /* { - "name": "parse_regex_with_cast_transformation", + "name": "rex_regex_transformation", "operation-type": "search", "index": "{{index_name | default('big5')}}", "body": { @@ -23,7 +23,7 @@ } */ source = big5 -| parse `log.file.path` '/var/log/(?<logType>\\w+)/(?<filename>\\w+)' +| rex field=log.file.path '/var/log/(?<logType>\\w+)/(?<filename>\\w+)' | eval filename_len = length(filename) -| fields `log.file.path`, logType, filename, filename_len, `@timestamp` -| sort - `@timestamp` \ No newline at end of file +| fields log.file.path, logType, filename, filename_len, @timestamp +| sort - @timestamp \ No newline at end of file diff --git a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl index 83003fc5602..4af3f980e15 100644 --- a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl +++ b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl @@ -44,4 +44,4 @@ */ source = big5 | where like(`message`, '%sshd%') -| stats count() by `metrics.size` \ No newline at 
end of file +| stats count() by metrics.size \ No newline at end of file diff --git a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl index 81abe8f2b41..b6ac4a929de 100644 --- a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl +++ b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl @@ -16,6 +16,7 @@ "boost": 1.0 } }, + "size": 10, "_source": { "includes": ["agent", "process", "log", "message", "tags", "cloud", "input", "@timestamp", "ecs", "data_stream", "meta", "host", "metrics", "aws", "event"], "excludes": [] @@ -33,5 +34,5 @@ */ source = big5 | where like(`message`, '%sshd%') -| sort - `@timestamp` +| sort - @timestamp | head 10 \ No newline at end of file From 5129270a7bddd52a82959adc491cf80c3b3503c6 Mon Sep 17 00:00:00 2001 From: Aaron Alvarez <900908alvarezaaron@gmail.com> Date: Thu, 18 Dec 2025 14:42:56 -0800 Subject: [PATCH 04/16] Update integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Signed-off-by: Aaron Alvarez <900908alvarezaaron@gmail.com> --- .../org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java index d06167574c4..e7c8f411bf7 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java @@ -67,6 +67,10 @@ public void script_engine_like_pattern_with_aggregation() throws IOException { timing(summary, "script_engine_like_pattern_with_aggregation", ppl); } + /** + * Tests LIKE pattern matching with sorting and result 
limiting. + * Validates filtering by message content with timestamp ordering. + */ @Test public void script_engine_like_pattern_with_sort() throws IOException { String ppl = sanitize(loadExpectedQuery("script_engine_like_pattern_with_sort.ppl")); From 48d316a618f88d92dd43e615373c7010780a88bf Mon Sep 17 00:00:00 2001 From: Aaron Alvarez <900908alvarezaaron@gmail.com> Date: Thu, 18 Dec 2025 14:43:17 -0800 Subject: [PATCH 05/16] Update integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Signed-off-by: Aaron Alvarez <900908alvarezaaron@gmail.com> --- .../org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java index e7c8f411bf7..792ac9e4fe4 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java @@ -61,6 +61,10 @@ public void rex_regex_transformation() throws IOException { assertYamlEqualsIgnoreId(expected, explainQueryYaml(ppl)); } + /** + * Tests LIKE pattern matching with aggregation using script engine. + * Validates filtering by message content and grouping results. 
+ */ @Test public void script_engine_like_pattern_with_aggregation() throws IOException { String ppl = sanitize(loadExpectedQuery("script_engine_like_pattern_with_aggregation.ppl")); From c49e7e4b5b8a246ceb2787884b7ab95e6c76b16d Mon Sep 17 00:00:00 2001 From: Aaron Alvarez <900908alvarezaaron@gmail.com> Date: Thu, 18 Dec 2025 14:43:38 -0800 Subject: [PATCH 06/16] Update integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Signed-off-by: Aaron Alvarez <900908alvarezaaron@gmail.com> --- .../org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java index 792ac9e4fe4..28654a79aa1 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java @@ -45,6 +45,10 @@ public void coalesce_nonexistent_field_fallback() throws IOException { timing(summary, "coalesce_nonexistent_field_fallback", ppl); } + /** + * Tests deduplication by metrics.size field with sorting by timestamp. + * Validates that the Calcite plan correctly handles dedup operations. 
+ */ @Test public void dedup_metrics_size_field() throws IOException { String ppl = sanitize(loadExpectedQuery("dedup_metrics_size_field.ppl")); From d052ba334fdac002457b304274bde8de7441e818 Mon Sep 17 00:00:00 2001 From: Aaron Alvarez <900908alvarezaaron@gmail.com> Date: Thu, 18 Dec 2025 14:44:12 -0800 Subject: [PATCH 07/16] Update integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Signed-off-by: Aaron Alvarez <900908alvarezaaron@gmail.com> --- .../org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java index 28654a79aa1..5156106e712 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java @@ -57,6 +57,10 @@ public void dedup_metrics_size_field() throws IOException { assertYamlEqualsIgnoreId(expected, explainQueryYaml(ppl)); } + /** + * Tests regex-based field extraction and transformation using rex command. + * Validates that the Calcite plan correctly handles regex patterns. 
+ */ @Test public void rex_regex_transformation() throws IOException { String ppl = sanitize(loadExpectedQuery("rex_regex_transformation.ppl")); From 08477f714c3c4fec2d1b4979e22599f3cce1ba25 Mon Sep 17 00:00:00 2001 From: Aaron Alvarez Date: Thu, 18 Dec 2025 14:45:53 -0800 Subject: [PATCH 08/16] Addressing naming difference issues Signed-off-by: Aaron Alvarez --- .../test/resources/big5/queries/dedup_metrics_size_field.ppl | 2 +- ...h_cast_transformation.yaml => rex_regex_transformation.yaml} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename integ-test/src/test/resources/expectedOutput/calcite/{parse_regex_with_cast_transformation.yaml => rex_regex_transformation.yaml} (100%) diff --git a/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl b/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl index f59638eafbc..bb55d3a06ed 100644 --- a/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl +++ b/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl @@ -2,7 +2,7 @@ { "name": "dedup_metrics_size_field", "operation-type": "search", - "index": "{{index_name | default('custom-big5')}}", + "index": "{{index_name | default('big5')}}", "body": { "query": { "exists": { diff --git a/integ-test/src/test/resources/expectedOutput/calcite/parse_regex_with_cast_transformation.yaml b/integ-test/src/test/resources/expectedOutput/calcite/rex_regex_transformation.yaml similarity index 100% rename from integ-test/src/test/resources/expectedOutput/calcite/parse_regex_with_cast_transformation.yaml rename to integ-test/src/test/resources/expectedOutput/calcite/rex_regex_transformation.yaml From 237478a4c54690cf04802209c6f9de3daebc95d0 Mon Sep 17 00:00:00 2001 From: Aaron Alvarez Date: Fri, 19 Dec 2025 09:57:20 -0800 Subject: [PATCH 09/16] Fixing formatting issues Signed-off-by: Aaron Alvarez --- .../sql/calcite/big5/CalcitePPLBig5IT.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) 
diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java index 5156106e712..a8b981ba1b8 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java @@ -46,8 +46,8 @@ public void coalesce_nonexistent_field_fallback() throws IOException { } /** - * Tests deduplication by metrics.size field with sorting by timestamp. - * Validates that the Calcite plan correctly handles dedup operations. + * Tests deduplication by metrics.size field with sorting by timestamp. Validates that the Calcite + * plan correctly handles dedup operations. */ @Test public void dedup_metrics_size_field() throws IOException { @@ -58,8 +58,8 @@ public void dedup_metrics_size_field() throws IOException { } /** - * Tests regex-based field extraction and transformation using rex command. - * Validates that the Calcite plan correctly handles regex patterns. + * Tests regex-based field extraction and transformation using rex command. Validates that the + * Calcite plan correctly handles regex patterns. */ @Test public void rex_regex_transformation() throws IOException { @@ -70,8 +70,8 @@ public void rex_regex_transformation() throws IOException { } /** - * Tests LIKE pattern matching with aggregation using script engine. - * Validates filtering by message content and grouping results. + * Tests LIKE pattern matching with aggregation using script engine. Validates filtering by + * message content and grouping results. */ @Test public void script_engine_like_pattern_with_aggregation() throws IOException { @@ -80,8 +80,8 @@ public void script_engine_like_pattern_with_aggregation() throws IOException { } /** - * Tests LIKE pattern matching with sorting and result limiting. - * Validates filtering by message content with timestamp ordering. 
+ * Tests LIKE pattern matching with sorting and result limiting. Validates filtering by message + * content with timestamp ordering. */ @Test public void script_engine_like_pattern_with_sort() throws IOException { From 9cc6f75cc7b77b11e10806e4c03d49dc5e096f44 Mon Sep 17 00:00:00 2001 From: Aaron Alvarez Date: Fri, 19 Dec 2025 17:20:15 -0800 Subject: [PATCH 10/16] Fixing integration tests Signed-off-by: Aaron Alvarez --- .../org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java | 5 +---- .../sql/calcite/remote/CalcitePPLAggregationIT.java | 6 +++++- .../remote/CalcitePPLConditionBuiltinFunctionIT.java | 2 +- .../expectedOutput/calcite/rex_regex_transformation.yaml | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java index a8b981ba1b8..b4e3c12fc56 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java @@ -45,10 +45,7 @@ public void coalesce_nonexistent_field_fallback() throws IOException { timing(summary, "coalesce_nonexistent_field_fallback", ppl); } - /** - * Tests deduplication by metrics.size field with sorting by timestamp. Validates that the Calcite - * plan correctly handles dedup operations. - */ + /** Tests deduplication by metrics.size field with sorting by timestamp. 
*/ @Test public void dedup_metrics_size_field() throws IOException { String ppl = sanitize(loadExpectedQuery("dedup_metrics_size_field.ppl")); diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLAggregationIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLAggregationIT.java index 9710d2f4415..6e9b0c7b6d7 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLAggregationIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLAggregationIT.java @@ -738,7 +738,11 @@ public void testCountBySpanForCustomFormats() throws IOException { public void testSpanByImplicitTimestamp() throws IOException { JSONObject result = executeQuery("source=big5 | stats count() by span(1d) as span"); verifySchema(result, schema("count()", "bigint"), schema("span", "timestamp")); - verifyDataRows(result, rows(1, "2023-01-02 00:00:00")); + verifyDataRows( + result, + rows(1, "2023-01-02 00:00:00"), + rows(1, "2023-03-01 00:00:00"), + rows(1, "2023-05-01 00:00:00")); Throwable t = assertThrowsWithReplace( diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLConditionBuiltinFunctionIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLConditionBuiltinFunctionIT.java index f7c81e797df..ad132f3eb7e 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLConditionBuiltinFunctionIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLConditionBuiltinFunctionIT.java @@ -96,7 +96,7 @@ public void testIsNotNull() throws IOException { public void testIsNotNullWithStruct() throws IOException { JSONObject actual = executeQuery("source=big5 | where isnotnull(aws) | fields aws"); verifySchema(actual, schema("aws", "struct")); - verifyNumOfRows(actual, 1); + verifyNumOfRows(actual, 3); } @Test diff --git a/integ-test/src/test/resources/expectedOutput/calcite/rex_regex_transformation.yaml 
b/integ-test/src/test/resources/expectedOutput/calcite/rex_regex_transformation.yaml index dd3e8bc82f6..bfd0a77d685 100644 --- a/integ-test/src/test/resources/expectedOutput/calcite/rex_regex_transformation.yaml +++ b/integ-test/src/test/resources/expectedOutput/calcite/rex_regex_transformation.yaml @@ -2,10 +2,10 @@ calcite: logical: | LogicalSystemLimit(sort0=[$4], dir0=[DESC-nulls-last], fetch=[10000], type=[QUERY_SIZE_LIMIT]) LogicalSort(sort0=[$4], dir0=[DESC-nulls-last]) - LogicalProject(log.file.path=[$10], logType=[ITEM(PARSE($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)':VARCHAR, 'regex':VARCHAR), 'logType':VARCHAR)], filename=[ITEM(PARSE($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)':VARCHAR, 'regex':VARCHAR), 'filename':VARCHAR)], filename_len=[CHAR_LENGTH(ITEM(PARSE($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)':VARCHAR, 'regex':VARCHAR), 'filename':VARCHAR))], @timestamp=[$17]) + LogicalProject(log.file.path=[$10], logType=[REX_EXTRACT($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)', 'logType')], filename=[REX_EXTRACT($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)', 'filename')], filename_len=[CHAR_LENGTH(REX_EXTRACT($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)', 'filename'))], @timestamp=[$17]) CalciteLogicalIndexScan(table=[[OpenSearch, big5]]) physical: | - EnumerableCalc(expr#0..1=[{inputs}], expr#2=['/var/log/(?<logType>\w+)/(?<filename>\w+)':VARCHAR], expr#3=['regex':VARCHAR], expr#4=[PARSE($t0, $t2, $t3)], expr#5=['logType':VARCHAR], expr#6=[ITEM($t4, $t5)], expr#7=['filename':VARCHAR], expr#8=[ITEM($t4, $t7)], expr#9=[CHAR_LENGTH($t8)], log.file.path=[$t0], $f1=[$t6], $f2=[$t8], $f3=[$t9], @timestamp=[$t1]) + EnumerableCalc(expr#0..1=[{inputs}], expr#2=['/var/log/(?<logType>\w+)/(?<filename>\w+)'], expr#3=['logType'], expr#4=[REX_EXTRACT($t0, $t2, $t3)], expr#5=['filename'], expr#6=[REX_EXTRACT($t0, $t2, $t5)], expr#7=[CHAR_LENGTH($t6)], log.file.path=[$t0], $f1=[$t4], $f2=[$t6], $f3=[$t7], @timestamp=[$t1]) CalciteEnumerableIndexScan(table=[[OpenSearch, big5]], PushDownContext=[[PROJECT->[log.file.path, @timestamp], SORT->[{ "@timestamp" : { "order" : "desc", From
edef39cc0431918ad61a05fbdbed07f197be8ad3 Mon Sep 17 00:00:00 2001 From: Aaron Alvarez Date: Tue, 23 Dec 2025 12:43:52 -0800 Subject: [PATCH 11/16] Removing dedup Signed-off-by: Aaron Alvarez --- .../sql/calcite/big5/CalcitePPLBig5IT.java | 11 +--------- .../big5/queries/dedup_metrics_size_field.ppl | 22 ------------------- .../{ => big5}/rex_regex_transformation.yaml | 0 .../calcite/dedup_metrics_size_field.yaml | 16 -------------- 4 files changed, 1 insertion(+), 48 deletions(-) delete mode 100644 integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl rename integ-test/src/test/resources/expectedOutput/calcite/{ => big5}/rex_regex_transformation.yaml (100%) delete mode 100644 integ-test/src/test/resources/expectedOutput/calcite/dedup_metrics_size_field.yaml diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java index b4e3c12fc56..dfd9e83925d 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java @@ -45,15 +45,6 @@ public void coalesce_nonexistent_field_fallback() throws IOException { timing(summary, "coalesce_nonexistent_field_fallback", ppl); } - /** Tests deduplication by metrics.size field with sorting by timestamp. */ - @Test - public void dedup_metrics_size_field() throws IOException { - String ppl = sanitize(loadExpectedQuery("dedup_metrics_size_field.ppl")); - timing(summary, "dedup_metrics_size_field", ppl); - String expected = loadExpectedPlan("dedup_metrics_size_field.yaml"); - assertYamlEqualsIgnoreId(expected, explainQueryYaml(ppl)); - } - /** * Tests regex-based field extraction and transformation using rex command. Validates that the * Calcite plan correctly handles regex patterns. 
@@ -62,7 +53,7 @@ public void dedup_metrics_size_field() throws IOException { public void rex_regex_transformation() throws IOException { String ppl = sanitize(loadExpectedQuery("rex_regex_transformation.ppl")); timing(summary, "rex_regex_transformation", ppl); - String expected = loadExpectedPlan("rex_regex_transformation.yaml"); + String expected = loadExpectedPlan("big5/rex_regex_transformation.yaml"); assertYamlEqualsIgnoreId(expected, explainQueryYaml(ppl)); } diff --git a/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl b/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl deleted file mode 100644 index bb55d3a06ed..00000000000 --- a/integ-test/src/test/resources/big5/queries/dedup_metrics_size_field.ppl +++ /dev/null @@ -1,22 +0,0 @@ -/* -{ - "name": "dedup_metrics_size_field", - "operation-type": "search", - "index": "{{index_name | default('big5')}}", - "body": { - "query": { - "exists": { - "field": "metrics.size", - "boost": 1.0 - } - }, - "_source": { - "includes": ["agent", "process", "log", "message", "tags", "cloud", "input", "@timestamp", "ecs", "data_stream", "meta", "host", "metrics", "metrics.size", "aws", "event"], - "excludes": [] - } - } -} -*/ -source = big5 -| dedup metrics.size -| sort - @timestamp \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/rex_regex_transformation.yaml b/integ-test/src/test/resources/expectedOutput/calcite/big5/rex_regex_transformation.yaml similarity index 100% rename from integ-test/src/test/resources/expectedOutput/calcite/rex_regex_transformation.yaml rename to integ-test/src/test/resources/expectedOutput/calcite/big5/rex_regex_transformation.yaml diff --git a/integ-test/src/test/resources/expectedOutput/calcite/dedup_metrics_size_field.yaml b/integ-test/src/test/resources/expectedOutput/calcite/dedup_metrics_size_field.yaml deleted file mode 100644 index 3c948626ef1..00000000000 --- 
a/integ-test/src/test/resources/expectedOutput/calcite/dedup_metrics_size_field.yaml +++ /dev/null @@ -1,16 +0,0 @@ -calcite: - logical: | - LogicalSystemLimit(sort0=[$7], dir0=[DESC-nulls-last], fetch=[10000], type=[QUERY_SIZE_LIMIT]) - LogicalProject(agent=[$0], process=[$6], log=[$8], message=[$11], tags=[$12], cloud=[$13], input=[$15], @timestamp=[$17], ecs=[$18], data_stream=[$20], meta=[$24], host=[$26], metrics=[$27], aws=[$30], event=[$35]) - LogicalSort(sort0=[$17], dir0=[DESC-nulls-last]) - LogicalProject(agent=[$0], agent.ephemeral_id=[$1], agent.id=[$2], agent.name=[$3], agent.type=[$4], agent.version=[$5], process=[$6], process.name=[$7], log=[$8], log.file=[$9], log.file.path=[$10], message=[$11], tags=[$12], cloud=[$13], cloud.region=[$14], input=[$15], input.type=[$16], @timestamp=[$17], ecs=[$18], ecs.version=[$19], data_stream=[$20], data_stream.dataset=[$21], data_stream.namespace=[$22], data_stream.type=[$23], meta=[$24], meta.file=[$25], host=[$26], metrics=[$27], metrics.size=[$28], metrics.tmin=[$29], aws=[$30], aws.cloudwatch=[$31], aws.cloudwatch.ingestion_time=[$32], aws.cloudwatch.log_group=[$33], aws.cloudwatch.log_stream=[$34], event=[$35], event.dataset=[$36], event.id=[$37], event.ingested=[$38], _id=[$39], _index=[$40], _score=[$41], _maxscore=[$42], _sort=[$43], _routing=[$44]) - LogicalFilter(condition=[<=($45, 1)]) - LogicalProject(agent=[$0], agent.ephemeral_id=[$1], agent.id=[$2], agent.name=[$3], agent.type=[$4], agent.version=[$5], process=[$6], process.name=[$7], log=[$8], log.file=[$9], log.file.path=[$10], message=[$11], tags=[$12], cloud=[$13], cloud.region=[$14], input=[$15], input.type=[$16], @timestamp=[$17], ecs=[$18], ecs.version=[$19], data_stream=[$20], data_stream.dataset=[$21], data_stream.namespace=[$22], data_stream.type=[$23], meta=[$24], meta.file=[$25], host=[$26], metrics=[$27], metrics.size=[$28], metrics.tmin=[$29], aws=[$30], aws.cloudwatch=[$31], aws.cloudwatch.ingestion_time=[$32], 
aws.cloudwatch.log_group=[$33], aws.cloudwatch.log_stream=[$34], event=[$35], event.dataset=[$36], event.id=[$37], event.ingested=[$38], _id=[$39], _index=[$40], _score=[$41], _maxscore=[$42], _sort=[$43], _routing=[$44], _row_number_dedup_=[ROW_NUMBER() OVER (PARTITION BY $28 ORDER BY $28)]) - LogicalFilter(condition=[IS NOT NULL($28)]) - CalciteLogicalIndexScan(table=[[OpenSearch, big5]]) - physical: | - EnumerableLimit(fetch=[10000]) - EnumerableSort(sort0=[$7], dir0=[DESC-nulls-last]) - EnumerableCalc(expr#0..16=[{inputs}], expr#17=[1], expr#18=[<=($t16, $t17)], proj#0..12=[{exprs}], aws=[$t14], event=[$t15], $condition=[$t18]) - EnumerableWindow(window#0=[window(partition {13} order by [13] rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) - CalciteEnumerableIndexScan(table=[[OpenSearch, big5]], PushDownContext=[[PROJECT->[agent, process, log, message, tags, cloud, input, @timestamp, ecs, data_stream, meta, host, metrics, metrics.size, aws, event], FILTER->IS NOT NULL($13)], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","query":{"exists":{"field":"metrics.size","boost":1.0}},"_source":{"includes":["agent","process","log","message","tags","cloud","input","@timestamp","ecs","data_stream","meta","host","metrics","metrics.size","aws","event"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) From 8dbb518b462494f9d2c88489a14b556e7f2da060 Mon Sep 17 00:00:00 2001 From: Aaron Alvarez Date: Wed, 24 Dec 2025 09:55:51 -0800 Subject: [PATCH 12/16] Adding comment to rex ppl file to explain what the query does Signed-off-by: Aaron Alvarez --- .../src/test/resources/big5/queries/rex_regex_transformation.ppl | 1 + 1 file changed, 1 insertion(+) diff --git a/integ-test/src/test/resources/big5/queries/rex_regex_transformation.ppl b/integ-test/src/test/resources/big5/queries/rex_regex_transformation.ppl index aac23a494ac..27a6b2cbf58 100644 --- 
a/integ-test/src/test/resources/big5/queries/rex_regex_transformation.ppl +++ b/integ-test/src/test/resources/big5/queries/rex_regex_transformation.ppl @@ -22,6 +22,7 @@ } } */ +// Extract log type and filename from file paths, calculate filename length, and sort by timestamp source = big5 | rex field=log.file.path '/var/log/(?<logType>\\w+)/(?<filename>\\w+)' | eval filename_len = length(filename) From 0bf2fb8ba38f2c7f6eee66a75e4efda1a720d150 Mon Sep 17 00:00:00 2001 From: Aaron Alvarez Date: Wed, 24 Dec 2025 11:06:38 -0800 Subject: [PATCH 13/16] Adding comments to ppl queries to explain what they do Signed-off-by: Aaron Alvarez --- .../org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java | 2 +- .../test/resources/big5/queries/rex_regex_transformation.ppl | 4 +++- .../queries/script_engine_like_pattern_with_aggregation.ppl | 3 +++ .../big5/queries/script_engine_like_pattern_with_sort.ppl | 3 +++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java index c8f701edea5..d55c71ae4f5 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java @@ -76,7 +76,7 @@ public void script_engine_like_pattern_with_sort() throws IOException { String ppl = sanitize(loadExpectedQuery("script_engine_like_pattern_with_sort.ppl")); timing(summary, "script_engine_like_pattern_with_sort", ppl); } - + /** Tests deduplication by metrics.size field with sorting by timestamp.
*/ @Test public void dedup_metrics_size_field() throws IOException { diff --git a/integ-test/src/test/resources/big5/queries/rex_regex_transformation.ppl b/integ-test/src/test/resources/big5/queries/rex_regex_transformation.ppl index 27a6b2cbf58..883f585cb4a 100644 --- a/integ-test/src/test/resources/big5/queries/rex_regex_transformation.ppl +++ b/integ-test/src/test/resources/big5/queries/rex_regex_transformation.ppl @@ -1,6 +1,9 @@ /* +Extract log type and filename from file paths, calculate filename length, and sort by timestamp + { "name": "rex_regex_transformation", + "description": "Extract log type and filename from file paths, calculate filename length, and sort by timestamp", "operation-type": "search", "index": "{{index_name | default('big5')}}", "body": { @@ -22,7 +25,6 @@ } } */ -// Extract log type and filename from file paths, calculate filename length, and sort by timestamp source = big5 | rex field=log.file.path '/var/log/(?<logType>\\w+)/(?<filename>\\w+)' | eval filename_len = length(filename) diff --git a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl index 4af3f980e15..ff565213d8b 100644 --- a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl +++ b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl @@ -1,6 +1,9 @@ /* +Filter messages containing 'sshd' and aggregate count by metrics.size + { "name": "script_engine_like_pattern_with_aggregation", + "description": "Filter messages containing 'sshd' and aggregate count by metrics.size", "operation-type": "search", "index": "{{index_name | default('custom-big5')}}", "body": { diff --git a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl index b6ac4a929de..fc45c74193c 100644 ---
a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl +++ b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl @@ -1,6 +1,9 @@ /* +Filter messages containing 'sshd', sort by timestamp, and return top 10 results + { "name": "script_engine_like_pattern_with_sort", + "description": "Filter messages containing 'sshd', sort by timestamp, and return top 10 results", "operation-type": "search", "index": "{{index_name | default('big5')}}", "body": { From d6079b81b5d1a26d2ac89874f26598e6f59e36b2 Mon Sep 17 00:00:00 2001 From: Aaron Alvarez Date: Wed, 24 Dec 2025 11:44:24 -0800 Subject: [PATCH 14/16] Fixing comments Signed-off-by: Aaron Alvarez --- .../test/resources/big5/queries/rex_regex_transformation.ppl | 4 +--- .../queries/script_engine_like_pattern_with_aggregation.ppl | 4 +--- .../big5/queries/script_engine_like_pattern_with_sort.ppl | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/integ-test/src/test/resources/big5/queries/rex_regex_transformation.ppl b/integ-test/src/test/resources/big5/queries/rex_regex_transformation.ppl index 883f585cb4a..20671d9f6f8 100644 --- a/integ-test/src/test/resources/big5/queries/rex_regex_transformation.ppl +++ b/integ-test/src/test/resources/big5/queries/rex_regex_transformation.ppl @@ -1,9 +1,7 @@ +/* Extract log type and filename from file paths, calculate filename length, and sort by timestamp */ /* -Extract log type and filename from file paths, calculate filename length, and sort by timestamp - { "name": "rex_regex_transformation", - "description": "Extract log type and filename from file paths, calculate filename length, and sort by timestamp", "operation-type": "search", "index": "{{index_name | default('big5')}}", "body": { diff --git a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl index 
ff565213d8b..a8b3982bb70 100644 --- a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl +++ b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_aggregation.ppl @@ -1,9 +1,7 @@ +/* Filter messages containing 'sshd' and aggregate count by metrics.size */ /* -Filter messages containing 'sshd' and aggregate count by metrics.size - { "name": "script_engine_like_pattern_with_aggregation", - "description": "Filter messages containing 'sshd' and aggregate count by metrics.size", "operation-type": "search", "index": "{{index_name | default('custom-big5')}}", "body": { diff --git a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl index fc45c74193c..f69536b95d3 100644 --- a/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl +++ b/integ-test/src/test/resources/big5/queries/script_engine_like_pattern_with_sort.ppl @@ -1,9 +1,7 @@ +/* Filter messages containing 'sshd', sort by timestamp, and return top 10 results */ /* -Filter messages containing 'sshd', sort by timestamp, and return top 10 results - { "name": "script_engine_like_pattern_with_sort", - "description": "Filter messages containing 'sshd', sort by timestamp, and return top 10 results", "operation-type": "search", "index": "{{index_name | default('big5')}}", "body": { From 3ed2ba62391b917a91bcb916732ca660c8d1cd2a Mon Sep 17 00:00:00 2001 From: Aaron Alvarez Date: Wed, 24 Dec 2025 12:18:31 -0800 Subject: [PATCH 15/16] Empty commit to trigger CI Signed-off-by: Aaron Alvarez From c72712468b371f370818f4062e31b1a96e688fb2 Mon Sep 17 00:00:00 2001 From: Aaron Alvarez Date: Mon, 5 Jan 2026 18:56:15 -0800 Subject: [PATCH 16/16] Removing correctness check for rex command Signed-off-by: Aaron Alvarez --- .../sql/calcite/big5/CalcitePPLBig5IT.java | 2 -- .../calcite/big5/rex_regex_transformation.yaml | 14 
-------------- 2 files changed, 16 deletions(-) delete mode 100644 integ-test/src/test/resources/expectedOutput/calcite/big5/rex_regex_transformation.yaml diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java index d55c71ae4f5..d1eb26cabbe 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/big5/CalcitePPLBig5IT.java @@ -53,8 +53,6 @@ public void coalesce_nonexistent_field_fallback() throws IOException { public void rex_regex_transformation() throws IOException { String ppl = sanitize(loadExpectedQuery("rex_regex_transformation.ppl")); timing(summary, "rex_regex_transformation", ppl); - String expected = loadExpectedPlan("big5/rex_regex_transformation.yaml"); - assertYamlEqualsIgnoreId(expected, explainQueryYaml(ppl)); } /** diff --git a/integ-test/src/test/resources/expectedOutput/calcite/big5/rex_regex_transformation.yaml b/integ-test/src/test/resources/expectedOutput/calcite/big5/rex_regex_transformation.yaml deleted file mode 100644 index bfd0a77d685..00000000000 --- a/integ-test/src/test/resources/expectedOutput/calcite/big5/rex_regex_transformation.yaml +++ /dev/null @@ -1,14 +0,0 @@ -calcite: - logical: | - LogicalSystemLimit(sort0=[$4], dir0=[DESC-nulls-last], fetch=[10000], type=[QUERY_SIZE_LIMIT]) - LogicalSort(sort0=[$4], dir0=[DESC-nulls-last]) - LogicalProject(log.file.path=[$10], logType=[REX_EXTRACT($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)', 'logType')], filename=[REX_EXTRACT($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)', 'filename')], filename_len=[CHAR_LENGTH(REX_EXTRACT($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)', 'filename'))], @timestamp=[$17]) - CalciteLogicalIndexScan(table=[[OpenSearch, big5]]) - physical: | - EnumerableCalc(expr#0..1=[{inputs}], expr#2=['/var/log/(?<logType>\w+)/(?<filename>\w+)'], expr#3=['logType'], expr#4=[REX_EXTRACT($t0, $t2, $t3)], expr#5=['filename'],
expr#6=[REX_EXTRACT($t0, $t2, $t5)], expr#7=[CHAR_LENGTH($t6)], log.file.path=[$t0], $f1=[$t4], $f2=[$t6], $f3=[$t7], @timestamp=[$t1]) - CalciteEnumerableIndexScan(table=[[OpenSearch, big5]], PushDownContext=[[PROJECT->[log.file.path, @timestamp], SORT->[{ - "@timestamp" : { - "order" : "desc", - "missing" : "_last" - } - }], LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","_source":{"includes":["log.file.path","@timestamp"],"excludes":[]},"sort":[{"@timestamp":{"order":"desc","missing":"_last"}}]}, requestedTotalSize=10000, pageSize=null, startFrom=0)])