Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,38 @@ public void coalesce_nonexistent_field_fallback() throws IOException {
timing(summary, "coalesce_nonexistent_field_fallback", ppl);
}

/**
* Tests regex-based field extraction and transformation using rex command. Validates that the
* Calcite plan correctly handles regex patterns.
*/
@Test
public void rex_regex_transformation() throws IOException {
String ppl = sanitize(loadExpectedQuery("rex_regex_transformation.ppl"));
timing(summary, "rex_regex_transformation", ppl);
String expected = loadExpectedPlan("big5/rex_regex_transformation.yaml");
assertYamlEqualsIgnoreId(expected, explainQueryYaml(ppl));
Comment on lines +56 to +57
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason to do a correctness check for this one? I thought this IT was only for benchmarking?

Copy link
Contributor Author

@aalva500-prog aalva500-prog Dec 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR continues the work done here by @noCharger: #4816. I don't have the whole context, unfortunately. However, I think it is not only for benchmarking, as the same was done for the dedup command in this PR: #4991

}

/**
 * Benchmarks LIKE pattern matching combined with aggregation through the script
 * engine: rows are filtered by message content and the results are grouped.
 */
@Test
public void script_engine_like_pattern_with_aggregation() throws IOException {
  final String query =
      sanitize(loadExpectedQuery("script_engine_like_pattern_with_aggregation.ppl"));
  timing(summary, "script_engine_like_pattern_with_aggregation", query);
}

/**
 * Benchmarks LIKE pattern matching with sorting and a result limit: rows are
 * filtered by message content and ordered by timestamp.
 */
@Test
public void script_engine_like_pattern_with_sort() throws IOException {
  final String query =
      sanitize(loadExpectedQuery("script_engine_like_pattern_with_sort.ppl"));
  timing(summary, "script_engine_like_pattern_with_sort", query);
}

/** Tests deduplication by metrics.size field with sorting by timestamp. */
@Test
public void dedup_metrics_size_field() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -738,7 +738,11 @@ public void testCountBySpanForCustomFormats() throws IOException {
public void testSpanByImplicitTimestamp() throws IOException {
JSONObject result = executeQuery("source=big5 | stats count() by span(1d) as span");
verifySchema(result, schema("count()", "bigint"), schema("span", "timestamp"));
verifyDataRows(result, rows(1, "2023-01-02 00:00:00"));
verifyDataRows(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR is only to add more test queries right? Why the behavior changed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The behavior changed because the file integ-test/src/test/resources/big5/data/big5.json was modified and now has more data.

result,
rows(1, "2023-01-02 00:00:00"),
rows(1, "2023-03-01 00:00:00"),
rows(1, "2023-05-01 00:00:00"));

Throwable t =
assertThrowsWithReplace(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ public void testIsNotNull() throws IOException {
public void testIsNotNullWithStruct() throws IOException {
  // Verifies isnotnull() on a struct-typed field.
  JSONObject actual = executeQuery("source=big5 | where isnotnull(aws) | fields aws");
  verifySchema(actual, schema("aws", "struct"));
  // big5.json now contains three documents carrying the aws struct, so three
  // rows are expected. (The stale pre-diff assertion expecting 1 row is removed.)
  verifyNumOfRows(actual, 3);
}

@Test
Expand Down
4 changes: 4 additions & 0 deletions integ-test/src/test/resources/big5/data/big5.json
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
{"index":{}}
{"message":"2023-04-30T21:48:56.160Z Apr 30 21:48:56 ip-66-221-134-40 journal: donkey glazer fly shark whip servant thornfalcon","process":{"name":"journal"},"aws.cloudwatch":{"ingestion_time":"2023-04-30T21:48:56.160Z","log_group":"/var/log/messages","log_stream":"luckcrafter"},"tags":["preserve_original_event"],"meta":{"file":"2023-01-02/1682891301-gotext.ndjson.gz"},"cloud":{"region":"eu-central-1"},"@timestamp":"2023-01-02T22:02:34.000Z","input":{"type":"aws-cloudwatch"},"metrics":{"tmin":849,"size":1981},"log.file.path":"/var/log/messages/luckcrafter","event":{"id":"sunsetmark","dataset":"generic","ingested":"2023-07-20T03:36:30.223806Z"},"agent":{"id":"c315dc22-3ea6-44dc-8d56-fd02f675367b","name":"fancydancer","ephemeral_id":"c315dc22-3ea6-44dc-8d56-fd02f675367b","type":"filebeat","version":"8.8.0"}}
{"index":{}}
{"message":"2024-04-11T18:00:10.965Z Apr 11 18:00:10 ip-32-11-43-93 sshd: cloak bolt thorn hugger rib jackal wolverine shaker boar fighter taker boulderfox","process":{"name":"sshd"},"aws.cloudwatch":{"log_stream":"mirrorlighter","ingestion_time":"2024-04-11T18:00:10.965Z","log_group":"/var/log/messages"},"tags":["preserve_original_event"],"meta":{"file":"2024-04-11/1712851210-sshd.ndjson.gz"},"cloud":{"region":"ap-southeast-3"},"@timestamp":"2023-05-01T21:59:58.000Z","input":{"type":"aws-cloudwatch"},"metrics":{"size":3166,"tmin":1},"log.file.path":"/var/log/messages/mirrorlighter","event":{"id":"patternantler","ingested":"2024-04-11T17:39:10.965818973Z","dataset":"generic"},"agent":{"id":"c79a289f-6c16-4de2-a6c8-8ee5c84473d5","name":"brindlehugger","type":"filebeat","version":"8.8.0","ephemeral_id":"c79a289f-6c16-4de2-a6c8-8ee5c84473d5"}}
{"index":{}}
{"message":"2024-04-11T10:15:01.628Z Apr 11 10:15:01 ip-95-21-51-112 kernel: kicker stinger slave dolphin sparkox","process":{"name":"kernel"},"aws.cloudwatch":{"log_stream":"plumebard","ingestion_time":"2024-04-11T10:15:01.628Z","log_group":"/var/log/messages"},"tags":["preserve_original_event"],"meta":{"file":"2024-04-11/1712826901-kernel.ndjson.gz"},"cloud":{"region":"ap-south-1"},"@timestamp":"2023-03-01T22:31:11.000Z","input":{"type":"aws-cloudwatch"},"metrics":{"size":3993,"tmin":1},"log.file.path":"/var/log/messages/plumebard","event":{"id":"chipgambler","ingested":"2024-04-11T10:09:29.628941177Z","dataset":"generic"},"agent":{"id":"5f25fa16-6a99-489f-b1c5-f27c0627a459","name":"lemongrabber","type":"filebeat","version":"8.8.0","ephemeral_id":"5f25fa16-6a99-489f-b1c5-f27c0627a459"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/* Extract log type and filename from file paths, calculate filename length, and sort by timestamp */
/*
{
"name": "rex_regex_transformation",
"operation-type": "search",
"index": "{{index_name | default('big5')}}",
"body": {
"query": {
"match_all": {}
},
"_source": {
"includes": ["log.file.path", "@timestamp"],
"excludes": []
},
"sort": [
{
"@timestamp": {
"order": "desc",
"missing": "_last"
}
}
]
}
}
*/
source = big5
| rex field=log.file.path '/var/log/(?<logType>\\w+)/(?<filename>\\w+)'
| eval filename_len = length(filename)
| fields log.file.path, logType, filename, filename_len, @timestamp
| sort - @timestamp
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/* Filter messages containing 'sshd' and aggregate count by metrics.size */
/*
{
"name": "script_engine_like_pattern_with_aggregation",
"operation-type": "search",
"index": "{{index_name | default('custom-big5')}}",
"body": {
"query": {
"script": {
"script": {
"source": "{\"langType\":\"calcite\",\"script\":\"...\"}",
"lang": "opensearch_compounded_script",
"params": {
"utcTimestamp": "{{current_timestamp}}"
}
},
"boost": 1.0
}
},
"_source": {
"includes": ["message", "metrics.size"],
"excludes": []
},
"aggregations": {
"composite_buckets": {
"composite": {
"size": 10000,
"sources": [
{
"metrics.size": {
"terms": {
"field": "metrics.size",
"missing_bucket": true,
"missing_order": "first",
"order": "asc"
}
}
}
]
}
}
}
}
}
*/
source = big5
| where like(`message`, '%sshd%')
| stats count() by metrics.size
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/* Filter messages containing 'sshd', sort by timestamp, and return top 10 results */
/*
{
"name": "script_engine_like_pattern_with_sort",
"operation-type": "search",
"index": "{{index_name | default('big5')}}",
"body": {
"query": {
"script": {
"script": {
"source": "{\"langType\":\"calcite\",\"script\":\"...\"}",
"lang": "opensearch_compounded_script",
"params": {
"utcTimestamp": "{{current_timestamp}}"
}
},
"boost": 1.0
}
},
"size": 10,
"_source": {
"includes": ["agent", "process", "log", "message", "tags", "cloud", "input", "@timestamp", "ecs", "data_stream", "meta", "host", "metrics", "aws", "event"],
"excludes": []
},
"sort": [
{
"@timestamp": {
"order": "desc",
"missing": "_last"
}
}
]
}
}
*/
source = big5
| where like(`message`, '%sshd%')
| sort - @timestamp
| head 10
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
calcite:
logical: |
LogicalSystemLimit(sort0=[$4], dir0=[DESC-nulls-last], fetch=[10000], type=[QUERY_SIZE_LIMIT])
LogicalSort(sort0=[$4], dir0=[DESC-nulls-last])
LogicalProject(log.file.path=[$10], logType=[REX_EXTRACT($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)', 'logType')], filename=[REX_EXTRACT($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)', 'filename')], filename_len=[CHAR_LENGTH(REX_EXTRACT($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)', 'filename'))], @timestamp=[$17])
CalciteLogicalIndexScan(table=[[OpenSearch, big5]])
physical: |
EnumerableCalc(expr#0..1=[{inputs}], expr#2=['/var/log/(?<logType>\w+)/(?<filename>\w+)'], expr#3=['logType'], expr#4=[REX_EXTRACT($t0, $t2, $t3)], expr#5=['filename'], expr#6=[REX_EXTRACT($t0, $t2, $t5)], expr#7=[CHAR_LENGTH($t6)], log.file.path=[$t0], $f1=[$t4], $f2=[$t6], $f3=[$t7], @timestamp=[$t1])
CalciteEnumerableIndexScan(table=[[OpenSearch, big5]], PushDownContext=[[PROJECT->[log.file.path, @timestamp], SORT->[{
"@timestamp" : {
"order" : "desc",
"missing" : "_last"
}
}], LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","_source":{"includes":["log.file.path","@timestamp"],"excludes":[]},"sort":[{"@timestamp":{"order":"desc","missing":"_last"}}]}, requestedTotalSize=10000, pageSize=null, startFrom=0)])
Loading