diff --git a/docs/changelog/140982.yaml b/docs/changelog/140982.yaml new file mode 100644 index 0000000000000..1e837bcda8239 --- /dev/null +++ b/docs/changelog/140982.yaml @@ -0,0 +1,6 @@ +pr: 140982 +summary: "ESQL: Prune unused regex extract nodes in optimizer" +area: ES|QL +type: enhancement +issues: + - 132437 diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/dissect.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/dissect.csv-spec index f1318241a6c6c..3c9947bee3e37 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/dissect.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/dissect.csv-spec @@ -396,3 +396,149 @@ FROM employees n:null | abc:keyword null | null ; + + +pruneAllDissectFields +ROW message = "2023-01-23T12:15:00Z - error - 192.168.1.1" +| DISSECT message "%{date} - %{level} - %{ip}" +| KEEP message +; + +message:keyword +2023-01-23T12:15:00Z - error - 192.168.1.1 +; + + +pruneAllDissectFieldsFromEmployees +FROM employees +| EVAL full_name = CONCAT(first_name, " ", last_name) +| DISSECT full_name "%{extracted_first} %{extracted_last}" +| KEEP emp_no, full_name +| SORT emp_no +| LIMIT 3 +; + +emp_no:integer | full_name:keyword +10001 | Georgi Facello +10002 | Bezalel Simmel +10003 | Parto Bamford +; + + +pruneAllDissectFieldsViaDrop +FROM apps +| EVAL concat = CONCAT(id::string, " ", name) +| DISSECT concat "%{extracted_first} %{extracted_last}" +| DROP extracted_first, extracted_last, concat +| SORT id +| LIMIT 3 +; + +id:integer | name:keyword | version:version +1 | aaaaa | 1 +2 | bbbbb | 2.1 +3 | ccccc | 2.3.4 +; + + +pruneDissectFieldsShadowedByEval +ROW message = "2023-01-23T12:15:00Z - error - 192.168.1.1" +| DISSECT message "%{date} - %{level} - %{ip}" +| EVAL date = "overwritten", level = "overwritten", ip = "overwritten" +| KEEP message, date, level, ip +; + +message:keyword | date:keyword | level:keyword | ip:keyword +2023-01-23T12:15:00Z - error - 192.168.1.1 | overwritten | overwritten | overwritten +; + + +prunePartialDissectFieldsViaDrop +ROW message = "2023-01-23T12:15:00Z - error - 192.168.1.1" +| DISSECT message "%{date} - %{level} - %{ip}" +| DROP level, ip +; + +message:keyword | date:keyword +2023-01-23T12:15:00Z - error - 192.168.1.1 | 2023-01-23T12:15:00Z +; + + +pruneChainedDissectAndGrok +ROW message = "2023-01-23 error 192.168.1.1" +| DISSECT message "%{date} %{rest}" +| GROK rest "%{WORD:level} %{IP:ip}" +| KEEP message +; + +message:keyword +2023-01-23 error 192.168.1.1 +; + + +pruneChainedDissectAndGrokPartially +ROW message = "2023-01-23 error 192.168.1.1" +| DISSECT message "%{date} %{rest}" +| GROK rest "%{WORD:level} %{IP:ip}" +| KEEP level +; + +level:keyword +error +; + + +pruneDissectFieldRedefinedBeforeStats +FROM employees +| EVAL full_name = CONCAT(first_name, " ", last_name) +| DISSECT full_name "%{extracted_first} %{extracted_last}" +| EVAL extracted_first = "constant", extracted_last = "constant" +| STATS count = COUNT(*) BY extracted_first +; + +count:long | extracted_first:keyword +100 | constant +; + + +pruneDissectWithInlineStats +required_capability: inline_stats +required_capability: inline_stats_drop_groupings_fix +FROM employees +| EVAL full_name = CONCAT(first_name, " ", last_name) +| DISSECT full_name "%{extracted_first} %{extracted_last}" +| INLINE STATS avg_salary = AVG(salary) BY extracted_last +| KEEP emp_no, avg_salary +| SORT emp_no +| LIMIT 5 +; + +emp_no:integer | avg_salary:double +10001 | 57305.0 +10002 | 56371.0 +10003 | 61805.0 +10004 | 36174.0 +10005 | 63528.0 +; + + +pruneDissectWithLookupJoin +required_capability: join_lookup_v12 +FROM employees +| EVAL language_code = languages +| EVAL full_name = CONCAT(first_name, " ", last_name) +| DISSECT full_name "%{extracted_first} %{extracted_last}" +| LOOKUP JOIN languages_lookup ON language_code +| DROP extracted_first, extracted_last, full_name +| KEEP emp_no, language_name +| SORT emp_no +| LIMIT 5 +; + +emp_no:integer | language_name:keyword +10001 | French +10002 | null +10003 | German +10004 | null +10005 | English +; diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/grok.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/grok.csv-spec index 5d0447b88ec16..d608d5b95f3fe 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/grok.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/grok.csv-spec @@ -372,3 +372,136 @@ FROM employees n:null | abc:keyword null | null ; + +pruneAllGrokFields +ROW message = "2023-01-23T12:15:00Z 192.168.1.1 user@example.com 42" +| GROK message "%{TIMESTAMP_ISO8601:date} %{IP:ip} %{EMAILADDRESS:email} %{NUMBER:num:int}" +| KEEP message +; + +message:keyword +2023-01-23T12:15:00Z 192.168.1.1 user@example.com 42 +; + + +pruneAllGrokFieldsFromEmployees +FROM employees +| EVAL full_name = CONCAT(first_name, " ", last_name) +| GROK full_name "%{WORD:extracted_first} %{WORD:extracted_last}" +| KEEP emp_no, full_name +| SORT emp_no +| LIMIT 3 +; + +emp_no:integer | full_name:keyword +10001 | Georgi Facello +10002 | Bezalel Simmel +10003 | Parto Bamford +; + + +pruneAllGrokFieldsViaDrop +FROM apps +| EVAL concat = CONCAT(id::string, " ", name) +| GROK concat "%{WORD:extracted_first} %{WORD:extracted_last}" +| DROP extracted_first, extracted_last, concat +| SORT id +| LIMIT 3 +; + +id:integer | name:keyword | version:version +1 | aaaaa | 1 +2 | bbbbb | 2.1 +3 | ccccc | 2.3.4 +; + + +pruneGrokFieldsShadowedByEval +ROW message = "2023-01-23T12:15:00Z 192.168.1.1 user@example.com 42" +| GROK message "%{TIMESTAMP_ISO8601:date} %{IP:ip} %{EMAILADDRESS:email} %{NUMBER:num:int}" +| EVAL date = "overwritten", ip = "overwritten", email = "overwritten", num = 0 +| KEEP message, date +; + +message:keyword | date:keyword +2023-01-23T12:15:00Z 192.168.1.1 user@example.com 42 | overwritten +; + + +prunePartialGrokFieldsViaDrop +ROW message = "2023-01-23T12:15:00Z 192.168.1.1 user@example.com 42" +| GROK message "%{TIMESTAMP_ISO8601:date} %{IP:ip} %{EMAILADDRESS:email} %{NUMBER:num:int}" +| DROP ip, email, num +; + +message:keyword | date:keyword +2023-01-23T12:15:00Z 192.168.1.1 user@example.com 42 | 2023-01-23T12:15:00Z +; + + +pruneGrokFieldsUsedInFilter +ROW message = "2023-01-23T12:15:00Z 192.168.1.1 user@example.com 42" +| GROK message "%{TIMESTAMP_ISO8601:date} %{IP:ip} %{EMAILADDRESS:email} %{NUMBER:num:int}" +| WHERE ip == "192.168.1.1" +| KEEP message, ip +; + +message:keyword | ip:keyword +2023-01-23T12:15:00Z 192.168.1.1 user@example.com 42 | 192.168.1.1 +; + + +pruneGrokFieldRedefinedBeforeStats +FROM employees +| EVAL full_name = CONCAT(first_name, " ", last_name) +| GROK full_name "%{WORD:extracted_first} %{WORD:extracted_last}" +| EVAL extracted_first = "constant", extracted_last = "constant" +| STATS count = COUNT(*) BY extracted_first +; + +count:long | extracted_first:keyword +100 | constant +; + + +pruneGrokWithInlineStatsPrunedEntirely +required_capability: inline_stats +FROM employees +| EVAL full_name = CONCAT(first_name, " ", last_name) +| GROK full_name "%{WORD:extracted_first} %{WORD:extracted_last}" +| INLINE STATS avg_salary = AVG(salary) BY extracted_last +| EVAL avg_salary = emp_no +| KEEP emp_no, avg_salary +| SORT emp_no +| LIMIT 5 +; + +emp_no:integer | avg_salary:integer +10001 | 10001 +10002 | 10002 +10003 | 10003 +10004 | 10004 +10005 | 10005 +; + + +pruneGrokWithLookupJoin +required_capability: join_lookup_v12 +FROM employees +| EVAL language_code = languages +| EVAL full_name = CONCAT(first_name, " ", last_name) +| GROK full_name "%{WORD:extracted_first} %{WORD:extracted_last}" +| LOOKUP JOIN languages_lookup ON language_code +| DROP extracted_first, extracted_last, full_name +| KEEP emp_no, language_name +| SORT emp_no +| LIMIT 5 +; + +emp_no:integer | language_name:keyword +10001 | French +10002 | null +10003 | German +10004 | null +10005 | English +; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/PruneColumns.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/PruneColumns.java index 08af0d202cfd8..dc7bfcb79be9f 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/PruneColumns.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/PruneColumns.java @@ -23,6 +23,7 @@ import org.elasticsearch.xpack.esql.plan.logical.Limit; import org.elasticsearch.xpack.esql.plan.logical.LogicalPlan; import org.elasticsearch.xpack.esql.plan.logical.Project; +import org.elasticsearch.xpack.esql.plan.logical.RegexExtract; import org.elasticsearch.xpack.esql.plan.logical.Sample; import org.elasticsearch.xpack.esql.plan.logical.UnaryPlan; import org.elasticsearch.xpack.esql.plan.logical.UnionAll; @@ -86,6 +87,7 @@ private static LogicalPlan pruneColumns(LogicalPlan plan, AttributeSet.Builder u forkPresent.set(true); yield pruneColumnsInFork(fork, used); } + case RegexExtract re -> pruneUnusedRegexExtract(re, used, recheck); default -> p; }; } while (recheck.get()); @@ -271,6 +273,39 @@ private static LogicalPlan pruneColumnsInFork(Fork fork, AttributeSet.Builder us return fork; } + /** + * Prunes RegexExtract operations (Dissect and Grok) when none of their extracted fields are used. + *
+ * Partial field pruning is not supported due to a layout–operator mismatch in + * {@code LocalExecutionPlanner}: + *
+ * We also cannot simply reconcile the two sides by matching on {@code extractedFields} names, + * because {@code PushDownRegexExtract} may rename attributes to avoid variable shadowing + * (see PR #108360), while the parser still returns results keyed by the original pattern names. + *
+ */ + private static LogicalPlan pruneUnusedRegexExtract(RegexExtract re, AttributeSet.Builder used, Holder{@code
+ * LimitExec[1000[INTEGER],12]
+ * \_AggregateExec[[language_code{r}#8],[COUNT(emp_no{r}#32,true[BOOLEAN],PT0S[TIME_DURATION]) AS c#18, language_code{r}#8],FINAL,[l
+ * anguage_code{r}#8, $$c$count{r}#33, $$c$seen{r}#34],12]
+ * \_ExchangeExec[[language_code{r}#8, $$c$count{r}#33, $$c$seen{r}#34],true]
+ * \_AggregateExec[[language_code{r}#8],[COUNT(emp_no{r}#32,true[BOOLEAN],PT0S[TIME_DURATION]) AS c#18, language_code{r}#8],INITIAL,
+ * [language_code{r}#8, $$c$count{r}#35, $$c$seen{r}#36],12]
+ * \_MvExpandExec[emp_no{f}#19,emp_no{r}#32]
+ * \_ProjectExec[[emp_no{f}#19, languages{r}#22 AS language_code#8]]
+ * \_FieldExtractExec[emp_no{f}#19]<[],[]>
+ * \_EvalExec[[null[INTEGER] AS languages#22]]
+ * \_EsQueryExec[test], indexMode[standard], [_doc{f}#37], limit[], sort[] estimatedRowSize[12] queryBuilderAndTags
+ * [[QueryBuilderAndTags[query=null, tags=[]]]]
+ * }
*/
public void testMissingFieldsPurgesTheJoinLocallyThroughCommands() {
var stats = EsqlTestUtils.statsForMissingField("languages");
@@ -925,8 +928,7 @@ public void testMissingFieldsPurgesTheJoinLocallyThroughCommands() {
var exchange = as(agg.child(), ExchangeExec.class);
agg = as(exchange.child(), AggregateExec.class);
- var grok = as(agg.child(), GrokExec.class);
- var mvexpand = as(grok.child(), MvExpandExec.class);
+ var mvexpand = as(agg.child(), MvExpandExec.class);
var project = as(mvexpand.child(), ProjectExec.class);
var extract = as(project.child(), FieldExtractExec.class);
var eval = as(extract.child(), EvalExec.class);
diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/PruneColumnsTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/PruneColumnsTests.java
index 8ee1936faf0e2..9989cf9eca617 100644
--- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/PruneColumnsTests.java
+++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/PruneColumnsTests.java
@@ -28,6 +28,7 @@
import org.elasticsearch.xpack.esql.plan.logical.LogicalPlan;
import org.elasticsearch.xpack.esql.plan.logical.Project;
import org.elasticsearch.xpack.esql.plan.logical.TopN;
+import org.elasticsearch.xpack.esql.plan.logical.inference.Rerank;
import org.elasticsearch.xpack.esql.plan.logical.join.InlineJoin;
import org.elasticsearch.xpack.esql.plan.logical.join.Join;
import org.elasticsearch.xpack.esql.plan.logical.join.LookupJoin;
@@ -468,27 +469,25 @@ public void testTripleInlineStatsGetsPrunedPartially() {
var stub = as(agg.child(), StubRelation.class);
}
- /*
- * Project[[emp_no{f}#26, salaryK{r}#4, count{r}#6, min{r}#19]]
- * \_TopN[[Order[emp_no{f}#26,ASC,LAST]],5[INTEGER]]
- * \_InlineJoin[LEFT,[salaryK{r}#4],[salaryK{r}#4]]
- * |_Project[[_meta_field{f}#32, emp_no{f}#26, first_name{f}#27, gender{f}#28, hire_date{f}#33, job{f}#34, job.raw{f}#35,
- * languages{f}#29, last_name{f}#30, long_noidx{f}#36, salary{f}#31, count{r}#6, salaryK{r}#4, hire_date_string{r}#10,
- * date{r}#15]]
- * | \_Dissect[hire_date_string{r}#10,Parser[pattern=%{date}, appendSeparator=,
- * parser=org.elasticsearch.dissect.DissectParser@77d1afc3],[date{r}#15]] <-- TODO: Dissect & Eval could/should be dropped
- * | \_Eval[[TOSTRING(hire_date{f}#33) AS hire_date_string#10]]
- * | \_InlineJoin[LEFT,[salaryK{r}#4],[salaryK{r}#4]]
- * | |_Eval[[salary{f}#31 / 10000[INTEGER] AS salaryK#4]]
- * | | \_EsRelation[test][_meta_field{f}#32, emp_no{f}#26, first_name{f}#27, ..]
- * | \_Aggregate[[salaryK{r}#4],[COUNT(*[KEYWORD],true[BOOLEAN]) AS count#6, salaryK{r}#4]]
- * | \_StubRelation[[_meta_field{f}#32, emp_no{f}#26, first_name{f}#27, gender{f}#28, hire_date{f}#33, job{f}#34,
- * job.raw{f}#35, languages{f}#29, last_name{f}#30, long_noidx{f}#36, salary{f}#31, salaryK{r}#4]]
- * \_Aggregate[[salaryK{r}#4],[MIN($$MV_COUNT(langua>$MIN$0{r$}#37,true[BOOLEAN]) AS min#19, salaryK{r}#4]]
- * \_Eval[[MVCOUNT(languages{f}#29) AS $$MV_COUNT(langua>$MIN$0#37]]
- * \_StubRelation[[_meta_field{f}#32, emp_no{f}#26, first_name{f}#27, gender{f}#28, hire_date{f}#33, job{f}#34, job.raw{f}#35,
- * languages{f}#29, last_name{f}#30, long_noidx{f}#36, salary{f}#31, count{r}#6, salaryK{r}#4, sum{r}#13,
- * hire_date_string{r}#10, date{r}#15, $$MV_COUNT(langua>$MIN$0{r$}#37]]
+ /**
+ * Expects
+ * {@code
+ * Project[[emp_no{f}#27, salaryK{r}#5, count{r}#7, min{r}#20]]
+ * \_TopN[[Order[emp_no{f}#27,ASC,LAST]],5[INTEGER],false]
+ * \_InlineJoin[LEFT,[salaryK{r}#5],[salaryK{r}#5]]
+ * |_Project[[emp_no{f}#27, languages{f}#30, count{r}#7, salaryK{r}#5]]
+ * | \_InlineJoin[LEFT,[salaryK{r}#5],[salaryK{r}#5]]
+ * | |_Eval[[salary{f}#32 / 10000[INTEGER] AS salaryK#5]]
+ * | | \_EsRelation[employees][_meta_field{f}#33, emp_no{f}#27, first_name{f}#28, ..]
+ * | \_Aggregate[[salaryK{r}#5],[COUNT(*[KEYWORD],true[BOOLEAN],PT0S[TIME_DURATION]) AS count#7, salaryK{r}#5]]
+ * | \_StubRelation[[_meta_field{f}#33, emp_no{f}#27, first_name{f}#28, gender{f}#29, hire_date{f}#34, job{f}#35, job.raw{f}#36,
+ * languages{f}#30, last_name{f}#31, long_noidx{f}#37, salary{f}#32, salaryK{r}#5]]
+ * \_Aggregate[[salaryK{r}#5],[MIN($$MV_COUNT(langua>$MIN$0{r$}#38,true[BOOLEAN],PT0S[TIME_DURATION]) AS min#20, salaryK{r}#5]]
+ * \_Eval[[MVCOUNT(languages{f}#30) AS $$MV_COUNT(langua>$MIN$0#38]]
+ * \_StubRelation[[_meta_field{f}#33, emp_no{f}#27, first_name{f}#28, gender{f}#29, hire_date{f}#34, job{f}#35, job.raw{f}#36,
+ * languages{f}#30, last_name{f}#31, long_noidx{f}#37, salary{f}#32, count{r}#7, salaryK{r}#5, sum{r}#14,
+ * hire_date_string{r}#11, date{r}#16, $$MV_COUNT(langua>$MIN$0{r$}#38]]
+ * }
*/
public void testTripleInlineStatsMultipleAssignmentsGetsPrunedPartially() {
// TODO: reenable 1st sort, pull the 2nd further up when #132417 is in
@@ -533,14 +532,13 @@ public void testTripleInlineStatsMultipleAssignmentsGetsPrunedPartially() {
assertThat(Expressions.names(outerinline.output()), is(expectedOutterOutput));
// outer left
var outerProject = as(outerinline.left(), Project.class);
- var dissect = as(outerProject.child(), Dissect.class);
- var eval = as(dissect.child(), Eval.class);
- var innerinline = as(eval.child(), InlineJoin.class);
+ // Dissect and Eval(hire_date_string) have been pruned since 'date' field is not used
+ var innerinline = as(outerProject.child(), InlineJoin.class);
var expectedInnerOutput = new ArrayList<>(employeesFields);
expectedInnerOutput.addAll(List.of("count", "salaryK"));
assertThat(Expressions.names(innerinline.output()), is(expectedInnerOutput));
// inner left
- eval = as(innerinline.left(), Eval.class);
+ var eval = as(innerinline.left(), Eval.class);
var relation = as(eval.child(), EsRelation.class);
// inner right
var agg = as(innerinline.right(), Aggregate.class);
@@ -551,13 +549,13 @@ public void testTripleInlineStatsMultipleAssignmentsGetsPrunedPartially() {
stub = as(eval.child(), StubRelation.class);
}
- /*
- * Project[[emp_no{f}#917]]
- * \_TopN[[Order[emp_no{f}#917,ASC,LAST]],5[INTEGER]]
- * \_Dissect[hire_date_string{r}#898,Parser[pattern=%{date}, appendSeparator=,
- * parser=org.elasticsearch.dissect.DissectParser@46132aa7],[date{r}#903]] <-- TODO: Dissect & Eval could/should be dropped
- * \_Eval[[TOSTRING(hire_date{f}#918) AS hire_date_string#898]]
- * \_EsRelation[employees][emp_no{f}#917, hire_date{f}#918, languages{f}#913, ..]
+ /**
+ * Expects
+ * {@code
+ * Project[[emp_no{f}#24]]
+ * \_TopN[[Order[emp_no{f}#24,ASC,LAST]],5[INTEGER],false]
+ * \_EsRelation[employees][_meta_field{f}#30, emp_no{f}#24, first_name{f}#25, ..]
+ * }
*/
public void testTripleInlineStatsMultipleAssignmentsGetsPrunedEntirely() {
// same as the above query, but only keep emp_no
@@ -582,9 +580,8 @@ public void testTripleInlineStatsMultipleAssignmentsGetsPrunedEntirely() {
var project = as(plan, Project.class);
assertThat(Expressions.names(project.projections()), is(List.of("emp_no")));
var topN = as(project.child(), TopN.class);
- var dissect = as(topN.child(), Dissect.class);
- var eval = as(dissect.child(), Eval.class);
- var relation = as(eval.child(), EsRelation.class);
+ // Dissect and Eval(hire_date_string) have been pruned since 'date' field is not used
+ var relation = as(topN.child(), EsRelation.class);
}
/*
@@ -958,20 +955,17 @@ public void testPruneColumnsInForkBranchesDropNestedEvalsFromOutputIfNotNeeded()
}
/**
+ * {@code
* Limit[10000[INTEGER],false,false]
* \_Aggregate[[],[COUNT(*[KEYWORD],true[BOOLEAN],PT0S[TIME_DURATION]) AS count(*)#36, COUNT(emp_no{r}#109,true[BOOLEAN],PT0S[
- * TIME_DURATION]) AS d#39,MAX(_fork{r}#124,true[BOOLEAN],PT0S[TIME_DURATION]) AS m#42,
- * COUNT(s{r}#119,true[BOOLEAN],PT0S[TIME_DURATION]) AS ls#45]]
+ * TIME_DURATION]) AS d#39, MAX(_fork{r}#124,true[BOOLEAN],PT0S[TIME_DURATION]) AS m#42, COUNT(s{r}#119,true[BOOLEAN],PT0S[
+ * TIME_DURATION]) AS ls#45]]
* \_Fork[[emp_no{r}#109, s{r}#119, _fork{r}#124]]
* |_Project[[emp_no{f}#46, s{r}#6, _fork{r}#7]]
- * | \_Eval[[fork1[KEYWORD] AS _fork#7]]
- * | \_Dissect[a{r}#21,Parser[pattern=%{x} %{y} %{z}, appendSeparator=, parser=org.elasticsearch.dissect.DissectParser@53c057b
- * 1],[x{r}#22, y{r}#23, z{r}#24]]
- * | \_Eval[[10[INTEGER] AS s#6, CONCAT(first_name{f}#47, [KEYWORD],TOSTRING(emp_no{f}#46), [KEYWORD],last_name{f}#50) AS
- * a#21]]
- * | \_Limit[1000[INTEGER],false,false]
- * | \_Filter[IN(10048[INTEGER],10081[INTEGER],emp_no{f}#46)]
- * | \_EsRelation[employees][_meta_field{f}#52, emp_no{f}#46, first_name{f}#47, ..]
+ * | \_Eval[[10[INTEGER] AS s#6, fork1[KEYWORD] AS _fork#7]]
+ * | \_Limit[1000[INTEGER],false,false]
+ * | \_Filter[IN(10048[INTEGER],10081[INTEGER],emp_no{f}#46)]
+ * | \_EsRelation[employees][_meta_field{f}#52, emp_no{f}#46, first_name{f}#47, ..]
* |_Project[[emp_no{r}#91, s{r}#101, _fork{r}#7]]
* | \_Eval[[fork2[KEYWORD] AS _fork#7, null[INTEGER] AS emp_no#91, null[INTEGER] AS s#101]]
* | \_Limit[1000[INTEGER],false,false]
@@ -987,6 +981,7 @@ public void testPruneColumnsInForkBranchesDropNestedEvalsFromOutputIfNotNeeded()
* \_Limit[1000[INTEGER],false,false]
* \_Filter[IN(10048[INTEGER],10081[INTEGER],emp_no{f}#79)]
* \_EsRelation[employees][_meta_field{f}#85, emp_no{f}#79, first_name{f}#80, ..]
+ * }
*/
public void testPruneColumnsInForkBranchesPruneIfAggregation() {
var query = """
@@ -1013,12 +1008,10 @@ public void testPruneColumnsInForkBranchesPruneIfAggregation() {
var firstBranch = fork.children().getFirst();
var firstBranchProject = as(firstBranch, Project.class);
assertThat(firstBranchProject.projections().size(), equalTo(3));
+ // Dissect and Eval(a) have been pruned since x, y, z fields are not used in the final aggregation
var evalInFirstBranch = as(firstBranchProject.child(), Eval.class);
- var dissect = as(evalInFirstBranch.child(), Dissect.class);
- assertThat(Expressions.names(dissect.extractedFields()), containsInAnyOrder("x", "y", "z"));
- var evalAfterDissect = as(dissect.child(), Eval.class);
- assertThat(Expressions.names(evalAfterDissect.fields()), containsInAnyOrder("a", "s"));
- var limitInFirstBranch = as(evalAfterDissect.child(), Limit.class);
+ assertThat(Expressions.names(evalInFirstBranch.fields()), containsInAnyOrder("s", "_fork"));
+ var limitInFirstBranch = as(evalInFirstBranch.child(), Limit.class);
var filterInFirstBranch = as(limitInFirstBranch.child(), Filter.class);
var firstBranchRelation = as(filterInFirstBranch.child(), EsRelation.class);
assertThat(
@@ -1029,6 +1022,7 @@ public void testPruneColumnsInForkBranchesPruneIfAggregation() {
var secondBranch = fork.children().get(1);
var secondBranchProject = as(secondBranch, Project.class);
assertThat(secondBranchProject.projections().size(), equalTo(3));
+ // x, y, z from STATS are pruned since they are not used in the final aggregation
var evalInSecondBranch = as(secondBranchProject.child(), Eval.class);
var limitInSecondBranch = as(evalInSecondBranch.child(), Limit.class);
var localRelation = as(limitInSecondBranch.child(), LocalRelation.class);
@@ -1041,6 +1035,7 @@ public void testPruneColumnsInForkBranchesPruneIfAggregation() {
var thirdBranchProject = as(thirdBranch, Project.class);
assertThat(thirdBranchProject.projections().size(), equalTo(3));
assertThat(Expressions.names(thirdBranchProject.projections()), containsInAnyOrder("emp_no", "s", "_fork"));
+ // x (= last_name) is pruned since it is not used in the final aggregation
var evalInThirdBranch = as(thirdBranchProject.child(), Eval.class);
var topNInThirdBranch = as(evalInThirdBranch.child(), TopN.class);
var evalBeforeTopN = as(topNInThirdBranch.child(), Eval.class);
@@ -1055,6 +1050,7 @@ public void testPruneColumnsInForkBranchesPruneIfAggregation() {
var fourthBranchProject = as(fourthBranch, Project.class);
assertThat(fourthBranchProject.projections().size(), equalTo(3));
assertThat(Expressions.names(fourthBranchProject.projections()), containsInAnyOrder("emp_no", "s", "_fork"));
+ // x (= "abc") and y (= "aaa") are pruned since they are not used in the final aggregation
var evalInFourthBranch = as(fourthBranchProject.child(), Eval.class);
var limitInFourthBranch = as(evalInFourthBranch.child(), Limit.class);
var filterInFourthBranch = as(limitInFourthBranch.child(), Filter.class);
@@ -1701,16 +1697,16 @@ public void testPruneColumnsInProject_WithLookupJoin() {
/*
* Limit[1000[INTEGER],false,false]
- * \_Project[[id{f}#12]]
- * \_Dissect[x{r}#5,Parser[pattern=%{foo}, appendSeparator=, parser=org.elasticsearch.dissect.DissectParser@18e5d3b5],[foo{r}#6]]
- * \_Project[[id{f}#12, $$languages$converted_to$keyword{f$}#14, $$languages$converted_to$keyword{f$}#14 AS x#5]]
- * \_EsRelation[union_types_index*][!first_name, id{f}#12, !languages, !last_name, !sal..]
+ * \_Project[[id{f}#14]]
+ * \_Rerank[reranking-inference-id[KEYWORD],1000[INTEGER],test[KEYWORD],[x{r}#5 AS x#7],_score{m}#16]
+ * \_Project[[id{f}#14, $$languages$converted_to$keyword{f$}#15, $$languages$converted_to$keyword{f$}#15 AS x#5]]
+ * \_EsRelation[union_types_index*][!first_name, id{f}#14, !languages, !last_name, !sal..]
*/
- public void testPruneColumnsInProject_WithDissect() {
+ public void testPruneColumnsInProject_WithGeneratingPlan_Rerank() {
var query = """
from union_types_index*
| eval x = languages::string
- | dissect x "%{foo}"
+ | rerank "test" ON x WITH { "inference_id" : "reranking-inference-id" }
| keep id
""";
@@ -1722,18 +1718,18 @@ public void testPruneColumnsInProject_WithDissect() {
/*
* Limit[1000[INTEGER],false,false]
- * \_Project[[id{f}#10]]
- * \_Dissect[x{r}#5,Parser[pattern=%{foo}, appendSeparator=, parser=org.elasticsearch.dissect.DissectParser@16d0b308],[foo{r}#6]]
- * \_Project[[!first_name, id{f}#10, !languages, !last_name, !salary_change, $$languages$converted_to$keyword{f$}#14, $$lan
- * guages$converted_to$keyword{f$}#14 AS x#5]]
- * \_EsRelation[union_types_index*][!first_name, id{f}#10, !languages, !last_name, !sal..]
+ * \_Project[[id{f}#13]]
+ * \_Rerank[reranking-inference-id[KEYWORD],1000[INTEGER],test[KEYWORD],[x{r}#5 AS x#7],_score{m}#16]
+ * \_Project[[!first_name, id{f}#13, !languages, !last_name, !salary_change, $$languages$converted_to$keyword{f$}#15, $$lan
+ * guages$converted_to$keyword{f$}#15 AS x#5]]
+ * \_EsRelation[union_types_index*][!first_name, id{f}#13, !languages, !last_name, !sal..]
*/
Limit topLimit = as(prunedEval, Limit.class);
Project topProject = as(topLimit.child(), Project.class);
assertThat(topProject.projections().size(), equalTo(1));
assertThat(Expressions.names(topProject.projections()), contains("id"));
- Dissect dissect = as(topProject.child(), Dissect.class);
- Project project = as(dissect.child(), Project.class);
+ Rerank rerank = as(topProject.child(), Rerank.class);
+ Project project = as(rerank.child(), Project.class);
assertThat(project.projections().size(), equalTo(7));
assertThat(
Expressions.names(project.projections()),
@@ -1746,8 +1742,8 @@ public void testPruneColumnsInProject_WithDissect() {
topProject = as(topLimit.child(), Project.class);
assertThat(topProject.projections().size(), equalTo(1));
assertThat(Expressions.names(topProject.projections()), contains("id"));
- dissect = as(topProject.child(), Dissect.class);
- project = as(dissect.child(), Project.class);
+ rerank = as(topProject.child(), Rerank.class);
+ project = as(rerank.child(), Project.class);
assertThat(project.projections().size(), equalTo(3));
assertThat(Expressions.names(project.projections()), contains("id", "$$languages$converted_to$keyword", "x"));
EsRelation relation = as(project.child(), EsRelation.class);
@@ -1773,4 +1769,657 @@ public static void assertCommonIncompatibleDataTypesEsRelation(EsRelation relati
assertThat(relationOutput.get(4).name(), equalTo("salary_change"));
assertThat(relationOutput.get(4).dataType(), equalTo(UNSUPPORTED));
}
+
+ /**
+ * Expects
+ * {@code
+ * Project[[date{r}#60]]
+ * \_Dissect[message{r}#59,Parser[pattern=%{date} - %{level} - %{ip}, appendSeparator=, parser=org.elasticsearch.dissect.Dis
+ * sectParser@981fdf8],[date{r}#60, level{r}#61, ip{r}#62]]
+ * \_Limit[1000[INTEGER],false,false]
+ * \_LocalRelation[[message{r}#59],Page{blocks=[BytesRefVectorBlock[vector=ConstantBytesRefVector[positions=1, value=[32 30 32 33
+ * 2d 30 31 2d 32 33 54 31 32 3a 31 35 3a 30 30 5a 20 2d 20 65 72 72 6f 72 20 2d 20 31 39 32 2e 31 36 38 2e 31 2e 31]]]]}]
+ * }
+ */
+ public void testCannotPartiallyPruneDissectFields() {
+ var plan = plan("""
+ ROW message = "2023-01-23T12:15:00Z - error - 192.168.1.1"
+ | DISSECT message "%{date} - %{level} - %{ip}"
+ | KEEP date
+ """);
+
+ var project = as(plan, Project.class);
+ assertThat(Expressions.names(project.projections()), is(List.of("date")));
+ var dissect = as(project.child(), Dissect.class);
+ // partial pruning is not supported, all fields remain
+ assertThat(Expressions.names(dissect.extractedFields()), is(List.of("date", "level", "ip")));
+ var limit = as(dissect.child(), Limit.class);
+ as(limit.child(), LocalRelation.class);
+ }
+
+ /**
+ * Expects
+ * {@code
+ * Project[[message{r}#46]]
+ * \_Limit[1000[INTEGER],false,false]
+ * \_LocalRelation[[message{r}#46],Page{blocks=[BytesRefVectorBlock[vector=ConstantBytesRefVector[positions=1, value=...]]]}]
+ * }
+ */
+ public void testPruneAllDissectFields() {
+ var plan = plan("""
+ ROW message = "2023-01-23T12:15:00Z - error - 192.168.1.1"
+ | DISSECT message "%{date} - %{level} - %{ip}"
+ | KEEP message
+ """);
+
+ var project = as(plan, Project.class);
+ assertThat(Expressions.names(project.projections()), contains("message"));
+ var limit = as(project.child(), Limit.class);
+ // Dissect is completely removed since 'date', 'level' and 'ip' are not used
+ as(limit.child(), LocalRelation.class);
+ }
+
+ /**
+ * Expects
+ * {@code
+ * Limit[1000[INTEGER],false,false]
+ * \_Aggregate[[extracted_last{r}#19],[COUNT(*[KEYWORD],true[BOOLEAN],PT0S[TIME_DURATION]) AS count#23, extracted_last{r}#19]]
+ * \_Dissect[full_name{r}#17,Parser[pattern=%{extracted_first} %{extracted_last}, appendSeparator=, parser=org.elasticsearch
+ * .dissect.DissectParser@43bbc401],[extracted_first{r}#18, extracted_last{r}#19]]
+ * \_Eval[[CONCAT(first_name{f}#25, [KEYWORD],last_name{f}#28) AS full_name#17]]
+ * \_EsRelation[test][_meta_field{f}#30, emp_no{f}#24, first_name{f}#25, ..]
+ * }
+ */
+ public void testCannotPartiallyPruneDissectFieldsInAgg() {
+ var plan = plan("""
+ FROM test
+ | EVAL full_name = CONCAT(first_name, " ", last_name)
+ | DISSECT full_name "%{extracted_first} %{extracted_last}"
+ | STATS count = COUNT(*) BY extracted_last
+ """);
+
+ var limit = as(plan, Limit.class);
+ var agg = as(limit.child(), Aggregate.class);
+ assertThat(Expressions.names(agg.aggregates()), is(List.of("count", "extracted_last")));
+ var dissect = as(agg.child(), Dissect.class);
+ // partial pruning is not supported, all fields remain
+ assertThat(Expressions.names(dissect.extractedFields()), is(List.of("extracted_first", "extracted_last")));
+ var eval = as(dissect.child(), Eval.class);
+ assertThat(Expressions.names(eval.fields()), is(List.of("full_name")));
+ as(eval.child(), EsRelation.class);
+ }
+
+ /**
+ * Expects
+ * {@code
+ * Project[[date{r}#99]]
+ * \_Grok[message{r}#93,Parser[pattern=%{TIMESTAMP_ISO8601:date} %{IP:ip} %{EMAILADDRESS:email} %{NUMBER:num:int}, grok=o
+ * rg.elasticsearch.grok.Grok@2991a7bb],[date{r}#99, email{r}#100, ip{r}#101, num{r}#102]]
+ * \_Limit[1000[INTEGER],false,false]
+ * \_LocalRelation[[message{r}#93],Page{blocks=[BytesRefVectorBlock[vector=ConstantBytesRefVector[positions=1, value=[32 30 32 33
+ * 2d 30 31 2d 32 33 54 31 32 3a 31 35 3a 30 30 5a 20 31 39 32 2e 31 36 38 2e 31 2e 31 20 75 73 65 72 40 65 78 61 6d 70 6c 65 2e 63 6f
+ * 6d 20 34 32]]]]}]
+ * }
+ */
+ public void testCannotPartiallyPruneGrokFields() {
+ var plan = plan("""
+ ROW message = "2023-01-23T12:15:00Z 192.168.1.1 user@example.com 42"
+ | GROK message "%{TIMESTAMP_ISO8601:date} %{IP:ip} %{EMAILADDRESS:email} %{NUMBER:num:int}"
+ | KEEP date
+ """);
+
+ var project = as(plan, Project.class);
+ assertThat(Expressions.names(project.projections()), is(List.of("date")));
+ var grok = as(project.child(), Grok.class);
+ // partial pruning is not supported, all fields remain
+ assertThat(Expressions.names(grok.extractedFields()), is(List.of("date", "email", "ip", "num")));
+ var limit = as(grok.child(), Limit.class);
+ as(limit.child(), LocalRelation.class);
+ }
+
+ /**
+ * Expects
+ * {@code
+ * Project[[message{r}#53]]
+ * \_Limit[1000[INTEGER],false,false]
+ * \_LocalRelation[[message{r}#53],Page{blocks=[BytesRefVectorBlock[vector=ConstantBytesRefVector[positions=1, value=...]]]}]
+ * }
+ */
+ public void testPruneAllGrokFields() {
+ var plan = plan("""
+ ROW message = "2023-01-23T12:15:00Z 192.168.1.1 user@example.com 42"
+ | GROK message "%{TIMESTAMP_ISO8601:date} %{IP:ip} %{EMAILADDRESS:email} %{NUMBER:num:int}"
+ | KEEP message
+ """);
+
+ var project = as(plan, Project.class);
+ assertThat(Expressions.names(project.projections()), contains("message"));
+ var limit = as(project.child(), Limit.class);
+ // Grok is completely removed since 'date', 'ip', 'email' and 'num' are not used
+ as(limit.child(), LocalRelation.class);
+ }
+
+ /**
+ * Expects
+ * {@code
+ * Limit[1000[INTEGER],false,false]
+ * \_Aggregate[[extracted_last{r}#43],[COUNT(*[KEYWORD],true[BOOLEAN],PT0S[TIME_DURATION]) AS count#46, extracted_last{r}#43]]
+ * \_Grok[full_name{r}#38,Parser[pattern=%{WORD:extracted_first} %{WORD:extracted_last}, grok=org.elasticsearch.grok.Grok
+ * @1df7d04d],[extracted_first{r}#42, extracted_last{r}#43]]
+ * \_Eval[[CONCAT(first_name{f}#48, [KEYWORD],last_name{f}#51) AS full_name#38]]
+ * \_EsRelation[test][_meta_field{f}#53, emp_no{f}#47, first_name{f}#48, ..]
+ * }
+ */
+ public void testCannotPartiallyPruneGrokFieldsInAgg() {
+ var plan = plan("""
+ FROM test
+ | EVAL full_name = CONCAT(first_name, " ", last_name)
+ | GROK full_name "%{WORD:extracted_first} %{WORD:extracted_last}"
+ | STATS count = COUNT(*) BY extracted_last
+ """);
+
+ var limit = as(plan, Limit.class);
+ var agg = as(limit.child(), Aggregate.class);
+ assertThat(Expressions.names(agg.aggregates()), is(List.of("count", "extracted_last")));
+ var grok = as(agg.child(), Grok.class);
+ // partial pruning is not supported, all fields remain
+ assertThat(Expressions.names(grok.extractedFields()), is(List.of("extracted_first", "extracted_last")));
+ var eval = as(grok.child(), Eval.class);
+ assertThat(Expressions.names(eval.fields()), is(List.of("full_name")));
+ as(eval.child(), EsRelation.class);
+ }
+
+ /**
+ * Expects
+ *
+ * {@code
+ * Project[[_meta_field{f}#300, emp_no{f}#294, first_name{f}#295, gender{f}#296, hire_date{f}#301, job{f}#302, job.raw{f}
+ * #303, languages{f}#297, last_name{f}#298, long_noidx{f}#304, salary{f}#299]]
+ * \_Limit[1000[INTEGER],false,false]
+ * \_EsRelation[test][_meta_field{f}#300, emp_no{f}#294, first_name{f}#29..]
+ * }
+ */
+ public void testPruneDissectFieldsViaDrop() {
+ var plan = plan("""
+ FROM test
+ | EVAL full_name = CONCAT(first_name, " ", last_name)
+ | DISSECT full_name "%{extracted_first} %{extracted_last}"
+ | DROP extracted_first, extracted_last, full_name
+ """);
+
+ var project = as(plan, Project.class);
+ var projections = Expressions.names(project.projections());
+ assertThat(projections.contains("extracted_first"), is(false));
+ assertThat(projections.contains("extracted_last"), is(false));
+ assertThat(projections.contains("full_name"), is(false));
+ var limit = as(project.child(), Limit.class);
+ as(limit.child(), EsRelation.class);
+ }
+
+ /**
+ * Expects
+ *
+ * {@code
+ * Project[[_meta_field{f}#265, emp_no{f}#259, first_name{f}#260, gender{f}#261, hire_date{f}#266, job{f}#267, job.raw{f}
+ * #268, languages{f}#262, last_name{f}#263, long_noidx{f}#269, salary{f}#264]]
+ * \_Limit[1000[INTEGER],false,false]
+ * \_EsRelation[test][_meta_field{f}#265, emp_no{f}#259, first_name{f}#26..]
+ * }
+ */
+ public void testPruneGrokFieldsViaDrop() {
+ var plan = plan("""
+ FROM test
+ | EVAL full_name = CONCAT(first_name, " ", last_name)
+ | GROK full_name "%{WORD:extracted_first} %{WORD:extracted_last}"
+ | DROP extracted_first, extracted_last, full_name
+ """);
+
+ var project = as(plan, Project.class);
+ var projections = Expressions.names(project.projections());
+ assertThat(projections.contains("extracted_first"), is(false));
+ assertThat(projections.contains("extracted_last"), is(false));
+ assertThat(projections.contains("full_name"), is(false));
+ var limit = as(project.child(), Limit.class);
+ as(limit.child(), EsRelation.class);
+ }
+
+ /**
+ * Expects
+ *
+ * {@code
+ * Project[[message{r}#142, date{r}#143]]
+ * \_Dissect[message{r}#142,Parser[pattern=%{date} - %{level} - %{ip}, appendSeparator=, parser=org.elasticsearch.dissect.Di
+ * ssectParser@223d640a],[date{r}#143, level{r}#144, ip{r}#145]]
+ * \_Limit[1000[INTEGER],false,false]
+ * \_LocalRelation[[message{r}#142],Page{blocks=[BytesRefVectorBlock[vector=ConstantBytesRefVector[positions=1, value=[32 30 32 33
+ * 2d 30 31 2d 32 33 54 31 32 3a 31 35 3a 30 30 5a 20 2d 20 65 72 72 6f 72 20 2d 20 31 39 32 2e 31 36 38 2e 31 2e 31]]]]}]
+ * }
+ */
+ public void testCannotPartiallyPruneDissectFieldsViaDrop() {
+ var plan = plan("""
+ ROW message = "2023-01-23T12:15:00Z - error - 192.168.1.1"
+ | DISSECT message "%{date} - %{level} - %{ip}"
+ | DROP level, ip
+ """);
+
+ var project = as(plan, Project.class);
+ assertThat(project.projections(), hasSize(2));
+ assertThat(Expressions.names(project.projections()), is(List.of("message", "date")));
+ var dissect = as(project.child(), Dissect.class);
+ // partial pruning is not supported, all fields remain
+ assertThat(dissect.extractedFields(), hasSize(3));
+ assertThat(Expressions.names(dissect.extractedFields()), is(List.of("date", "level", "ip")));
+ var limit = as(dissect.child(), Limit.class);
+ as(limit.child(), LocalRelation.class);
+ }
+
+ /**
+ * Expects
+ *
+ * {@code
+ * Project[[message{r}#77, date{r}#83, level{r}#85, ip{r}#87]]
+ * \_Eval[[overwritten[KEYWORD] AS date#83, overwritten[KEYWORD] AS level#85, overwritten[KEYWORD] AS ip#87]]
+ * \_Limit[1000[INTEGER],false,false]
+ * \_LocalRelation[[message{r}#77],Page{blocks=[BytesRefVectorBlock[vector=ConstantBytesRefVector[positions=1, value=[32 30 32 33
+ * 2d 30 31 2d 32 33 54 31 32 3a 31 35 3a 30 30 5a 20 2d 20 65 72 72 6f 72 20 2d 20 31 39 32 2e 31 36 38 2e 31 2e 31]]]]}]
+ * }
+ */
+ public void testPruneDissectFieldsShadowedByEval() {
+ var plan = plan("""
+ ROW message = "2023-01-23T12:15:00Z - error - 192.168.1.1"
+ | DISSECT message "%{date} - %{level} - %{ip}"
+ | EVAL date = "overwritten", level = "overwritten", ip = "overwritten"
+ | KEEP message, date, level, ip
+ """);
+
+ var project = as(plan, Project.class);
+ assertThat(project.projections(), hasSize(4));
+ assertThat(Expressions.names(project.projections()), contains("message", "date", "level", "ip"));
+ var eval = as(project.child(), Eval.class);
+ assertThat(eval.fields(), hasSize(3));
+ assertThat(Expressions.names(eval.fields()), contains("date", "level", "ip"));
+ var limit = as(eval.child(), Limit.class);
+ // Dissect is completely removed since all its fields are overwritten by EVAL
+ as(limit.child(), LocalRelation.class);
+ }
+
+ /**
+ * Expects
+ *
+ * {@code
+ * Project[[message{r}#227, date{r}#238]]
+ * \_Eval[[overwritten[KEYWORD] AS date#238]]
+ * \_Limit[1000[INTEGER],false,false]
+ * \_LocalRelation[[message{r}#227],Page{blocks=[BytesRefVectorBlock[vector=ConstantBytesRefVector[positions=1, value=[32 30 32 33
+ * 2d 30 31 2d 32 33 54 31 32 3a 31 35 3a 30 30 5a 20 31 39 32 2e 31 36 38 2e 31 2e 31 20 75 73 65 72 40 65 78 61 6d 70 6c 65 2e 63 6f
+ * 6d 20 34 32]]]]}]
+ * }
+ */
+ public void testPruneGrokFieldsShadowedByEval() {
+ var plan = plan("""
+ ROW message = "2023-01-23T12:15:00Z 192.168.1.1 user@example.com 42"
+ | GROK message "%{TIMESTAMP_ISO8601:date} %{IP:ip} %{EMAILADDRESS:email} %{NUMBER:num:int}"
+ | EVAL date = "overwritten", ip = "overwritten", email = "overwritten", num = 0
+ | KEEP message, date
+ """);
+
+ var project = as(plan, Project.class);
+ assertThat(project.projections(), hasSize(2));
+ assertThat(Expressions.names(project.projections()), contains("message", "date"));
+ var eval = as(project.child(), Eval.class);
+ assertThat(eval.fields(), hasSize(1));
+ assertThat(Expressions.names(eval.fields()), contains("date"));
+ var limit = as(eval.child(), Limit.class);
+ // Grok is completely removed since all its fields are overwritten by EVAL
+ as(limit.child(), LocalRelation.class);
+ }
+
+ /**
+ * Expects
+ *
+ * {@code
+ * Project[[first_name{f}#117 AS extracted_first#113, extracted_last{r}#109]]
+ * \_Dissect[full_name{r}#107,Parser[pattern=%{extracted_first} %{extracted_last}, appendSeparator=, parser=org.elasticsearc
+ * h.dissect.DissectParser@4eafab7f],[extracted_first{r}#108, extracted_last{r}#109]]
+ * \_Eval[[CONCAT(first_name{f}#117, [KEYWORD],last_name{f}#120) AS full_name#107]]
+ * \_Limit[1000[INTEGER],false,false]
+ * \_EsRelation[test][_meta_field{f}#122, emp_no{f}#116, first_name{f}#11..]
+ * }
+ */
+ public void testCannotPartiallyPruneDissectFieldShadowedByRename() {
+ var plan = plan("""
+ FROM test
+ | EVAL full_name = CONCAT(first_name, " ", last_name)
+ | DISSECT full_name "%{extracted_first} %{extracted_last}"
+ | RENAME first_name AS extracted_first
+ | KEEP extracted_first, extracted_last
+ """);
+
+ var project = as(plan, Project.class);
+ assertThat(project.projections(), hasSize(2));
+ assertThat(Expressions.names(project.projections()), is(List.of("extracted_first", "extracted_last")));
+ // partial pruning is not supported, both extracted fields remain; extracted_first from dissect is shadowed by RENAME
+ var dissect = as(project.child(), Dissect.class);
+ assertThat(dissect.extractedFields(), hasSize(2));
+ assertThat(Expressions.names(dissect.extractedFields()), is(List.of("extracted_first", "extracted_last")));
+ var eval = as(dissect.child(), Eval.class);
+ assertThat(Expressions.names(eval.fields()), is(List.of("full_name")));
+ var limit = as(eval.child(), Limit.class);
+ as(limit.child(), EsRelation.class);
+ }
+
+ /**
+ * Expects
+ *
+ * {@code
+ * Project[[emp_no{f}#79, avg_salary{r}#74]]
+ * \_TopN[[Order[emp_no{f}#79,ASC,LAST]],5[INTEGER],false]
+ * \_InlineJoin[LEFT,[extracted_last{r}#70],[extracted_last{r}#70]]
+ * |_Dissect[full_name{r}#68,Parser[pattern=%{extracted_first} %{extracted_last}, appendSeparator=, parser=org.elasticsearch
+ * .dissect.DissectParser@4d4b3df6],[extracted_first{r}#69, extracted_last{r}#70]]
+ * | \_Eval[[CONCAT(first_name{f}#80, [KEYWORD],last_name{f}#83) AS full_name#68]]
+ * | \_EsRelation[employees][_meta_field{f}#85, emp_no{f}#79, first_name{f}#80, ..]
+ * \_Project[[avg_salary{r}#74, extracted_last{r}#70]]
+ * \_Eval[[$$SUM$avg_salary$0{r$}#90 / $$COUNT$avg_salary$1{r$}#91 AS avg_salary#74]]
+ * \_Aggregate[[extracted_last{r}#70],[SUM(salary{f}#84,true[BOOLEAN],PT0S[TIME_DURATION],compensated[KEYWORD]) AS $$SUM$avg_s
+ * alary$0#90, COUNT(salary{f}#84,true[BOOLEAN],PT0S[TIME_DURATION]) AS $$COUNT$avg_salary$1#91, extracted_last{r}#70]]
+ * \_StubRelation[[_meta_field{f}#85, emp_no{f}#79, first_name{f}#80, gender{f}#81, hire_date{f}#86, job{f}#87, job.raw{f}#88,
+ * languages{f}#82, last_name{f}#83, long_noidx{f}#89, salary{f}#84, full_name{r}#68, extracted_first{r}#69, extracted_last{r}#70]]
+ * }
+ */
+ public void testCannotPartiallyPruneDissectWithInlineStats() {
+ var query = """
+ FROM employees
+ | EVAL full_name = CONCAT(first_name, " ", last_name)
+ | DISSECT full_name "%{extracted_first} %{extracted_last}"
+ | INLINE STATS avg_salary = AVG(salary) BY extracted_last
+ | KEEP emp_no, avg_salary
+ | SORT emp_no
+ | LIMIT 5
+ """;
+ if (releaseBuildForInlineStats(query)) {
+ return;
+ }
+ var plan = optimizedPlan(query);
+
+ var project = as(plan, Project.class);
+ assertThat(project.projections(), hasSize(2));
+ assertThat(Expressions.names(project.projections()), is(List.of("emp_no", "avg_salary")));
+ var topN = as(project.child(), TopN.class);
+ var inlineJoin = as(topN.child(), InlineJoin.class);
+ // Left side: partial pruning is not supported, both extracted fields remain
+ var dissect = as(inlineJoin.left(), Dissect.class);
+ assertThat(dissect.extractedFields(), hasSize(2));
+ assertThat(Expressions.names(dissect.extractedFields()), is(List.of("extracted_first", "extracted_last")));
+ var eval = as(dissect.child(), Eval.class);
+ as(eval.child(), EsRelation.class);
+ // Right side: aggregation
+ var rightProject = as(inlineJoin.right(), Project.class);
+ assertThat(Expressions.names(rightProject.projections()), is(List.of("avg_salary", "extracted_last")));
+ }
+
+ /**
+ * Expects
+ *
+ * {@code
+ * Project[[emp_no{f}#63, emp_no{f}#63 AS avg_salary#59]]
+ * \_TopN[[Order[emp_no{f}#63,ASC,LAST]],5[INTEGER],false]
+ * \_EsRelation[employees][_meta_field{f}#69, emp_no{f}#63, first_name{f}#64, ..]
+ * }
+ */
+ public void testPruneGrokWithInlineStatsPrunedEntirely() {
+ var query = """
+ FROM employees
+ | EVAL full_name = CONCAT(first_name, " ", last_name)
+ | GROK full_name "%{WORD:extracted_first} %{WORD:extracted_last}"
+ | INLINE STATS avg_salary = AVG(salary) BY extracted_last
+ | EVAL avg_salary = emp_no
+ | KEEP emp_no, avg_salary
+ | SORT emp_no
+ | LIMIT 5
+ """;
+ if (releaseBuildForInlineStats(query)) {
+ return;
+ }
+ var plan = optimizedPlan(query);
+
+ var project = as(plan, Project.class);
+ assertThat(project.projections(), hasSize(2));
+ assertThat(Expressions.names(project.projections()), is(List.of("emp_no", "avg_salary")));
+ // Both inline stats and grok are fully pruned since avg_salary is overwritten with emp_no
+ var topN = as(project.child(), TopN.class);
+ as(topN.child(), EsRelation.class);
+ }
+
+ /**
+ * Expects
+ *
+ * {@code
+ * Project[[emp_no{f}#108, language_name{f}#120]]
+ * \_Limit[1000[INTEGER],true,false]
+ * \_Join[LEFT,[languages{f}#111],[language_code{f}#119],null]
+ * |_Limit[1000[INTEGER],false,false]
+ * | \_EsRelation[test][_meta_field{f}#114, emp_no{f}#108, first_name{f}#10..]
+ * \_EsRelation[languages_lookup][LOOKUP][language_code{f}#119, language_name{f}#120]
+ * }
+ */
+ public void testPruneDissectWithLookupJoin() {
+ var plan = plan("""
+ FROM test
+ | EVAL language_code = languages
+ | EVAL full_name = CONCAT(first_name, " ", last_name)
+ | DISSECT full_name "%{extracted_first} %{extracted_last}"
+ | LOOKUP JOIN languages_lookup ON language_code
+ | DROP extracted_first, extracted_last, full_name
+ | KEEP emp_no, language_name
+ """);
+
+ var project = as(plan, Project.class);
+ assertThat(project.projections(), hasSize(2));
+ assertThat(Expressions.names(project.projections()), contains("emp_no", "language_name"));
+ var limit = as(project.child(), Limit.class);
+ var join = as(limit.child(), Join.class);
+ // Dissect is fully pruned since extracted_first and extracted_last are DROPped
+ var leftLimit = as(join.left(), Limit.class);
+ as(leftLimit.child(), EsRelation.class);
+ as(join.right(), EsRelation.class);
+ }
+
+ /**
+ * Expects
+ *
+ * {@code
+ * Project[[emp_no{f}#31, language_name{f}#43]]
+ * \_Limit[1000[INTEGER],true,false]
+ * \_Join[LEFT,[languages{f}#34],[language_code{f}#42],null]
+ * |_Limit[1000[INTEGER],false,false]
+ * | \_EsRelation[test][_meta_field{f}#37, emp_no{f}#31, first_name{f}#32, ..]
+ * \_EsRelation[languages_lookup][LOOKUP][language_code{f}#42, language_name{f}#43]
+ * }
+ */
+ public void testPruneGrokWithLookupJoin() {
+ var plan = plan("""
+ FROM test
+ | EVAL language_code = languages
+ | EVAL full_name = CONCAT(first_name, " ", last_name)
+ | GROK full_name "%{WORD:extracted_first} %{WORD:extracted_last}"
+ | LOOKUP JOIN languages_lookup ON language_code
+ | DROP extracted_first, extracted_last, full_name
+ | KEEP emp_no, language_name
+ """);
+
+ var project = as(plan, Project.class);
+ assertThat(project.projections(), hasSize(2));
+ assertThat(Expressions.names(project.projections()), contains("emp_no", "language_name"));
+ var limit = as(project.child(), Limit.class);
+ var join = as(limit.child(), Join.class);
+ // Grok is fully pruned since extracted_first and extracted_last are DROPped
+ var leftLimit = as(join.left(), Limit.class);
+ as(leftLimit.child(), EsRelation.class);
+ as(join.right(), EsRelation.class);
+ }
+
+ /**
+ * Expects
+ *
+ * {@code
+ * Project[[message{r}#122]]
+ * \_Limit[1000[INTEGER],false,false]
+ * \_LocalRelation[[message{r}#122],Page{blocks=[BytesRefVectorBlock[vector=ConstantBytesRefVector[positions=1, value=[32 30 32 33
+ * 2d 30 31 2d 32 33 20 65 72 72 6f 72 20 31 39 32 2e 31 36 38 2e 31 2e 31]]]]}]
+ * }
+ */
+ public void testPruneChainedDissectAndGrok() {
+ var plan = plan("""
+ ROW message = "2023-01-23 error 192.168.1.1"
+ | DISSECT message "%{date} %{rest}"
+ | GROK rest "%{WORD:level} %{IP:ip}"
+ | KEEP message
+ """);
+
+ var project = as(plan, Project.class);
+ assertThat(project.projections(), hasSize(1));
+ assertThat(Expressions.names(project.projections()), contains("message"));
+ var limit = as(project.child(), Limit.class);
+ // Both dissect and grok are removed
+ as(limit.child(), LocalRelation.class);
+ }
+
+ /**
+ * Expects
+ *
+ * {@code
+ * Project[[level{r}#12]]
+ * \_Grok[rest{r}#6,Parser[pattern=%{WORD:level} %{IP:ip}, grok=org.elasticsearch.grok.Grok@883ecd6],[ip{r}#11, level{r}#1
+ * 2]]
+ * \_Dissect[message{r}#4,Parser[pattern=%{date} %{rest}, appendSeparator=, parser=org.elasticsearch.dissect.DissectParser@4
+ * 814ecff],[date{r}#5, rest{r}#6]]
+ * \_Limit[1000[INTEGER],false,false]
+ * \_LocalRelation[[message{r}#4],Page{blocks=[BytesRefVectorBlock[vector=ConstantBytesRefVector[positions=1, value=[32 30 32 33
+ * 2d 30 31 2d 32 33 20 65 72 72 6f 72 20 31 39 32 2e 31 36 38 2e 31 2e 31]]]]}]
+ * }
+ */
+ public void testCannotPartiallyPruneChainedDissectAndGrok() {
+ var plan = plan("""
+ ROW message = "2023-01-23 error 192.168.1.1"
+ | DISSECT message "%{date} %{rest}"
+ | GROK rest "%{WORD:level} %{IP:ip}"
+ | KEEP level
+ """);
+
+ var project = as(plan, Project.class);
+ assertThat(project.projections(), hasSize(1));
+ assertThat(Expressions.names(project.projections()), is(List.of("level")));
+ var grok = as(project.child(), Grok.class);
+ // partial pruning is not supported, all fields remain
+ assertThat(grok.extractedFields(), hasSize(2));
+ assertThat(Expressions.names(grok.extractedFields()), is(List.of("ip", "level")));
+ var dissect = as(grok.child(), Dissect.class);
+ // partial pruning is not supported, all fields remain
+ assertThat(dissect.extractedFields(), hasSize(2));
+ assertThat(Expressions.names(dissect.extractedFields()), is(List.of("date", "rest")));
+ var limit = as(dissect.child(), Limit.class);
+ as(limit.child(), LocalRelation.class);
+ }
+
+ /**
+ * Expects
+ *
+ * {@code
+ * Limit[1000[INTEGER],false,false]
+ * \_Aggregate[[extracted_first{r}#163],[COUNT(*[KEYWORD],true[BOOLEAN],PT0S[TIME_DURATION]) AS count#168, extracted_first{r}#
+ * 163]]
+ * \_Eval[[constant[KEYWORD] AS extracted_first#163]]
+ * \_EsRelation[test][_meta_field{f}#175, emp_no{f}#169, first_name{f}#17..]
+ * }
+ */
+ public void testPruneDissectFieldRedefinedBeforeStats() {
+ var plan = plan("""
+ FROM test
+ | EVAL full_name = CONCAT(first_name, " ", last_name)
+ | DISSECT full_name "%{extracted_first} %{extracted_last}"
+ | EVAL extracted_first = "constant", extracted_last = "constant"
+ | STATS count = COUNT(*) BY extracted_first
+ """);
+
+ var limit = as(plan, Limit.class);
+ var agg = as(limit.child(), Aggregate.class);
+ assertThat(Expressions.names(agg.aggregates()), contains("count", "extracted_first"));
+ // Dissect is fully pruned since extracted_first/extracted_last are overwritten by EVAL
+ var eval = as(agg.child(), Eval.class);
+ assertThat(Expressions.names(eval.fields()), contains("extracted_first"));
+ // Dissect is fully pruned — next node is EsRelation, no Dissect in the tree
+ as(eval.child(), EsRelation.class);
+ }
+
+ /**
+ * Expects
+ *
+ * {@code
+ * Project[[message{r}#128, ip{r}#136]]
+ * \_Limit[1000[INTEGER],false,false]
+ * \_Filter[ip{r}#136 == 192.168.1.1[KEYWORD]]
+ * \_Grok[message{r}#128,Parser[pattern=%{TIMESTAMP_ISO8601:date} %{IP:ip} %{EMAILADDRESS:email} %{NUMBER:num:int}, grok=
+ * org.elasticsearch.grok.Grok@7c72a4af],[date{r}#134, email{r}#135, ip{r}#136, num{r}#137]]
+ * \_LocalRelation[[message{r}#128],Page{blocks=[BytesRefVectorBlock[vector=ConstantBytesRefVector[positions=1, value=[32 30 32 33
+ * 2d 30 31 2d 32 33 54 31 32 3a 31 35 3a 30 30 5a 20 31 39 32 2e 31 36 38 2e 31 2e 31 20 75 73 65 72 40 65 78 61 6d 70 6c 65 2e 63 6f
+ * 6d 20 34 32]]]]}]
+ * }
+ */
+ public void testCannotPartiallyPruneGrokFieldsUsedInFilter() {
+ var plan = plan("""
+ ROW message = "2023-01-23T12:15:00Z 192.168.1.1 user@example.com 42"
+ | GROK message "%{TIMESTAMP_ISO8601:date} %{IP:ip} %{EMAILADDRESS:email} %{NUMBER:num:int}"
+ | WHERE ip == "192.168.1.1"
+ | KEEP message, ip
+ """);
+
+ var project = as(plan, Project.class);
+ assertThat(project.projections(), hasSize(2));
+ assertThat(Expressions.names(project.projections()), is(List.of("message", "ip")));
+ var limit = as(project.child(), Limit.class);
+ var filter = as(limit.child(), Filter.class);
+ var grok = as(filter.child(), Grok.class);
+ // partial pruning is not supported, all fields remain even though only 'ip' is used
+ assertThat(grok.extractedFields(), hasSize(4));
+ assertThat(Expressions.names(grok.extractedFields()), is(List.of("date", "email", "ip", "num")));
+ as(grok.child(), LocalRelation.class);
+ }
+
+ /**
+ * Expects
+ *
+ * {@code
+ * Project[[combined{r}#11]]
+ * \_Eval[[CONCAT(a{r}#5,-[KEYWORD],b{r}#6) AS combined#11]]
+ * \_Dissect[message{r}#4,Parser[pattern=%{a} %{b}, appendSeparator=, parser=org.elasticsearch.dissect.DissectParser@63bbe42
+ * c],[a{r}#5, b{r}#6]]
+ * \_Limit[1000[INTEGER],false,false]
+ * \_LocalRelation[[message{r}#4],Page{blocks=[BytesRefVectorBlock[vector=ConstantBytesRefVector[positions=1, value=[68 65 6c 6c 6
+ * f 20 77 6f 72 6c 64]]]]}]
+ * }
+ */
+ public void testNoPruneDissectFieldsUsedInEvalExpression() {
+ var plan = plan("""
+ ROW message = "hello world"
+ | DISSECT message "%{a} %{b}"
+ | EVAL combined = CONCAT(a, "-", b)
+ | KEEP combined
+ """);
+
+ var project = as(plan, Project.class);
+ assertThat(project.projections(), hasSize(1));
+ assertThat(Expressions.names(project.projections()), contains("combined"));
+ var eval = as(project.child(), Eval.class);
+ var dissect = as(eval.child(), Dissect.class);
+ // Both a and b survive because they're referenced in the EVAL expression
+ assertThat(dissect.extractedFields(), hasSize(2));
+ assertThat(
+ dissect.extractedFields(),
+ containsIgnoringIds(new ReferenceAttribute(EMPTY, "a", KEYWORD), new ReferenceAttribute(EMPTY, "b", KEYWORD))
+ );
+ var limit = as(dissect.child(), Limit.class);
+ as(limit.child(), LocalRelation.class);
+ }
+
}