Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/131204.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 131204
summary: Allow pruning columns added by `InlineJoin`
area: ES|QL
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -671,6 +671,21 @@ emp_no:integer
10100
;

shadowEntireInlinestats
required_capability: inlinestats_v8

FROM employees
| INLINESTATS x = avg(salary), y = min(salary) BY emp_no
| EVAL x = emp_no, y = x
| SORT x
| KEEP x, y, emp_no
| LIMIT 1
;

x:integer |y:integer |emp_no:integer
10001 |10001 |10001
;

byConstant
required_capability: inlinestats_v8

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import org.elasticsearch.xpack.esql.core.expression.Attribute;
import org.elasticsearch.xpack.esql.core.expression.AttributeSet;
import org.elasticsearch.xpack.esql.core.expression.Expressions;
import org.elasticsearch.xpack.esql.core.expression.FieldAttribute;
import org.elasticsearch.xpack.esql.core.expression.NamedExpression;
import org.elasticsearch.xpack.esql.core.util.Holder;
import org.elasticsearch.xpack.esql.plan.logical.Aggregate;
Expand All @@ -22,7 +23,11 @@
import org.elasticsearch.xpack.esql.plan.logical.Fork;
import org.elasticsearch.xpack.esql.plan.logical.Limit;
import org.elasticsearch.xpack.esql.plan.logical.LogicalPlan;
import org.elasticsearch.xpack.esql.plan.logical.Project;
import org.elasticsearch.xpack.esql.plan.logical.Sample;
import org.elasticsearch.xpack.esql.plan.logical.join.InlineJoin;
import org.elasticsearch.xpack.esql.plan.logical.join.StubRelation;
import org.elasticsearch.xpack.esql.plan.logical.local.EmptyLocalSupplier;
import org.elasticsearch.xpack.esql.plan.logical.local.LocalRelation;
import org.elasticsearch.xpack.esql.plan.logical.local.LocalSupplier;
import org.elasticsearch.xpack.esql.planner.PlannerUtils;
Expand All @@ -38,11 +43,10 @@ public final class PruneColumns extends Rule<LogicalPlan, LogicalPlan> {

@Override
public LogicalPlan apply(LogicalPlan plan) {
// track used references
var used = plan.outputSet().asBuilder();
// track inlinestats' own aggregation output (right-hand side of the join) so that any other plan on the left-hand side of the
// inline join won't have its columns pruned due to the lack of "visibility" into the right hand side output/Attributes
var inlineJoinRightOutput = new ArrayList<Attribute>();
return pruneColumns(plan, plan.outputSet().asBuilder(), false);
Copy link
Contributor

@astefan astefan Jul 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a harder to follow code, but with some comments it can be made easier to digest.
For one, I would add two methods; one LogicalPlan pruneColumns(LogicalPlan plan, AttributeSet.Builder used) and use it here. This first method would call pruneColumns(plan, plan.outputSet().asBuilder(), false).
The second method would be pruneInlineJoinColumns(LogicalPlan plan, AttributeSet.Builder used) and call pruneColumns(plan, plan.outputSet().asBuilder(), true).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I liked the uniformity of the previous namings, but I realise that it doesn't make the flow obvious at call sites, so I've renamed the methods to make it clearer what happens with InlineJoin instances. But happy to iterate if still unclear.

}

private static LogicalPlan pruneColumns(LogicalPlan plan, AttributeSet.Builder used, boolean inlineJoin) {
Holder<Boolean> forkPresent = new Holder<>(false);

// while going top-to-bottom (upstream)
Expand All @@ -54,8 +58,9 @@ public LogicalPlan apply(LogicalPlan plan) {
// same index fields will have different name ids in the left and right hand sides - as in the extreme example
// `FROM lookup_idx | LOOKUP JOIN lookup_idx ON key_field`.

// skip nodes that simply pass the input through
if (p instanceof Limit) {
// TODO: revisit with every new command
// skip nodes that simply pass the input through and use no references
if (p instanceof Limit || p instanceof Sample) {
return p;
}

Expand All @@ -67,77 +72,20 @@ public LogicalPlan apply(LogicalPlan plan) {
return p;
}

// TODO: INLINESTATS unit testing for tracking this set
if (p instanceof InlineJoin ij) {
inlineJoinRightOutput.addAll(ij.right().outputSet());
}

// remember used
boolean recheck;
var recheck = new Holder<Boolean>();
// analyze the unused items against dedicated 'producer' nodes such as Eval and Aggregate
// perform a loop to retry checking if the current node is completely eliminated
do {
recheck = false;
if (p instanceof Aggregate aggregate) {
// TODO: INLINESTATS https://github.com/elastic/elasticsearch/pull/128917#discussion_r2175162099
var remaining = removeUnused(aggregate.aggregates(), used, inlineJoinRightOutput);

if (remaining != null) {
if (remaining.isEmpty()) {
// We still need to have a plan that produces 1 row per group.
if (aggregate.groupings().isEmpty()) {
p = new LocalRelation(
aggregate.source(),
List.of(Expressions.attribute(aggregate.aggregates().getFirst())),
LocalSupplier.of(
new Block[] { BlockUtils.constantBlock(PlannerUtils.NON_BREAKING_BLOCK_FACTORY, null, 1) }
)
);
} else {
// Aggs cannot produce pages with 0 columns, so retain one grouping.
Attribute attribute = Expressions.attribute(aggregate.groupings().getFirst());
NamedExpression firstAggregate = aggregate.aggregates().getFirst();
remaining = List.of(
new Alias(firstAggregate.source(), firstAggregate.name(), attribute, firstAggregate.id())
);
p = aggregate.with(aggregate.groupings(), remaining);
}
} else {
p = aggregate.with(aggregate.groupings(), remaining);
}
}
} else if (p instanceof InlineJoin ij) {// TODO: InlineStats - add unit tests for this IJ removal
var remaining = removeUnused(ij.right().output(), used, inlineJoinRightOutput);
if (remaining != null) {
if (remaining.isEmpty()) {
// remove the InlineJoin altogether
p = ij.left();
recheck = true;
}
// TODO: InlineStats - prune ONLY the unused output columns from it? In other words, don't perform more aggs
// if they will not be used anyway
}
} else if (p instanceof Eval eval) {
var remaining = removeUnused(eval.fields(), used, inlineJoinRightOutput);
// no fields, no eval
if (remaining != null) {
if (remaining.isEmpty()) {
p = eval.child();
recheck = true;
} else {
p = new Eval(eval.source(), eval.child(), remaining);
}
}
} else if (p instanceof EsRelation esr && esr.indexMode() == IndexMode.LOOKUP) {
// Normally, pruning EsRelation has no effect because InsertFieldExtraction only extracts the required fields, anyway.
// However, InsertFieldExtraction can't be currently used in LOOKUP JOIN right index,
// it works differently as we extract all fields (other than the join key) that the EsRelation has.
var remaining = removeUnused(esr.output(), used, inlineJoinRightOutput);
if (remaining != null) {
p = new EsRelation(esr.source(), esr.indexPattern(), esr.indexMode(), esr.indexNameWithModes(), remaining);
}
}
} while (recheck);
recheck.set(false);
p = switch (p) {
case Aggregate agg -> pruneColumns(agg, used, inlineJoin);
case InlineJoin inj -> pruneColumns(inj, used, recheck);
case Eval eval -> pruneColumns(eval, used, recheck);
case Project project -> inlineJoin ? pruneColumns(project, used) : p;
case EsRelation esr -> pruneColumns(esr, used);
default -> p;
};
} while (recheck.get());

used.addAll(p.references());

Expand All @@ -148,24 +96,143 @@ public LogicalPlan apply(LogicalPlan plan) {
return pl;
}

private static LogicalPlan pruneColumns(Aggregate aggregate, AttributeSet.Builder used, boolean inlineJoin) {
LogicalPlan p = aggregate;

var remaining = pruneUnusedAndAddReferences(aggregate.aggregates(), used);

if (remaining != null) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if you checked here if (remaining == null) return p;? to have a smaller if branch?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I typically prefer this style (you suggest) too. But I think generally the code base uses the one I used. However, I applied your suggestion.

if (remaining.isEmpty()) {
if (inlineJoin) {
p = emptyLocalRelation(aggregate);
} else if (aggregate.groupings().isEmpty()) {
// We still need to have a plan that produces 1 row per group.
p = new LocalRelation(
aggregate.source(),
List.of(Expressions.attribute(aggregate.aggregates().getFirst())),
LocalSupplier.of(new Block[] { BlockUtils.constantBlock(PlannerUtils.NON_BREAKING_BLOCK_FACTORY, null, 1) })
);
} else {
// Aggs cannot produce pages with 0 columns, so retain one grouping.
Attribute attribute = Expressions.attribute(aggregate.groupings().getFirst());
NamedExpression firstAggregate = aggregate.aggregates().getFirst();
remaining = List.of(new Alias(firstAggregate.source(), firstAggregate.name(), attribute, firstAggregate.id()));
p = aggregate.with(aggregate.groupings(), remaining);
}
} else {
if (inlineJoin && aggregate.groupings().containsAll(remaining)) { // not expecting high groups cardinality
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand this comment tbh // not expecting high groups cardinality

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The editor suggests the use of HashSets for faster inclusion check. But I think that'd be more costly, for most of the cases, since typically queries won't aggregate by hundred of groups.

// It's an INLINEJOIN and all remaining attributes are groupings, which are already part of the IJ output (from the
// left-hand side).
if (aggregate.child() instanceof StubRelation stub) {
Copy link
Contributor

@astefan astefan Jul 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this condition will need an update when we start supporting filters on inlinestats aggs... if that guess is valid, I think a TODO is in order here. If you will add that TODO, for some (all, most I hope) I added // TODO Inlinestats:.... to find them quicker later.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the TODO (as used in EsqlSession too).

var message = "Aggregate groups references ["
+ remaining
+ "] not in child's (StubRelation) output: ["
+ stub.outputSet()
+ "]";
assert stub.outputSet().containsAll(Expressions.asAttributes(remaining)) : message;

p = emptyLocalRelation(aggregate);
} else {
// There are no aggregates to compute, just output the groupings; these are already in the IJ output, so only
// restrict the output to what remained.
p = new Project(aggregate.source(), aggregate.child(), remaining);
}
} else { // not an INLINEJOIN or there are actually aggregates to compute
p = aggregate.with(aggregate.groupings(), remaining);
}
}
}

return p;
}

private static LogicalPlan pruneColumns(InlineJoin ij, AttributeSet.Builder used, Holder<Boolean> recheck) {
LogicalPlan p = ij;

used.addAll(ij.references());
var right = pruneColumns(ij.right(), used, true);
if (right.output().isEmpty()) {
p = ij.left();
recheck.set(true);
} else if (right != ij.right()) {
// if the right side has been updated, replace it
p = ij.replaceRight(right);
}

return p;
}

private static LogicalPlan pruneColumns(Eval eval, AttributeSet.Builder used, Holder<Boolean> recheck) {
LogicalPlan p = eval;

var remaining = pruneUnusedAndAddReferences(eval.fields(), used);
// no fields, no eval
if (remaining != null) {
if (remaining.isEmpty()) {
p = eval.child();
recheck.set(true);
} else {
p = new Eval(eval.source(), eval.child(), remaining);
}
}

return p;
}

private static LogicalPlan pruneColumns(Project project, AttributeSet.Builder used) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is the meat of the PR, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Handling Project is part of it, but the main change is calling (the now only method called) pruneColumns() within the (now renamed) pruneColumnsInInlineJoin() and using updated used set, since INLINEJOIN can be followed by any other command and each of these will need to have columns inspected for potential pruning, but aware of the fact that the pruning is done following INLINEJOIN; this "awareness" allows dropping the INLINEJOIN node when its output is entirely contained within the left-hand side of the join already.

LogicalPlan p = project;

var remaining = pruneUnusedAndAddReferences(project.projections(), used);
if (remaining != null) {
p = remaining.isEmpty() || remaining.stream().allMatch(FieldAttribute.class::isInstance)
? emptyLocalRelation(project)
: new Project(project.source(), project.child(), remaining);
} else if (project.output().stream().allMatch(FieldAttribute.class::isInstance)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we interested in this aspect? (the fact that the projection is made of fields only)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The explanation is the one given in the previous comment -- I've added a code comment too.

p = emptyLocalRelation(project);
}

return p;
}

private static LogicalPlan pruneColumns(EsRelation esr, AttributeSet.Builder used) {
LogicalPlan p = esr;

if (esr.indexMode() == IndexMode.LOOKUP) {
// Normally, pruning EsRelation has no effect because InsertFieldExtraction only extracts the required fields, anyway.
// However, InsertFieldExtraction can't be currently used in LOOKUP JOIN right index,
// it works differently as we extract all fields (other than the join key) that the EsRelation has.
var remaining = pruneUnusedAndAddReferences(esr.output(), used);
if (remaining != null) {
p = new EsRelation(esr.source(), esr.indexPattern(), esr.indexMode(), esr.indexNameWithModes(), remaining);
}
}

return p;
}

private static LogicalPlan emptyLocalRelation(LogicalPlan plan) {
// create an empty local relation with no attributes
return new LocalRelation(plan.source(), List.of(), EmptyLocalSupplier.EMPTY);
}

/**
* Prunes attributes from the list not found in the given set.
* Returns null if no changed occurred.
* Prunes attributes from the `named` list that are not found in the given set (builder).
* Returns null if no pruning occurred.
* As a side effect, the references of the kept attributes are added to the input set (builder) -- irrespective of the return value.
*/
private static <N extends NamedExpression> List<N> removeUnused(List<N> named, AttributeSet.Builder used, List<Attribute> exceptions) {
private static <N extends NamedExpression> List<N> pruneUnusedAndAddReferences(List<N> named, AttributeSet.Builder used) {
var clone = new ArrayList<>(named);
var it = clone.listIterator(clone.size());

// due to Eval, go in reverse
while (it.hasPrevious()) {
for (var it = clone.listIterator(clone.size()); it.hasPrevious();) {
N prev = it.previous();
var attr = prev.toAttribute();
if (used.contains(attr) == false && exceptions.contains(attr) == false) {
it.remove();
} else {
if (used.contains(attr)) {
used.addAll(prev.references());
} else {
it.remove();
}
}

return clone.size() != named.size() ? clone : null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ public static void init() {
EsqlTestUtils.TEST_CFG,
new EsqlFunctionRegistry(),
getIndexResultAirports,
defaultLookupResolution(),
enrichResolution,
emptyInferenceResolution()
),
Expand Down
Loading
Loading