Skip to content

Commit

Permalink
Remove filter rewrite optimization for range aggregations when segmen…
Browse files Browse the repository at this point in the history
…t is not effective match all (opensearch-project#15194)

---------

Signed-off-by: Finn Carroll <[email protected]>
(cherry picked from commit ef87b39)
  • Loading branch information
finnegancarroll committed Aug 16, 2024
1 parent 18fe3ac commit 01580da
Show file tree
Hide file tree
Showing 8 changed files with 657 additions and 8 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

### Fixed

- Fix range aggregation optimization ignoring top level queries ([#15194](https://github.com/opensearch-project/OpenSearch/pull/15194))

### Security

[Unreleased 2.x]: https://github.com/opensearch-project/OpenSearch/compare/57cd81da11e5cb831029719f0394e40aff68ced2...2.16
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,94 @@ setup:
- match: { aggregations.histo.buckets.8.doc_count: 1 }
- match: { aggregations.histo.buckets.12.key_as_string: "2016-06-01T00:00:00.000Z" }
- match: { aggregations.histo.buckets.12.doc_count: 1 }

---
# Regression test: a date_histogram aggregation must honor the top-level query.
# Five documents are indexed (three with routing "route1", two with "route2");
# the search keeps only "route1" documents through a bool filter and buckets
# them by calendar week.
"Date histogram aggregation w/ filter query test":
- skip:
version: " - 2.99.99"
reason: Backport fix to 2.16

# Index the fixture documents and refresh so they are searchable immediately.
- do:
bulk:
refresh: true
index: dhisto-agg-w-query
body:
- '{"index": {}}'
- '{"routing": "route1", "date": "2024-08-12", "dow": "monday"}'
- '{"index": {}}'
- '{"routing": "route1", "date": "2024-08-14", "dow": "wednesday"}'
- '{"index": {}}'
- '{"routing": "route1", "date": "2024-08-19", "dow": "monday"}'
- '{"index": {}}'
- '{"routing": "route2", "date": "2024-08-13", "dow": "tuesday"}'
- '{"index": {}}'
- '{"routing": "route2", "date": "2024-08-15", "dow": "thursday"}'

# match_all combined with a terms filter on routing: only the three "route1"
# documents should reach the aggregation.
- do:
search:
index: dhisto-agg-w-query
body:
query:
bool:
must:
match_all: {}
filter:
- terms:
routing:
- "route1"
aggregations:
weekHisto:
date_histogram:
field: date
calendar_interval: week
_source: false

# Expected: 3 hits; the first weekly bucket holds 2024-08-12 and 2024-08-14,
# the second holds 2024-08-19. The "route2" documents must not be counted.
- match: { hits.total.value: 3 }
- match: { aggregations.weekHisto.buckets.0.doc_count: 2 }
- match: { aggregations.weekHisto.buckets.1.doc_count: 1 }

---
# Checks a date_histogram whose field is also the field of the top-level range
# query. Six documents are indexed; the range [2024-01-01, 2025-01-01) selects
# four of them, which are bucketed by calendar month.
# NOTE(review): unlike the sibling tests added in this change, this test has no
# `skip` section — confirm that is intentional.
"Date histogram aggregation w/ shared field range test":
# Index the fixture documents and refresh so they are searchable immediately.
- do:
bulk:
refresh: true
index: dhisto-agg-w-query
body:
- '{"index": {}}'
- '{"date": "2024-10-31"}'
- '{"index": {}}'
- '{"date": "2024-11-11"}'
- '{"index": {}}'
- '{"date": "2024-11-28"}'
- '{"index": {}}'
- '{"date": "2024-12-25"}'
- '{"index": {}}'
- '{"date": "2025-01-01"}'
- '{"index": {}}'
- '{"date": "2025-02-14"}'

# Profiling is enabled so the aggregation's debug counters can be asserted below.
- do:
search:
index: dhisto-agg-w-query
body:
profile: true
query:
range:
date:
gte: "2024-01-01"
lt: "2025-01-01"
aggregations:
monthHisto:
date_histogram:
field: date
calendar_interval: month
_source: false

# Expected: 4 hits (the 2025 documents fall outside the range), bucketed as
# October = 1, November = 2, December = 1.
- match: { hits.total.value: 4 }
- match: { aggregations.monthHisto.buckets.0.doc_count: 1 }
- match: { aggregations.monthHisto.buckets.1.doc_count: 2 }
- match: { aggregations.monthHisto.buckets.2.doc_count: 1 }
# Debug counters reported by the profiler for the first aggregation on shard 0.
# NOTE(review): presumably these show the single segment was handled by the
# filter rewrite path — confirm the exact meaning of each counter.
- match: { profile.shards.0.aggregations.0.debug.optimized_segments: 1 }
- match: { profile.shards.0.aggregations.0.debug.unoptimized_segments: 0 }
- match: { profile.shards.0.aggregations.0.debug.leaf_visited: 0 }
- match: { profile.shards.0.aggregations.0.debug.inner_visited: 0 }
- match: { profile.shards.0.aggregations.0.debug.inner_visited: 0 }
Original file line number Diff line number Diff line change
Expand Up @@ -673,3 +673,82 @@ setup:
- match: { aggregations.my_range.buckets.3.from: 1.5 }
- is_false: aggregations.my_range.buckets.3.to
- match: { aggregations.my_range.buckets.3.doc_count: 2 }

---
# Regression test: range and date_range aggregations must honor the top-level
# query. Five documents are indexed (three with routing "route1", two with
# "route2"); both searches keep only "route1" documents through a bool filter.
"Filter query w/ aggregation test":
- skip:
version: " - 2.99.99"
reason: Backport fix to 2.16

# Index the fixture documents and refresh so they are searchable immediately.
- do:
bulk:
refresh: true
index: range-agg-w-query
body:
- '{"index": {}}'
- '{"routing": "route1", "v": -10, "date": "2024-10-29"}'
- '{"index": {}}'
- '{"routing": "route1", "v": -5, "date": "2024-10-30"}'
- '{"index": {}}'
- '{"routing": "route1", "v": 10, "date": "2024-10-31"}'
- '{"index": {}}'
- '{"routing": "route2", "v": 15, "date": "2024-11-01"}'
- '{"index": {}}'
- '{"routing": "route2", "v": 20, "date": "2024-11-02"}'

# Numeric range aggregation on "v", split at 0, over the filtered documents.
- do:
search:
index: range-agg-w-query
body:
query:
bool:
must:
match_all: {}
filter:
- terms:
routing:
- "route1"
aggregations:
NegPosAgg:
range:
field: v
keyed: true
ranges:
- to: 0
key: "0"
- from: 0
key: "1"
_source: false

# Expected: 3 hits; bucket "0" (v < 0) holds -10 and -5, bucket "1" (v >= 0)
# holds 10. The "route2" values 15 and 20 must not be counted.
- match: { hits.total.value: 3 }
- match: { aggregations.NegPosAgg.buckets.0.doc_count: 2 }
- match: { aggregations.NegPosAgg.buckets.1.doc_count: 1 }

# Same filter, now with a date_range aggregation split at 2024-11-01.
# NOTE(review): the key "from-september" labels the range that starts on
# 2024-11-01; the name looks like a misnomer (November) — confirm before renaming.
- do:
search:
index: range-agg-w-query
body:
query:
bool:
must:
match_all: {}
filter:
- terms:
routing:
- "route1"
aggregations:
HalloweenAgg:
date_range:
field: date
format: "yyyy-MM-dd"
keyed: true
ranges:
- to: "2024-11-01"
key: "to-october"
- from: "2024-11-01"
key: "from-september"
_source: false

# Expected: all three "route1" documents are dated before 2024-11-01, so the
# first bucket holds 3 and the second holds 0 (the "route2" documents dated in
# November are excluded by the filter).
- match: { hits.total.value: 3 }
- match: { aggregations.HalloweenAgg.buckets.to-october.doc_count: 3 }
- match: { aggregations.HalloweenAgg.buckets.from-september.doc_count: 0 }
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.search.aggregations.bucket.filterrewrite;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import org.opensearch.index.mapper.MappedFieldType;
import org.opensearch.search.internal.SearchContext;

import java.io.IOException;
import java.util.function.BiConsumer;
import java.util.function.Consumer;

/**
 * Bridge between an aggregator and the filter rewrite optimization context.
 *
 * <p>This abstract class keeps the optimization business logic apart from the aggregator
 * implementation while still giving that logic access to the aggregator data it needs.
 *
 * <p>To optimize an aggregator, extend this class inside this package and place the
 * aggregation-specific optimization logic in that subclass; the aggregator then implements
 * the subclass to supply the data required for the optimization.
 *
 * @opensearch.internal
 */
public abstract class AggregatorBridge {

    /**
     * The field type associated with this aggregator bridge.
     */
    MappedFieldType fieldType;

    /**
     * Callback that receives a {@link Ranges} instance; installed through
     * {@link #setRangesConsumer(Consumer)}.
     */
    Consumer<Ranges> setRanges;

    /**
     * Installs the callback stored in {@link #setRanges}.
     *
     * @param setRanges the consumer to store on this bridge
     */
    void setRangesConsumer(Consumer<Ranges> setRanges) {
        this.setRanges = setRanges;
    }

    /**
     * Reports whether the aggregator is eligible for the optimization.
     *
     * <p>Meant to be implemented by a specific aggregator, which has access to the fields
     * needed to make the decision.
     *
     * @return {@code true} when the aggregator can be optimized, {@code false} otherwise;
     *         the result will be saved in the optimization context
     */
    protected abstract boolean canOptimize();

    /**
     * Shard-level preparation, performed once the aggregator is known to be optimizable;
     * for example, working out which ranges of the aggregation will be optimized later.
     *
     * <p>Meant to be implemented by a specific aggregator, which has access to the fields
     * needed for the preparation.
     *
     * @throws IOException if the preparation fails on I/O
     */
    protected abstract void prepare() throws IOException;

    /**
     * Segment-level preparation, used when the segment is functionally matching all documents.
     *
     * @param leaf the leaf reader context for the segment
     * @return the ranges built for this segment
     *         (NOTE(review): whether {@code null} may be returned is not visible here — confirm
     *         against the implementations)
     * @throws IOException if reading the segment fails
     */
    abstract Ranges tryBuildRangesFromSegment(LeafReaderContext leaf) throws IOException;

    /**
     * Attempts to build the aggregation results for a single segment.
     *
     * @param values            the point values (index structure for numeric values) of the segment
     * @param incrementDocCount callback that increments the document count of a range bucket; the
     *                          existing documentation names the first argument as the document count
     *                          and the second as the bucket key
     *                          (NOTE(review): confirm the argument order against the implementations)
     * @param ranges            the ranges to count documents against
     * @return debug information describing the attempt
     * @throws IOException if reading the point values fails
     */
    abstract FilterRewriteOptimizationContext.DebugInfo tryOptimize(
        PointValues values,
        BiConsumer<Long, Long> incrementDocCount,
        Ranges ranges
    ) throws IOException;

    /**
     * Tells whether the top level query matches every document of the given segment.
     *
     * <p>The query of the search context is rewritten and turned into a non-scoring
     * {@link Weight}; the segment counts as "match all" when the count reported by that weight
     * for the leaf equals the reader's {@code numDocs()}.
     *
     * @param ctx     the search context
     * @param leafCtx the leaf reader context for the segment
     * @return {@code true} if the segment matches all documents, {@code false} otherwise
     * @throws IOException if creating the weight or counting fails
     */
    public static boolean segmentMatchAll(SearchContext ctx, LeafReaderContext leafCtx) throws IOException {
        final Weight topLevelWeight = ctx.query()
            .rewrite(ctx.searcher())
            .createWeight(ctx.searcher(), ScoreMode.COMPLETE_NO_SCORES, 1f);

        // Without a weight there is nothing to count, so the segment cannot be treated as match-all.
        if (topLevelWeight == null) {
            return false;
        }

        final int matchingDocs = topLevelWeight.count(leafCtx);
        final int segmentDocs = leafCtx.reader().numDocs();
        return matchingDocs == segmentDocs;
    }
}
Loading

0 comments on commit 01580da

Please sign in to comment.