diff --git a/CHANGELOG.md b/CHANGELOG.md index c69c928a7ab56..754c745cb6403 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,8 +71,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Refactor the Cache.CacheStats class to use the Builder pattern instead of constructors ([#20015](https://github.com/opensearch-project/OpenSearch/pull/20015)) - Refactor the HttpStats, ScriptStats, AdaptiveSelectionStats and OsStats class to use the Builder pattern instead of constructors ([#20014](https://github.com/opensearch-project/OpenSearch/pull/20014)) - Bump opensearch-protobufs dependency to 0.24.0 and update transport-grpc module compatibility ([#20059](https://github.com/opensearch-project/OpenSearch/pull/20059)) - - Refactor the ShardStats, WarmerStats and IndexingPressureStats class to use the Builder pattern instead of constructors ([#19966](https://github.com/opensearch-project/OpenSearch/pull/19966)) +- Add skiplist optimization to auto_date_histogram aggregation ([#20057](https://github.com/opensearch-project/OpenSearch/pull/20057)) - Throw exceptions for currently unsupported GRPC request-side fields ([#20162](https://github.com/opensearch-project/OpenSearch/pull/20162)) ### Fixed diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/HistogramSkiplistLeafCollector.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/HistogramSkiplistLeafCollector.java index acbc62cb18024..52547a0441425 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/HistogramSkiplistLeafCollector.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/HistogramSkiplistLeafCollector.java @@ -13,25 +13,39 @@ import org.apache.lucene.search.DocIdStream; import org.apache.lucene.search.Scorable; import org.opensearch.common.Rounding; +import org.opensearch.search.aggregations.Aggregator; +import org.opensearch.search.aggregations.AggregatorBase; import org.opensearch.search.aggregations.LeafBucketCollector; +import org.opensearch.search.aggregations.bucket.histogram.LongBounds; import org.opensearch.search.aggregations.bucket.terms.LongKeyedBucketOrds; import java.io.IOException; +import java.util.function.LongFunction; +import java.util.function.Supplier; /** * Histogram collection logic using skip list. * + * Currently, it can only handle one owningBucketOrd at a time. + * * @opensearch.internal */ public class HistogramSkiplistLeafCollector extends LeafBucketCollector { private final NumericDocValues values; private final DocValuesSkipper skipper; - private final Rounding.Prepared preparedRounding; - private final LongKeyedBucketOrds bucketOrds; private final LeafBucketCollector sub; + private final boolean isSubNoOp; private final BucketsAggregator aggregator; + /** + * Supplier function to get the current preparedRounding from the parent aggregator. + * This allows detection of rounding changes in AutoDateHistogramAggregator. + */ + private final LongFunction preparedRoundingSupplier; + private final Supplier bucketOrdsSupplier; + private final IncreaseRoundingIfNeeded increaseRoundingIfNeeded; + /** * Max doc ID (inclusive) up to which all docs values may map to the same * bucket. @@ -48,6 +62,12 @@ public class HistogramSkiplistLeafCollector extends LeafBucketCollector { */ private long upToBucketIndex; + /** + * Tracks the last preparedRounding reference to detect rounding changes. + * Used for cache invalidation when AutoDateHistogramAggregator changes rounding. 
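For reference, a minimal standalone sketch of this invalidation contract, mirroring the check done at the top of collect(int, long) later in this patch; RoundingChangeDetector and PreparedRounding are illustrative stand-ins, not part of the patch:

import java.util.function.LongFunction;

/**
 * Sketch: the parent aggregator may swap in a new Rounding.Prepared mid-collection,
 * so the cached skip-list range must be discarded whenever a different instance is seen.
 */
class RoundingChangeDetector {
    static final class PreparedRounding {}            // stand-in for Rounding.Prepared

    private final LongFunction<PreparedRounding> preparedRoundingSupplier;
    private PreparedRounding lastPreparedRounding;    // last instance observed
    private int upToInclusive = -1;                   // cached range end, -1 = nothing cached
    private boolean upToSameBucket = false;           // whether the cached range collapses to one bucket

    RoundingChangeDetector(LongFunction<PreparedRounding> supplier) {
        this.preparedRoundingSupplier = supplier;
    }

    /** Returns true if the cache had to be invalidated for this owning bucket. */
    boolean refresh(long owningBucketOrd) {
        PreparedRounding current = preparedRoundingSupplier.apply(owningBucketOrd);
        if (current != lastPreparedRounding) {        // reference equality suffices: a rounding change creates a new instance
            upToInclusive = -1;
            upToSameBucket = false;
            lastPreparedRounding = current;
            return true;
        }
        return false;
    }
}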
+ */ + private Rounding.Prepared lastPreparedRounding; + public HistogramSkiplistLeafCollector( NumericDocValues values, DocValuesSkipper skipper, @@ -55,13 +75,30 @@ public HistogramSkiplistLeafCollector( LongKeyedBucketOrds bucketOrds, LeafBucketCollector sub, BucketsAggregator aggregator + ) { + this(values, skipper, (owningBucketOrd) -> preparedRounding, () -> bucketOrds, sub, aggregator, (owningBucketOrd, rounded) -> {}); + } + + /** + * Constructor that accepts a supplier for dynamic rounding (used by AutoDateHistogramAggregator). + */ + public HistogramSkiplistLeafCollector( + NumericDocValues values, + DocValuesSkipper skipper, + LongFunction preparedRoundingSupplier, + Supplier bucketOrdsSupplier, + LeafBucketCollector sub, + BucketsAggregator aggregator, + IncreaseRoundingIfNeeded increaseRoundingIfNeeded ) { this.values = values; this.skipper = skipper; - this.preparedRounding = preparedRounding; - this.bucketOrds = bucketOrds; + this.preparedRoundingSupplier = preparedRoundingSupplier; + this.bucketOrdsSupplier = bucketOrdsSupplier; this.sub = sub; + this.isSubNoOp = (sub == NO_OP_COLLECTOR); this.aggregator = aggregator; + this.increaseRoundingIfNeeded = increaseRoundingIfNeeded; } @Override @@ -87,17 +124,20 @@ private void advanceSkipper(int doc, long owningBucketOrd) throws IOException { upToInclusive = skipper.maxDocID(0); + // Get current rounding from supplier + Rounding.Prepared currentRounding = preparedRoundingSupplier.apply(owningBucketOrd); + // Now find the highest level where all docs map to the same bucket. for (int level = 0; level < skipper.numLevels(); ++level) { int totalDocsAtLevel = skipper.maxDocID(level) - skipper.minDocID(level) + 1; - long minBucket = preparedRounding.round(skipper.minValue(level)); - long maxBucket = preparedRounding.round(skipper.maxValue(level)); + long minBucket = currentRounding.round(skipper.minValue(level)); + long maxBucket = currentRounding.round(skipper.maxValue(level)); if (skipper.docCount(level) == totalDocsAtLevel && minBucket == maxBucket) { // All docs at this level have a value, and all values map to the same bucket. 
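Read the condition above as: a skipper block may be folded into a single bucket only when every doc in the block carries a value and the block's min and max values round to the same key. A standalone sketch of that test, with a simplified block record and day-granularity rounding standing in for DocValuesSkipper and Rounding.Prepared:

import java.util.concurrent.TimeUnit;

/** Sketch of the "whole block maps to one bucket" test from advanceSkipper(). */
final class SkipBlockCheck {
    /** Simplified stand-in for one DocValuesSkipper level. */
    record Block(int minDocID, int maxDocID, int docCount, long minValue, long maxValue) {}

    /** Stand-in rounding: truncate an epoch-millis timestamp to the start of its UTC day. */
    static long roundToDay(long millis) {
        return millis - Math.floorMod(millis, TimeUnit.DAYS.toMillis(1));
    }

    /** True if every doc in the block has a value and all values land in one bucket. */
    static boolean wholeBlockSameBucket(Block block) {
        int totalDocsInBlock = block.maxDocID() - block.minDocID() + 1;
        return block.docCount() == totalDocsInBlock
            && roundToDay(block.minValue()) == roundToDay(block.maxValue());
    }

    public static void main(String[] args) {
        // 128 consecutive docs, all with values, all inside 2020-01-01 -> one bucket, values never read per doc.
        Block dense = new Block(0, 127, 128, 1577836800000L, 1577869200000L);
        // Same doc range but only 120 docs carry a value -> must fall back to per-doc collection.
        Block sparse = new Block(0, 127, 120, 1577836800000L, 1577869200000L);
        System.out.println(wholeBlockSameBucket(dense));   // true
        System.out.println(wholeBlockSameBucket(sparse));  // false
    }
}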
upToInclusive = skipper.maxDocID(level); upToSameBucket = true; - upToBucketIndex = bucketOrds.add(owningBucketOrd, maxBucket); + upToBucketIndex = bucketOrdsSupplier.get().add(owningBucketOrd, maxBucket); if (upToBucketIndex < 0) { upToBucketIndex = -1 - upToBucketIndex; } @@ -109,6 +149,16 @@ private void advanceSkipper(int doc, long owningBucketOrd) throws IOException { @Override public void collect(int doc, long owningBucketOrd) throws IOException { + Rounding.Prepared currentRounding = preparedRoundingSupplier.apply(owningBucketOrd); + + // Check if rounding changed (using reference equality) + // AutoDateHistogramAggregator creates a new Rounding.Prepared instance when rounding changes + if (currentRounding != lastPreparedRounding) { + upToInclusive = -1; // Invalidate + upToSameBucket = false; + lastPreparedRounding = currentRounding; + } + if (doc > upToInclusive) { advanceSkipper(doc, owningBucketOrd); } @@ -118,12 +168,14 @@ public void collect(int doc, long owningBucketOrd) throws IOException { sub.collect(doc, upToBucketIndex); } else if (values.advanceExact(doc)) { final long value = values.longValue(); - long bucketIndex = bucketOrds.add(owningBucketOrd, preparedRounding.round(value)); + long rounded = currentRounding.round(value); + long bucketIndex = bucketOrdsSupplier.get().add(owningBucketOrd, rounded); if (bucketIndex < 0) { bucketIndex = -1 - bucketIndex; aggregator.collectExistingBucket(sub, doc, bucketIndex); } else { aggregator.collectBucket(sub, doc, bucketIndex); + increaseRoundingIfNeeded.accept(owningBucketOrd, rounded); } } } @@ -136,7 +188,6 @@ public void collect(DocIdStream stream) throws IOException { @Override public void collect(DocIdStream stream, long owningBucketOrd) throws IOException { - // This will only be called if its the sub aggregation for (;;) { int upToExclusive = upToInclusive + 1; if (upToExclusive < 0) { // overflow @@ -144,7 +195,7 @@ public void collect(DocIdStream stream, long owningBucketOrd) throws IOException } if (upToSameBucket) { - if (sub == NO_OP_COLLECTOR) { + if (isSubNoOp) { // stream.count maybe faster when we don't need to handle sub-aggs long count = stream.count(upToExclusive); aggregator.incrementBucketDocCount(upToBucketIndex, count); @@ -167,4 +218,30 @@ public void collect(DocIdStream stream, long owningBucketOrd) throws IOException } } } + + /** + * Call back for auto date histogram + * + * @opensearch.internal + */ + public interface IncreaseRoundingIfNeeded { + void accept(long owningBucket, long rounded); + } + + /** + * Skiplist is based as top level agg (null parent) or parent that will execute in sorted order + * + */ + public static boolean canUseSkiplist(LongBounds hardBounds, Aggregator parent, DocValuesSkipper skipper, NumericDocValues singleton) { + if (skipper == null || singleton == null) return false; + // TODO: add hard bounds support + if (hardBounds != null) return false; + + if (parent == null) return true; + + if (parent instanceof AggregatorBase base) { + return base.getLeafCollectorMode() == AggregatorBase.LeafCollectionMode.FILTER_REWRITE; + } + return false; + } } diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/histogram/AutoDateHistogramAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/histogram/AutoDateHistogramAggregator.java index 63951953a2f5d..57134d07d5021 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/histogram/AutoDateHistogramAggregator.java +++ 
b/server/src/main/java/org/opensearch/search/aggregations/bucket/histogram/AutoDateHistogramAggregator.java @@ -31,7 +31,10 @@ package org.opensearch.search.aggregations.bucket.histogram; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.search.DocIdStream; import org.apache.lucene.search.ScoreMode; @@ -52,6 +55,7 @@ import org.opensearch.search.aggregations.LeafBucketCollectorBase; import org.opensearch.search.aggregations.bucket.DeferableBucketAggregator; import org.opensearch.search.aggregations.bucket.DeferringBucketCollector; +import org.opensearch.search.aggregations.bucket.HistogramSkiplistLeafCollector; import org.opensearch.search.aggregations.bucket.MergingBucketsDeferringCollector; import org.opensearch.search.aggregations.bucket.filterrewrite.DateHistogramAggregatorBridge; import org.opensearch.search.aggregations.bucket.filterrewrite.FilterRewriteOptimizationContext; @@ -85,6 +89,7 @@ * @opensearch.internal */ abstract class AutoDateHistogramAggregator extends DeferableBucketAggregator { + static AutoDateHistogramAggregator build( String name, AggregatorFactories factories, @@ -135,6 +140,7 @@ static AutoDateHistogramAggregator build( protected final int targetBuckets; protected int roundingIdx; protected Rounding.Prepared preparedRounding; + private final String fieldName; private final FilterRewriteOptimizationContext filterRewriteOptimizationContext; @@ -157,6 +163,9 @@ private AutoDateHistogramAggregator( this.roundingInfos = roundingInfos; this.roundingPreparer = roundingPreparer; this.preparedRounding = prepareRounding(0); + this.fieldName = (valuesSource instanceof ValuesSource.Numeric.FieldData) + ? 
((ValuesSource.Numeric.FieldData) valuesSource).getIndexFieldName() + : null; DateHistogramAggregatorBridge bridge = new DateHistogramAggregatorBridge() { @Override @@ -243,7 +252,11 @@ public final DeferringBucketCollector getDeferringCollector() { return deferringCollector; } - protected abstract LeafBucketCollector getLeafCollector(SortedNumericDocValues values, LeafBucketCollector sub) throws IOException; + protected abstract LeafBucketCollector getLeafCollector( + SortedNumericDocValues values, + DocValuesSkipper skipper, + LeafBucketCollector sub + ) throws IOException; @Override protected boolean tryPrecomputeAggregationForLeaf(LeafReaderContext ctx) throws IOException { @@ -262,23 +275,12 @@ public final LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBuc } final SortedNumericDocValues values = valuesSource.longValues(ctx); - final LeafBucketCollector iteratingCollector = getLeafCollector(values, sub); - return new LeafBucketCollectorBase(sub, values) { - @Override - public void collect(int doc, long owningBucketOrd) throws IOException { - iteratingCollector.collect(doc, owningBucketOrd); - } - - @Override - public void collect(DocIdStream stream, long owningBucketOrd) throws IOException { - super.collect(stream, owningBucketOrd); - } + DocValuesSkipper skipper = null; + if (this.fieldName != null) { + skipper = ctx.reader().getDocValuesSkipper(this.fieldName); + } - @Override - public void collectRange(int min, int max) throws IOException { - super.collectRange(min, max); - } - }; + return getLeafCollector(values, skipper, sub); } protected final InternalAggregation[] buildAggregations( @@ -370,6 +372,9 @@ private static class FromSingle extends AutoDateHistogramAggregator { private long min = Long.MAX_VALUE; private long max = Long.MIN_VALUE; + // Debug tracking counters for collector types + private int skiplistCollectorCount = 0; + FromSingle( String name, AggregatorFactories factories, @@ -402,7 +407,25 @@ protected LongKeyedBucketOrds getBucketOrds() { } @Override - protected LeafBucketCollector getLeafCollector(SortedNumericDocValues values, LeafBucketCollector sub) throws IOException { + protected LeafBucketCollector getLeafCollector(SortedNumericDocValues values, DocValuesSkipper skipper, LeafBucketCollector sub) + throws IOException { + // Check if skiplist optimization is available + final NumericDocValues singleton = DocValues.unwrapSingleton(values); + if (HistogramSkiplistLeafCollector.canUseSkiplist(null, parent, skipper, singleton)) { + // Increment skiplist collector count + skiplistCollectorCount++; + return new HistogramSkiplistLeafCollector( + singleton, + skipper, + (owningBucketOrd) -> preparedRounding, // for FromSingle there will be no parent/ + () -> bucketOrds, + sub, + FromSingle.this, + (owningBucket, rounded) -> increaseRoundingIfNeeded(rounded) // Pass supplier to allow rounding change + ); + } + + // Fall back to standard LeafBucketCollectorBase when skiplist unavailable return new LeafBucketCollectorBase(sub, values) { @Override public void collect(int doc, long owningBucketOrd) throws IOException { @@ -446,62 +469,63 @@ private void collectValue(int doc, long rounded) throws IOException { increaseRoundingIfNeeded(rounded); } - /** - * Examine our current bucket count and the most recently added bucket to determine if an update to - * preparedRounding is required to keep total bucket count in compliance with targetBuckets. 
- * - * @param rounded the most recently collected value rounded - */ - private void increaseRoundingIfNeeded(long rounded) { - // If we are already using the rounding with the largest interval nothing can be done - if (roundingIdx >= roundingInfos.length - 1) { - return; - } + }; + } - // Re calculate the max and min values we expect to bucket according to most recently rounded val - min = Math.min(min, rounded); - max = Math.max(max, rounded); - - /** - * Quick explanation of the two below conditions: - * - * 1. [targetBuckets * roundingInfos[roundingIdx].getMaximumInnerInterval()] - * Represents the total bucket count possible before we will exceed targetBuckets - * even if we use the maximum inner interval of our current rounding. For example, consider the - * DAYS_OF_MONTH rounding where the maximum inner interval is 7 days (i.e. 1 week buckets). - * targetBuckets * roundingInfos[roundingIdx].getMaximumInnerInterval() would then be the number of - * 1 day buckets possible such that if we re-bucket to 1 week buckets we will have more 1 week buckets - * than our targetBuckets limit. If the current count of buckets exceeds this limit we must update - * our rounding. - * - * 2. [targetBuckets * roundingInfos[roundingIdx].getMaximumRoughEstimateDurationMillis()] - * The total duration of ms covered by our current rounding. In the case of MINUTES_OF_HOUR rounding - * getMaximumRoughEstimateDurationMillis is 60000. If our current total range in millis (max - min) - * exceeds this range we must update our rounding. - */ - if (bucketOrds.size() <= targetBuckets * roundingInfos[roundingIdx].getMaximumInnerInterval() - && max - min <= targetBuckets * roundingInfos[roundingIdx].getMaximumRoughEstimateDurationMillis()) { - return; + /** + * Examine our current bucket count and the most recently added bucket to determine if an update to + * preparedRounding is required to keep total bucket count in compliance with targetBuckets. + * + * @param rounded the most recently collected value rounded + */ + private void increaseRoundingIfNeeded(long rounded) { + // If we are already using the rounding with the largest interval nothing can be done + if (roundingIdx >= roundingInfos.length - 1) { + return; + } + + // Re calculate the max and min values we expect to bucket according to most recently rounded val + min = Math.min(min, rounded); + max = Math.max(max, rounded); + + /** + * Quick explanation of the two below conditions: + * + * 1. [targetBuckets * roundingInfos[roundingIdx].getMaximumInnerInterval()] + * Represents the total bucket count possible before we will exceed targetBuckets + * even if we use the maximum inner interval of our current rounding. For example, consider the + * DAYS_OF_MONTH rounding where the maximum inner interval is 7 days (i.e. 1 week buckets). + * targetBuckets * roundingInfos[roundingIdx].getMaximumInnerInterval() would then be the number of + * 1 day buckets possible such that if we re-bucket to 1 week buckets we will have more 1 week buckets + * than our targetBuckets limit. If the current count of buckets exceeds this limit we must update + * our rounding. + * + * 2. [targetBuckets * roundingInfos[roundingIdx].getMaximumRoughEstimateDurationMillis()] + * The total duration of ms covered by our current rounding. In the case of MINUTES_OF_HOUR rounding + * getMaximumRoughEstimateDurationMillis is 60000. If our current total range in millis (max - min) + * exceeds this range we must update our rounding. 
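A small numeric illustration of the two limits just described, using the figures quoted in the comment (7-day maximum inner interval for DAYS_OF_MONTH, 60000 ms rough duration for MINUTES_OF_HOUR) and an assumed targetBuckets of 10:

/** Worked example of the two rounding-increase conditions described above. */
final class RoundingLimits {
    public static void main(String[] args) {
        int targetBuckets = 10; // assumed target for the example

        // Condition 1, DAYS_OF_MONTH example: maximumInnerInterval = 7 (1-week buckets).
        // Up to 10 * 7 = 70 one-day buckets may exist before even 1-week buckets would exceed the target.
        int maximumInnerInterval = 7;
        long currentBucketCount = 75;
        boolean tooManyBuckets = currentBucketCount > (long) targetBuckets * maximumInnerInterval;  // 75 > 70 -> true

        // Condition 2, MINUTES_OF_HOUR example: maximumRoughEstimateDurationMillis = 60_000.
        // The collected range may span at most 10 * 60_000 ms = 10 minutes before a coarser rounding is needed.
        long maximumRoughEstimateDurationMillis = 60_000L;
        long min = 0L, max = 11 * 60_000L; // an 11-minute spread
        boolean rangeTooWide = (max - min) > targetBuckets * maximumRoughEstimateDurationMillis;    // true

        // increaseRoundingIfNeeded() moves to the next rounding when either limit is violated.
        System.out.println("increase rounding = " + (tooManyBuckets || rangeTooWide));              // true
    }
}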
+ */ + if (bucketOrds.size() <= targetBuckets * roundingInfos[roundingIdx].getMaximumInnerInterval() + && max - min <= targetBuckets * roundingInfos[roundingIdx].getMaximumRoughEstimateDurationMillis()) { + return; + } + do { + try (LongKeyedBucketOrds oldOrds = bucketOrds) { + preparedRounding = prepareRounding(++roundingIdx); + long[] mergeMap = new long[Math.toIntExact(oldOrds.size())]; + bucketOrds = new LongKeyedBucketOrds.FromSingle(context.bigArrays()); + LongKeyedBucketOrds.BucketOrdsEnum ordsEnum = oldOrds.ordsEnum(0); + while (ordsEnum.next()) { + long oldKey = ordsEnum.value(); + long newKey = preparedRounding.round(oldKey); + long newBucketOrd = bucketOrds.add(0, newKey); + mergeMap[(int) ordsEnum.ord()] = newBucketOrd >= 0 ? newBucketOrd : -1 - newBucketOrd; } - do { - try (LongKeyedBucketOrds oldOrds = bucketOrds) { - preparedRounding = prepareRounding(++roundingIdx); - long[] mergeMap = new long[Math.toIntExact(oldOrds.size())]; - bucketOrds = new LongKeyedBucketOrds.FromSingle(context.bigArrays()); - LongKeyedBucketOrds.BucketOrdsEnum ordsEnum = oldOrds.ordsEnum(0); - while (ordsEnum.next()) { - long oldKey = ordsEnum.value(); - long newKey = preparedRounding.round(oldKey); - long newBucketOrd = bucketOrds.add(0, newKey); - mergeMap[(int) ordsEnum.ord()] = newBucketOrd >= 0 ? newBucketOrd : -1 - newBucketOrd; - } - merge(mergeMap, bucketOrds.size()); - } - } while (roundingIdx < roundingInfos.length - 1 - && (bucketOrds.size() > targetBuckets * roundingInfos[roundingIdx].getMaximumInnerInterval() - || max - min > targetBuckets * roundingInfos[roundingIdx].getMaximumRoughEstimateDurationMillis())); + merge(mergeMap, bucketOrds.size()); } - }; + } while (roundingIdx < roundingInfos.length - 1 + && (bucketOrds.size() > targetBuckets * roundingInfos[roundingIdx].getMaximumInnerInterval() + || max - min > targetBuckets * roundingInfos[roundingIdx].getMaximumRoughEstimateDurationMillis())); } @Override @@ -513,6 +537,7 @@ public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws I public void collectDebugInfo(BiConsumer add) { super.collectDebugInfo(add); add.accept("surviving_buckets", bucketOrds.size()); + add.accept("skiplist_collectors_used", skiplistCollectorCount); } @Override @@ -619,6 +644,8 @@ private static class FromMany extends AutoDateHistogramAggregator { */ private int rebucketCount = 0; + private int skiplistCollectorCount = 0; + FromMany( String name, AggregatorFactories factories, @@ -661,7 +688,38 @@ protected LongKeyedBucketOrds getBucketOrds() { } @Override - protected LeafBucketCollector getLeafCollector(SortedNumericDocValues values, LeafBucketCollector sub) throws IOException { + protected LeafBucketCollector getLeafCollector(SortedNumericDocValues values, DocValuesSkipper skipper, LeafBucketCollector sub) + throws IOException { + + final NumericDocValues singleton = DocValues.unwrapSingleton(values); + if (HistogramSkiplistLeafCollector.canUseSkiplist(null, parent, skipper, singleton)) { + /** + * HistogramSkiplistLeafCollector in its current state can only handle one owningBucketOrd at a time. + * When parent is null, i.e. then ForSingle class will get used. ForMany is used when auto date is sub agg. + * In the special case where in FilterRewrite (SubAggRangeCollector) logic, we can use Skiplist because we + * will one range (and thus owningBucketOrd) at time. + * + * In the future we can enhance HistogramSkiplistLeafCollector to handle multiple owningBucketOrd, + * similar to FromMany. 
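The eligibility rules above and in canUseSkiplist() can be summarized in one place; the sketch below uses booleans and an illustrative enum in place of the real skipper, singleton doc values, hard bounds and parent aggregator:

/** Sketch of the canUseSkiplist() decision, with simple flags standing in for the runtime objects. */
final class SkiplistEligibility {
    enum ParentMode { NONE, FILTER_REWRITE, OTHER }

    static boolean canUseSkiplist(boolean hasSkipper, boolean singleValued, boolean hasHardBounds, ParentMode parent) {
        if (!hasSkipper || !singleValued) return false; // needs a DocValuesSkipper and a single-valued field
        if (hasHardBounds) return false;                // hard bounds not supported yet (TODO in the patch)
        if (parent == ParentMode.NONE) return true;     // top-level aggregation
        return parent == ParentMode.FILTER_REWRITE;     // sub-agg only under a filter-rewrite parent: one owning ord at a time
    }

    public static void main(String[] args) {
        System.out.println(canUseSkiplist(true, true, false, ParentMode.NONE));            // true: top-level auto_date_histogram
        System.out.println(canUseSkiplist(true, true, false, ParentMode.FILTER_REWRITE));  // true: e.g. under a range agg using filter rewrite
        System.out.println(canUseSkiplist(true, true, false, ParentMode.OTHER));           // false: parent may interleave owning bucket ords
        System.out.println(canUseSkiplist(true, false, false, ParentMode.NONE));           // false: multi-valued date field
    }
}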
+ */ + skiplistCollectorCount++; + + return new HistogramSkiplistLeafCollector( + singleton, + skipper, + (owningBucketOrd) -> preparedRoundings[roundingIndexFor(owningBucketOrd)], + () -> bucketOrds, + sub, + FromMany.this, + (owningBucketOrd, rounded) -> { + int roundingIdx = roundingIndexFor(owningBucketOrd); + liveBucketCountUnderestimate = context.bigArrays().grow(liveBucketCountUnderestimate, owningBucketOrd + 1); + int estimatedBucketCount = liveBucketCountUnderestimate.increment(owningBucketOrd, 1); + increaseRoundingIfNeeded(owningBucketOrd, estimatedBucketCount, rounded, roundingIdx); + } + ); + } + return new LeafBucketCollectorBase(sub, values) { @Override public void collect(int doc, long owningBucketOrd) throws IOException { @@ -707,61 +765,62 @@ private int collectValue(long owningBucketOrd, int roundingIdx, int doc, long ro return increaseRoundingIfNeeded(owningBucketOrd, estimatedBucketCount, rounded, roundingIdx); } - /** - * Increase the rounding of {@code owningBucketOrd} using - * estimated, bucket counts, {@link FromMany#rebucket()} rebucketing} the all - * buckets if the estimated number of wasted buckets is too high. - */ - private int increaseRoundingIfNeeded(long owningBucketOrd, int oldEstimatedBucketCount, long newKey, int oldRounding) { - if (oldRounding >= roundingInfos.length - 1) { - return oldRounding; - } - if (mins.size() < owningBucketOrd + 1) { - long oldSize = mins.size(); - mins = context.bigArrays().grow(mins, owningBucketOrd + 1); - mins.fill(oldSize, mins.size(), Long.MAX_VALUE); - } - if (maxes.size() < owningBucketOrd + 1) { - long oldSize = maxes.size(); - maxes = context.bigArrays().grow(maxes, owningBucketOrd + 1); - maxes.fill(oldSize, maxes.size(), Long.MIN_VALUE); - } - - long min = Math.min(mins.get(owningBucketOrd), newKey); - mins.set(owningBucketOrd, min); - long max = Math.max(maxes.get(owningBucketOrd), newKey); - maxes.set(owningBucketOrd, max); - if (oldEstimatedBucketCount <= targetBuckets * roundingInfos[oldRounding].getMaximumInnerInterval() - && max - min <= targetBuckets * roundingInfos[oldRounding].getMaximumRoughEstimateDurationMillis()) { - return oldRounding; - } - long oldRoughDuration = roundingInfos[oldRounding].roughEstimateDurationMillis; - int newRounding = oldRounding; - int newEstimatedBucketCount; - do { - newRounding++; - double ratio = (double) oldRoughDuration / (double) roundingInfos[newRounding].getRoughEstimateDurationMillis(); - newEstimatedBucketCount = (int) Math.ceil(oldEstimatedBucketCount * ratio); - } while (newRounding < roundingInfos.length - 1 - && (newEstimatedBucketCount > targetBuckets * roundingInfos[newRounding].getMaximumInnerInterval() - || max - min > targetBuckets * roundingInfos[newRounding].getMaximumRoughEstimateDurationMillis())); - setRounding(owningBucketOrd, newRounding); - mins.set(owningBucketOrd, preparedRoundings[newRounding].round(mins.get(owningBucketOrd))); - maxes.set(owningBucketOrd, preparedRoundings[newRounding].round(maxes.get(owningBucketOrd))); - wastedBucketsOverestimate += oldEstimatedBucketCount - newEstimatedBucketCount; - if (wastedBucketsOverestimate > nextRebucketAt) { - rebucket(); - // Bump the threshold for the next rebucketing - wastedBucketsOverestimate = 0; - nextRebucketAt *= 2; - } else { - liveBucketCountUnderestimate.set(owningBucketOrd, newEstimatedBucketCount); - } - return newRounding; - } }; } + /** + * Increase the rounding of {@code owningBucketOrd} using + * estimated, bucket counts, {@link FromMany#rebucket()} rebucketing} the all + * buckets if 
the estimated number of wasted buckets is too high. + */ + private int increaseRoundingIfNeeded(long owningBucketOrd, int oldEstimatedBucketCount, long newKey, int oldRounding) { + if (oldRounding >= roundingInfos.length - 1) { + return oldRounding; + } + if (mins.size() < owningBucketOrd + 1) { + long oldSize = mins.size(); + mins = context.bigArrays().grow(mins, owningBucketOrd + 1); + mins.fill(oldSize, mins.size(), Long.MAX_VALUE); + } + if (maxes.size() < owningBucketOrd + 1) { + long oldSize = maxes.size(); + maxes = context.bigArrays().grow(maxes, owningBucketOrd + 1); + maxes.fill(oldSize, maxes.size(), Long.MIN_VALUE); + } + + long min = Math.min(mins.get(owningBucketOrd), newKey); + mins.set(owningBucketOrd, min); + long max = Math.max(maxes.get(owningBucketOrd), newKey); + maxes.set(owningBucketOrd, max); + if (oldEstimatedBucketCount <= targetBuckets * roundingInfos[oldRounding].getMaximumInnerInterval() + && max - min <= targetBuckets * roundingInfos[oldRounding].getMaximumRoughEstimateDurationMillis()) { + return oldRounding; + } + long oldRoughDuration = roundingInfos[oldRounding].roughEstimateDurationMillis; + int newRounding = oldRounding; + int newEstimatedBucketCount; + do { + newRounding++; + double ratio = (double) oldRoughDuration / (double) roundingInfos[newRounding].getRoughEstimateDurationMillis(); + newEstimatedBucketCount = (int) Math.ceil(oldEstimatedBucketCount * ratio); + } while (newRounding < roundingInfos.length - 1 + && (newEstimatedBucketCount > targetBuckets * roundingInfos[newRounding].getMaximumInnerInterval() + || max - min > targetBuckets * roundingInfos[newRounding].getMaximumRoughEstimateDurationMillis())); + setRounding(owningBucketOrd, newRounding); + mins.set(owningBucketOrd, preparedRoundings[newRounding].round(mins.get(owningBucketOrd))); + maxes.set(owningBucketOrd, preparedRoundings[newRounding].round(maxes.get(owningBucketOrd))); + wastedBucketsOverestimate += oldEstimatedBucketCount - newEstimatedBucketCount; + if (wastedBucketsOverestimate > nextRebucketAt) { + rebucket(); + // Bump the threshold for the next rebucketing + wastedBucketsOverestimate = 0; + nextRebucketAt *= 2; + } else { + liveBucketCountUnderestimate.set(owningBucketOrd, newEstimatedBucketCount); + } + return newRounding; + } + private void rebucket() { rebucketCount++; try (LongKeyedBucketOrds oldOrds = bucketOrds) { @@ -806,6 +865,7 @@ public void collectDebugInfo(BiConsumer add) { add.accept("wasted_buckets_overestimate", wastedBucketsOverestimate); add.accept("next_rebucket_at", nextRebucketAt); add.accept("rebucket_count", rebucketCount); + add.accept("skiplist_collectors_used", skiplistCollectorCount); } private void setRounding(long owningBucketOrd, int newRounding) { diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/histogram/DateHistogramAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/histogram/DateHistogramAggregator.java index 42dfc30b0da63..eb7f167015d13 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/histogram/DateHistogramAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/histogram/DateHistogramAggregator.java @@ -53,7 +53,6 @@ import org.opensearch.index.mapper.CompositeDataCubeFieldType; import org.opensearch.search.DocValueFormat; import org.opensearch.search.aggregations.Aggregator; -import org.opensearch.search.aggregations.AggregatorBase; import org.opensearch.search.aggregations.AggregatorFactories; import 
org.opensearch.search.aggregations.BucketOrder; import org.opensearch.search.aggregations.CardinalityUpperBound; @@ -233,7 +232,7 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol final SortedNumericDocValues values = valuesSource.longValues(ctx); final NumericDocValues singleton = DocValues.unwrapSingleton(values); - if (canUseSkiplist(skipper, singleton)) { + if (HistogramSkiplistLeafCollector.canUseSkiplist(hardBounds, parent, skipper, singleton)) { skipListCollectorsUsed++; return new HistogramSkiplistLeafCollector(singleton, skipper, preparedRounding, bucketOrds, sub, this); } @@ -295,23 +294,6 @@ public void collectRange(int min, int max) throws IOException { }; } - /** - * Skiplist is based as top level agg (null parent) or parent that will execute in sorted order - * - */ - private boolean canUseSkiplist(DocValuesSkipper skipper, NumericDocValues singleton) { - if (skipper == null || singleton == null) return false; - // TODO: add hard bounds support - if (hardBounds != null) return false; - - if (parent == null) return true; - - if (parent instanceof AggregatorBase base) { - return base.getLeafCollectorMode() == LeafCollectionMode.FILTER_REWRITE; - } - return false; - } - private void collectValue(LeafBucketCollector sub, int doc, long owningBucketOrd, long rounded) throws IOException { if (hardBounds == null || hardBounds.contain(rounded)) { long bucketOrd = bucketOrds.add(owningBucketOrd, rounded); diff --git a/server/src/test/java/org/opensearch/search/aggregations/bucket/filterrewrite/FilterRewriteSubAggTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/filterrewrite/FilterRewriteSubAggTests.java index 26c540451f2be..f61caca550f22 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/bucket/filterrewrite/FilterRewriteSubAggTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/filterrewrite/FilterRewriteSubAggTests.java @@ -284,6 +284,13 @@ public void testRangeWithCard() throws IOException { /** * Test that verifies skiplist-based collection works correctly with range aggregations * that have date histogram sub-aggregations. + * + * This test exercises the following code paths: + * 1. HistogramSkiplistLeafCollector.collect() - skiplist-based document collection + * 2. HistogramSkiplistLeafCollector.advanceSkipper() - skiplist advancement with upToBucket logic + * 3. 
SubAggRangeCollector.collect() - sub-aggregation collection path + * + * The test uses: * - Index sort on date field to enable skiplist functionality * - Multiple segments created via explicit commits * - Searchable date field type @@ -331,7 +338,7 @@ public void testRangeDate() throws IOException { InternalRange.Bucket firstBucket = buckets.get(0); assertEquals(5, firstBucket.getDocCount()); InternalDateHistogram firstDate = firstBucket.getAggregations().get(dateAggName); - assertNotNull(firstDate); + assertNotNull("Sub-aggregation should be present (verifies SubAggRangeCollector.collect() was called)", firstDate); assertEquals(1, firstDate.getBuckets().size()); assertEquals(5, firstDate.getBuckets().get(0).getDocCount()); @@ -339,7 +346,7 @@ public void testRangeDate() throws IOException { InternalRange.Bucket secondBucket = buckets.get(1); assertEquals(8, secondBucket.getDocCount()); InternalDateHistogram secondDate = secondBucket.getAggregations().get(dateAggName); - assertNotNull(secondDate); + assertNotNull("Sub-aggregation should be present (verifies SubAggRangeCollector.collect() was called)", secondDate); assertEquals(2, secondDate.getBuckets().size()); assertEquals(5, secondDate.getBuckets().get(0).getDocCount()); assertEquals(3, secondDate.getBuckets().get(1).getDocCount()); @@ -348,7 +355,7 @@ public void testRangeDate() throws IOException { InternalRange.Bucket thirdBucket = buckets.get(2); assertEquals(7, thirdBucket.getDocCount()); InternalDateHistogram thirdDate = thirdBucket.getAggregations().get(dateAggName); - assertNotNull(thirdDate); + assertNotNull("Sub-aggregation should be present (verifies SubAggRangeCollector.collect() was called)", thirdDate); assertEquals(1, thirdDate.getBuckets().size()); assertEquals(7, thirdDate.getBuckets().get(0).getDocCount()); } diff --git a/server/src/test/java/org/opensearch/search/aggregations/bucket/histogram/AutoDateHistogramAggregatorTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/histogram/AutoDateHistogramAggregatorTests.java index 95f56d779b088..8cff2fd1f4e34 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/bucket/histogram/AutoDateHistogramAggregatorTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/histogram/AutoDateHistogramAggregatorTests.java @@ -33,18 +33,22 @@ package org.opensearch.search.aggregations.bucket.histogram; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.NoMergePolicy; +import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.util.BytesRef; @@ -95,7 +99,8 @@ import static org.hamcrest.Matchers.hasSize; public class AutoDateHistogramAggregatorTests extends DateHistogramAggregatorTestCase { - private static final String DATE_FIELD = "date"; + // 
@timestamp field name by default uses skip_list + private static final String DATE_FIELD = "@timestamp"; private static final String INSTANT_FIELD = "instant"; private static final String NUMERIC_FIELD = "numeric"; @@ -986,15 +991,24 @@ private void testSearchCase( final List dataset, final Consumer configure, final Consumer verify + ) throws IOException { + testSearchCase(query, dataset, false, configure, verify); + } + + private void testSearchCase( + final Query query, + final List dataset, + final boolean enableSkiplist, + final Consumer configure, + final Consumer verify ) throws IOException { try (Directory directory = newDirectory()) { try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { - indexSampleData(dataset, indexWriter); + indexSampleData(dataset, indexWriter, enableSkiplist); } try (IndexReader indexReader = DirectoryReader.open(directory)) { final IndexSearcher indexSearcher = newSearcher(indexReader, true, true); - final AutoDateHistogramAggregationBuilder aggregationBuilder = new AutoDateHistogramAggregationBuilder("_name"); if (configure != null) { configure.accept(aggregationBuilder); @@ -1019,11 +1033,19 @@ private void testSearchCase( } private void indexSampleData(List dataset, RandomIndexWriter indexWriter) throws IOException { + indexSampleData(dataset, indexWriter, false); + } + + private void indexSampleData(List dataset, RandomIndexWriter indexWriter, boolean enableSkiplist) throws IOException { final Document document = new Document(); int i = 0; for (final ZonedDateTime date : dataset) { final long instant = date.toInstant().toEpochMilli(); - document.add(new SortedNumericDocValuesField(DATE_FIELD, instant)); + if (enableSkiplist) { + document.add(SortedNumericDocValuesField.indexedField(DATE_FIELD, instant)); + } else { + document.add(new SortedNumericDocValuesField(DATE_FIELD, instant)); + } document.add(new LongPoint(DATE_FIELD, instant)); document.add(new LongPoint(INSTANT_FIELD, instant)); document.add(new SortedNumericDocValuesField(NUMERIC_FIELD, i)); @@ -1031,6 +1053,13 @@ private void indexSampleData(List dataset, RandomIndexWriter inde document.clear(); i += 1; } + // delete a doc to avoid approx optimization + if (enableSkiplist) { + document.add(new StringField("someField", "a", Field.Store.NO)); + indexWriter.addDocument(document); + indexWriter.commit(); + indexWriter.deleteDocuments(new TermQuery(new Term("someField", "a"))); + } } private Map bucketCountsAsMap(InternalAutoDateHistogram result) { @@ -1052,6 +1081,230 @@ private Map maxAsMap(InternalAutoDateHistogram result) { return map; } + /** + * Test that verifies FromSingle.increaseRoundingIfNeeded() works correctly with skiplist collector. + * This test validates: + * 1. preparedRounding field update is visible to skiplist collector + * 2. New Rounding.Prepared instance is created on rounding change + * 3. owningBucketOrd is always 0 in FromSingle context + * 4. 
Bucket merging works correctly after rounding change + * + */ + public void testSkiplistCollectorWithRoundingChange() throws IOException { + // Create a dataset that will trigger rounding changes + // Start with hourly data, then add data that spans months to force rounding increase + final List dataset = new ArrayList<>(); + + // Add hourly data for first day (24 docs) + ZonedDateTime start = ZonedDateTime.of(2020, 1, 1, 0, 0, 0, 0, ZoneOffset.UTC); + for (int hour = 0; hour < 24; hour++) { + dataset.add(start.plusHours(hour)); + } + + // Add data spanning several months to trigger rounding increase (30 docs) + for (int month = 0; month < 6; month++) { + for (int day = 0; day < 5; day++) { + dataset.add(start.plusMonths(month).plusDays(day * 6)); + } + } + + testSearchCase(DEFAULT_QUERY, dataset, true, aggregation -> aggregation.setNumBuckets(10).field(DATE_FIELD), histogram -> { + // Verify that aggregation completed successfully + assertTrue(AggregationInspectionHelper.hasValue(histogram)); + + // Verify buckets were created (exact count depends on rounding chosen) + assertFalse(histogram.getBuckets().isEmpty()); + assertTrue(histogram.getBuckets().size() <= 10); + + // Verify total doc count matches input + long totalDocs = histogram.getBuckets().stream().mapToLong(Histogram.Bucket::getDocCount).sum(); + assertEquals(54, totalDocs); // 24 + 30 = 54 docs + + // Verify buckets are in ascending order (requirement for histogram) + List buckets = histogram.getBuckets(); + for (int i = 1; i < buckets.size(); i++) { + assertTrue(((ZonedDateTime) buckets.get(i - 1).getKey()).isBefore((ZonedDateTime) buckets.get(i).getKey())); + } + }); + } + + /** + * Test that verifies skiplist collector handles rounding changes correctly with sub-aggregations. + * This ensures that when rounding changes mid-collection, sub-aggregations still produce correct results. + * + */ + public void testSkiplistCollectorRoundingChangeWithSubAggs() throws IOException { + // Create dataset that triggers rounding change + final List dataset = new ArrayList<>(); + ZonedDateTime start = ZonedDateTime.of(2020, 1, 1, 0, 0, 0, 0, ZoneOffset.UTC); + + // Add data spanning months to trigger rounding increase + for (int month = 0; month < 12; month++) { + for (int day = 0; day < 3; day++) { + dataset.add(start.plusMonths(month).plusDays(day * 10)); + } + } + + testSearchCase( + DEFAULT_QUERY, + dataset, + true, + aggregation -> aggregation.setNumBuckets(8) + .field(DATE_FIELD) + .subAggregation(AggregationBuilders.stats("stats").field(DATE_FIELD)), + histogram -> { + assertTrue(AggregationInspectionHelper.hasValue(histogram)); + + // Verify buckets were created + assertFalse(histogram.getBuckets().isEmpty()); + assertTrue(histogram.getBuckets().size() <= 8); + + // Verify sub-aggregations are present and valid + for (Histogram.Bucket bucket : histogram.getBuckets()) { + InternalStats stats = bucket.getAggregations().get("stats"); + assertNotNull(stats); + if (bucket.getDocCount() > 0) { + assertTrue(AggregationInspectionHelper.hasValue(stats)); + assertTrue(stats.getCount() > 0); + } + } + + // Verify total doc count + long totalDocs = histogram.getBuckets().stream().mapToLong(Histogram.Bucket::getDocCount).sum(); + assertEquals(36, totalDocs); // 12 months * 3 days = 36 docs + } + ); + } + + /** + * Test that verifies bucket merging works correctly after rounding change. + * This test creates a scenario where buckets must be merged when rounding increases. 
+ * + */ + public void testSkiplistCollectorBucketMergingAfterRoundingChange() throws IOException { + // Create dataset with fine-grained data that will be merged into coarser buckets + final List dataset = new ArrayList<>(); + ZonedDateTime start = ZonedDateTime.of(2020, 1, 1, 0, 0, 0, 0, ZoneOffset.UTC); + + // Add hourly data for multiple days, then add data spanning years + // This forces rounding to increase from hours -> days -> months -> years + for (int day = 0; day < 5; day++) { + for (int hour = 0; hour < 24; hour++) { + dataset.add(start.plusDays(day).plusHours(hour)); + } + } + + // Add data spanning multiple years to force coarse rounding + for (int year = 0; year < 5; year++) { + for (int month = 0; month < 12; month += 3) { + dataset.add(start.plusYears(year).plusMonths(month)); + } + } + + testSearchCase(DEFAULT_QUERY, dataset, true, aggregation -> aggregation.setNumBuckets(5).field(DATE_FIELD), histogram -> { + assertTrue(AggregationInspectionHelper.hasValue(histogram)); + + // Verify buckets were created and merged appropriately + assertFalse(histogram.getBuckets().isEmpty()); + assertTrue(histogram.getBuckets().size() <= 5); + + // Verify total doc count is preserved after merging + long totalDocs = histogram.getBuckets().stream().mapToLong(Histogram.Bucket::getDocCount).sum(); + assertEquals(140, totalDocs); // (5 days * 24 hours) + (5 years * 4 quarters) = 120 + 20 = 140 + + Map expectedDocCount = new TreeMap<>(); + expectedDocCount.put("2020-01-01T00:00:00.000Z", 124); // 5 * 24 hours + 4 quarters + expectedDocCount.put("2021-01-01T00:00:00.000Z", 4); + expectedDocCount.put("2022-01-01T00:00:00.000Z", 4); + expectedDocCount.put("2023-01-01T00:00:00.000Z", 4); + expectedDocCount.put("2024-01-01T00:00:00.000Z", 4); + + assertThat(bucketCountsAsMap(histogram), equalTo(expectedDocCount)); + }); + } + + /** + * Test that verifies skiplist and non-skiplist collectors produce identical results. + * Uses random number of documents (20-200) and random date distribution. 
+ */ + public void testSkiplistEquivalence() throws IOException { + // Generate random number of documents between 20 and 200 + final int numDocs = randomIntBetween(20, 200); + final List dataset = new ArrayList<>(numDocs); + + // Generate random dates spanning a year + final ZonedDateTime startDate = ZonedDateTime.of(2020, 1, 1, 0, 0, 0, 0, ZoneOffset.UTC); + final long yearInMillis = 365L * 24 * 60 * 60 * 1000; + + for (int i = 0; i < numDocs; i++) { + long randomOffset = randomLongBetween(0, yearInMillis); + dataset.add(startDate.plusSeconds(randomOffset / 1000)); + } + + // Sort dataset to ensure consistent ordering + dataset.sort(ZonedDateTime::compareTo); + + // Test with random number of buckets + final int numBuckets = randomIntBetween(5, 20); + + // Run aggregation without skiplist + final InternalAutoDateHistogram histogramWithoutSkiplist = runAggregation(dataset, false, numBuckets); + + // Run aggregation with skiplist + final InternalAutoDateHistogram histogramWithSkiplist = runAggregation(dataset, true, numBuckets); + + // Verify both produce identical results + assertEquals( + "Bucket count mismatch between skiplist and non-skiplist", + histogramWithoutSkiplist.getBuckets().size(), + histogramWithSkiplist.getBuckets().size() + ); + + // Verify each bucket matches + for (int i = 0; i < histogramWithoutSkiplist.getBuckets().size(); i++) { + InternalAutoDateHistogram.Bucket bucketWithout = histogramWithoutSkiplist.getBuckets().get(i); + InternalAutoDateHistogram.Bucket bucketWith = histogramWithSkiplist.getBuckets().get(i); + + assertEquals("Bucket key mismatch at index " + i, bucketWithout.getKey(), bucketWith.getKey()); + + assertEquals( + "Doc count mismatch at index " + i + " for key " + bucketWithout.getKeyAsString(), + bucketWithout.getDocCount(), + bucketWith.getDocCount() + ); + } + + // Verify total doc counts match + long totalDocsWithout = histogramWithoutSkiplist.getBuckets().stream().mapToLong(Histogram.Bucket::getDocCount).sum(); + long totalDocsWith = histogramWithSkiplist.getBuckets().stream().mapToLong(Histogram.Bucket::getDocCount).sum(); + + assertEquals("Total doc count mismatch", totalDocsWithout, totalDocsWith); + assertEquals("Total doc count should match input", numDocs, totalDocsWithout); + } + + private InternalAutoDateHistogram runAggregation(List dataset, boolean enableSkiplist, int numBuckets) + throws IOException { + try (Directory directory = newDirectory()) { + try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { + indexSampleData(dataset, indexWriter, enableSkiplist); + } + + try (IndexReader indexReader = DirectoryReader.open(directory)) { + final IndexSearcher indexSearcher = newSearcher(indexReader, true, true); + + final AutoDateHistogramAggregationBuilder aggregationBuilder = new AutoDateHistogramAggregationBuilder("_name") + .setNumBuckets(numBuckets) + .field(DATE_FIELD); + + final DateFieldMapper.DateFieldType fieldType = new DateFieldMapper.DateFieldType(DATE_FIELD); + MappedFieldType instantFieldType = new NumberFieldMapper.NumberFieldType(INSTANT_FIELD, NumberFieldMapper.NumberType.LONG); + MappedFieldType numericFieldType = new NumberFieldMapper.NumberFieldType(NUMERIC_FIELD, NumberFieldMapper.NumberType.LONG); + + return searchAndReduce(indexSearcher, DEFAULT_QUERY, aggregationBuilder, fieldType, instantFieldType, numericFieldType); + } + } + } + @Override public void doAssertReducedMultiBucketConsumer(Aggregation agg, MultiBucketConsumerService.MultiBucketConsumer bucketConsumer) { /*
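For completeness, the indexing difference these tests toggle on: SortedNumericDocValuesField.indexedField writes doc values with a skip index (so getDocValuesSkipper returns a non-null skipper for the segment), while the plain constructor does not. A sketch assuming the same Lucene field classes used in indexSampleData above; the field name and values are illustrative:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.SortedNumericDocValuesField;

/** Sketch of indexing a date field with and without a doc-values skip index. */
final class SkiplistIndexingExample {
    static Document docWithSkipper(long timestampMillis) {
        Document doc = new Document();
        doc.add(SortedNumericDocValuesField.indexedField("@timestamp", timestampMillis)); // doc values + skip index
        doc.add(new LongPoint("@timestamp", timestampMillis));                            // points, as in the test data
        return doc;
    }

    static Document docWithoutSkipper(long timestampMillis) {
        Document doc = new Document();
        doc.add(new SortedNumericDocValuesField("@timestamp", timestampMillis));          // doc values only, no skipper
        doc.add(new LongPoint("@timestamp", timestampMillis));
        return doc;
    }
}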