-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Calculate sum in Kahan summation algorithm in aggregations (#27807) #27848
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
30f7228
6c45e08
95aff79
15cdec2
1f07cca
594f750
b2ce7cb
176be1f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -44,6 +44,7 @@ public class AvgAggregator extends NumericMetricsAggregator.SingleValue { | |
|
|
||
| LongArray counts; | ||
| DoubleArray sums; | ||
| DoubleArray compensations; | ||
| DocValueFormat format; | ||
|
|
||
| public AvgAggregator(String name, ValuesSource.Numeric valuesSource, DocValueFormat formatter, SearchContext context, | ||
|
|
@@ -55,6 +56,7 @@ public AvgAggregator(String name, ValuesSource.Numeric valuesSource, DocValueFor | |
| final BigArrays bigArrays = context.bigArrays(); | ||
| counts = bigArrays.newLongArray(1, true); | ||
| sums = bigArrays.newDoubleArray(1, true); | ||
| compensations = bigArrays.newDoubleArray(1, true); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -76,15 +78,31 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, | |
| public void collect(int doc, long bucket) throws IOException { | ||
| counts = bigArrays.grow(counts, bucket + 1); | ||
| sums = bigArrays.grow(sums, bucket + 1); | ||
| compensations = bigArrays.grow(compensations, bucket + 1); | ||
|
|
||
| if (values.advanceExact(doc)) { | ||
| final int valueCount = values.docValueCount(); | ||
| counts.increment(bucket, valueCount); | ||
| double sum = 0; | ||
| // Compute the sum of double values with Kahan summation algorithm which is more | ||
| // accurate than naive summation. | ||
| double sum = sums.get(bucket); | ||
| double compensation = compensations.get(bucket); | ||
|
|
||
| for (int i = 0; i < valueCount; i++) { | ||
| sum += values.nextValue(); | ||
| double value = values.nextValue(); | ||
| if (Double.isNaN(value) || Double.isInfinite(value)) { | ||
| sum += value; | ||
| if (Double.isNaN(sum)) | ||
|
||
| break; | ||
| } else if (Double.isFinite(sum)) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is always true I think? So we could make it an
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The check statement is needed. For example when summing up |
||
| double corrected = value - compensation; | ||
| double newSum = sum + corrected; | ||
| compensation = (newSum - sum) - corrected; | ||
| sum = newSum; | ||
| } | ||
| } | ||
| sums.increment(bucket, sum); | ||
| sums.set(bucket, sum); | ||
| compensations.set(bucket, compensation); | ||
| } | ||
| } | ||
| }; | ||
|
|
@@ -113,7 +131,7 @@ public InternalAggregation buildEmptyAggregation() { | |
|
|
||
| @Override | ||
| public void doClose() { | ||
| Releasables.close(counts, sums); | ||
| Releasables.close(counts, sums, compensations); | ||
| } | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -91,9 +91,22 @@ public String getWriteableName() { | |
| public InternalAvg doReduce(List<InternalAggregation> aggregations, ReduceContext reduceContext) { | ||
| long count = 0; | ||
| double sum = 0; | ||
| double compensation = 0; | ||
| // Compute the sum of double values with Kahan summation algorithm which is more | ||
| // accurate than naive summation. | ||
| for (InternalAggregation aggregation : aggregations) { | ||
| count += ((InternalAvg) aggregation).count; | ||
| sum += ((InternalAvg) aggregation).sum; | ||
| InternalAvg avg = (InternalAvg) aggregation; | ||
| count += avg.count; | ||
| if (Double.isNaN(sum) == false) { | ||
|
||
| if (Double.isNaN(avg.sum) || Double.isInfinite(avg.sum)) { | ||
| sum += avg.sum; | ||
| } else if (Double.isFinite(sum)) { | ||
| double corrected = avg.sum - compensation; | ||
| double newSum = sum + corrected; | ||
| compensation = (newSum - sum) - corrected; | ||
| sum = newSum; | ||
| } | ||
| } | ||
| } | ||
| return new InternalAvg(getName(), sum, count, format, pipelineAggregators(), getMetaData()); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could test both at once by doing
Double.isFinite(value) == false?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it's much better.