diff --git a/changelog.d/24415_histogram_incremental_conversion.fix.md b/changelog.d/24415_histogram_incremental_conversion.fix.md new file mode 100644 index 0000000000000..3aa1c23388f83 --- /dev/null +++ b/changelog.d/24415_histogram_incremental_conversion.fix.md @@ -0,0 +1,3 @@ +Fixed histogram incremental conversion by ensuring all individual buckets increase or reinitializing the entire metric. + +authors: dd-sebastien-lb diff --git a/lib/vector-core/src/event/metric/mod.rs b/lib/vector-core/src/event/metric/mod.rs index 884fbbdc1d81e..5e956f8042c02 100644 --- a/lib/vector-core/src/event/metric/mod.rs +++ b/lib/vector-core/src/event/metric/mod.rs @@ -896,6 +896,33 @@ mod test { assert!(!new_reset_histogram.subtract(&old_histogram)); } + #[test] + fn subtract_aggregated_histograms_bucket_redistribution() { + // Test for issue #24415: when total count is higher but individual bucket counts is sometimes lower + let old_histogram = Metric::new( + "histogram", + MetricKind::Absolute, + MetricValue::AggregatedHistogram { + count: 15, + sum: 15.0, + buckets: buckets!(1.0 => 10, 2.0 => 5), + }, + ); + + let mut new_histogram_with_redistribution = Metric::new( + "histogram", + MetricKind::Absolute, + MetricValue::AggregatedHistogram { + count: 20, + sum: 20.0, + // Total count is higher (20 > 15), but bucket1 count is lower (8 < 10) + buckets: buckets!(1.0 => 8, 2.0 => 12), + }, + ); + + assert!(!new_histogram_with_redistribution.subtract(&old_histogram)); + } + #[test] // `too_many_lines` is mostly just useful for production code but we're not // able to flag the lint on only for non-test. diff --git a/lib/vector-core/src/event/metric/value.rs b/lib/vector-core/src/event/metric/value.rs index ce21e80bbec24..7e301ae7943ce 100644 --- a/lib/vector-core/src/event/metric/value.rs +++ b/lib/vector-core/src/event/metric/value.rs @@ -327,6 +327,11 @@ impl MetricValue { // fewer values -- would not make sense, since buckets should never be able to have negative counts... and // it's not clear that a saturating subtraction is technically correct either. Instead, we avoid having to // make that decision, and simply force the metric to be reinitialized. + // + // We also check that each individual bucket count is >= the corresponding count in the + // other histogram, since bucket value redistribution (e.g., after a source restart or + // cache eviction) can cause individual buckets to have lower counts even when the total + // count is higher. Failing here leads to the metric being reinitialized. ( Self::AggregatedHistogram { buckets, @@ -343,7 +348,7 @@ impl MetricValue { && buckets .iter() .zip(buckets2.iter()) - .all(|(b1, b2)| b1.upper_limit == b2.upper_limit) => + .all(|(b1, b2)| b1.upper_limit == b2.upper_limit && b1.count >= b2.count) => { for (b1, b2) in buckets.iter_mut().zip(buckets2) { b1.count -= b2.count;