@@ -13,24 +13,24 @@ use std::borrow::Borrow;
13
13
use std:: sync:: Arc ;
14
14
use std:: time:: Duration ;
15
15
16
+ use mz_cluster_client:: metrics:: { ControllerMetrics , WallclockLagMetrics } ;
16
17
use mz_cluster_client:: ReplicaId ;
17
18
use mz_compute_types:: ComputeInstanceId ;
18
19
use mz_ore:: cast:: CastFrom ;
19
20
use mz_ore:: metric;
20
21
use mz_ore:: metrics:: raw:: UIntGaugeVec ;
21
22
use mz_ore:: metrics:: {
22
- CounterVec , DeleteOnDropCounter , DeleteOnDropGauge , DeleteOnDropHistogram , GaugeVec ,
23
- HistogramVec , IntCounterVec , MetricVecExt , MetricsRegistry ,
23
+ DeleteOnDropCounter , DeleteOnDropGauge , DeleteOnDropHistogram , GaugeVec , HistogramVec ,
24
+ IntCounterVec , MetricVecExt , MetricsRegistry ,
24
25
} ;
25
- use mz_ore:: stats:: { histogram_seconds_buckets, SlidingMinMax } ;
26
+ use mz_ore:: stats:: histogram_seconds_buckets;
26
27
use mz_repr:: GlobalId ;
27
28
use mz_service:: codec:: StatsCollector ;
28
29
use prometheus:: core:: { AtomicF64 , AtomicU64 } ;
29
30
30
31
use crate :: protocol:: command:: { ComputeCommand , ProtoComputeCommand } ;
31
32
use crate :: protocol:: response:: { PeekResponse , ProtoComputeResponse } ;
32
33
33
- type Counter = DeleteOnDropCounter < ' static , AtomicF64 , Vec < String > > ;
34
34
pub ( crate ) type IntCounter = DeleteOnDropCounter < ' static , AtomicU64 , Vec < String > > ;
35
35
type Gauge = DeleteOnDropGauge < ' static , AtomicF64 , Vec < String > > ;
36
36
/// TODO(database-issues#7533): Add documentation.
@@ -68,14 +68,14 @@ pub struct ComputeControllerMetrics {
68
68
69
69
// dataflows
70
70
dataflow_initial_output_duration_seconds : GaugeVec ,
71
- dataflow_wallclock_lag_seconds : GaugeVec ,
72
- dataflow_wallclock_lag_seconds_sum : CounterVec ,
73
- dataflow_wallclock_lag_seconds_count : IntCounterVec ,
71
+
72
+ /// Metrics shared with the storage controller.
73
+ shared : ControllerMetrics ,
74
74
}
75
75
76
76
impl ComputeControllerMetrics {
77
77
/// Create a metrics instance registered into the given registry.
78
- pub fn new ( metrics_registry : MetricsRegistry ) -> Self {
78
+ pub fn new ( metrics_registry : & MetricsRegistry , shared : ControllerMetrics ) -> Self {
79
79
ComputeControllerMetrics {
80
80
commands_total : metrics_registry. register ( metric ! (
81
81
name: "mz_compute_commands_total" ,
@@ -174,25 +174,7 @@ impl ComputeControllerMetrics {
174
174
var_labels: [ "instance_id" , "replica_id" , "collection_id" ] ,
175
175
) ) ,
176
176
177
- // The next three metrics imitate a summary metric type. The `prometheus` crate lacks
178
- // support for summaries, so we roll our own. Note that we also only expose the 0- and
179
- // the 1-quantile, i.e., minimum and maximum lag values.
180
- dataflow_wallclock_lag_seconds : metrics_registry. register ( metric ! (
181
- name: "mz_dataflow_wallclock_lag_seconds" ,
182
- help: "A summary of the second-by-second lag of the dataflow frontier relative \
183
- to wallclock time, aggregated over the last minute.",
184
- var_labels: [ "instance_id" , "replica_id" , "collection_id" , "quantile" ] ,
185
- ) ) ,
186
- dataflow_wallclock_lag_seconds_sum : metrics_registry. register ( metric ! (
187
- name: "mz_dataflow_wallclock_lag_seconds_sum" ,
188
- help: "The total sum of dataflow wallclock lag measurements." ,
189
- var_labels: [ "instance_id" , "replica_id" , "collection_id" ] ,
190
- ) ) ,
191
- dataflow_wallclock_lag_seconds_count : metrics_registry. register ( metric ! (
192
- name: "mz_dataflow_wallclock_lag_seconds_count" ,
193
- help: "The total count of dataflow wallclock lag measurements." ,
194
- var_labels: [ "instance_id" , "replica_id" , "collection_id" ] ,
195
- ) ) ,
177
+ shared,
196
178
}
197
179
}
198
180
@@ -418,44 +400,20 @@ impl ReplicaMetrics {
418
400
collection_id. to_string( ) ,
419
401
] ;
420
402
421
- let labels_with_quantile = |quantile : & str | {
422
- labels
423
- . iter ( )
424
- . cloned ( )
425
- . chain ( [ quantile. to_string ( ) ] )
426
- . collect ( )
427
- } ;
428
-
429
403
let initial_output_duration_seconds = self
430
404
. metrics
431
405
. dataflow_initial_output_duration_seconds
432
406
. get_delete_on_drop_metric ( labels. clone ( ) ) ;
433
407
434
- let wallclock_lag_seconds_min = self
435
- . metrics
436
- . dataflow_wallclock_lag_seconds
437
- . get_delete_on_drop_metric ( labels_with_quantile ( "0" ) ) ;
438
- let wallclock_lag_seconds_max = self
439
- . metrics
440
- . dataflow_wallclock_lag_seconds
441
- . get_delete_on_drop_metric ( labels_with_quantile ( "1" ) ) ;
442
- let wallclock_lag_seconds_sum = self
443
- . metrics
444
- . dataflow_wallclock_lag_seconds_sum
445
- . get_delete_on_drop_metric ( labels. clone ( ) ) ;
446
- let wallclock_lag_seconds_count = self
447
- . metrics
448
- . dataflow_wallclock_lag_seconds_count
449
- . get_delete_on_drop_metric ( labels) ;
450
- let wallclock_lag_minmax = SlidingMinMax :: new ( 60 ) ;
408
+ let wallclock_lag = self . metrics . shared . wallclock_lag_metrics (
409
+ collection_id. to_string ( ) ,
410
+ Some ( self . instance_id . to_string ( ) ) ,
411
+ Some ( self . replica_id . to_string ( ) ) ,
412
+ ) ;
451
413
452
414
Some ( ReplicaCollectionMetrics {
453
415
initial_output_duration_seconds,
454
- wallclock_lag_seconds_min,
455
- wallclock_lag_seconds_max,
456
- wallclock_lag_seconds_sum,
457
- wallclock_lag_seconds_count,
458
- wallclock_lag_minmax,
416
+ wallclock_lag,
459
417
} )
460
418
}
461
419
}
@@ -484,35 +442,8 @@ impl StatsCollector<ProtoComputeCommand, ProtoComputeResponse> for ReplicaMetric
484
442
pub ( crate ) struct ReplicaCollectionMetrics {
485
443
/// Gauge tracking dataflow hydration time.
486
444
pub initial_output_duration_seconds : Gauge ,
487
- /// Gauge tracking minimum dataflow wallclock lag.
488
- wallclock_lag_seconds_min : Gauge ,
489
- /// Gauge tracking maximum dataflow wallclock lag.
490
- wallclock_lag_seconds_max : Gauge ,
491
- /// Counter tracking the total sum of dataflow wallclock lag.
492
- wallclock_lag_seconds_sum : Counter ,
493
- /// Counter tracking the total count of dataflow wallclock lag measurements.
494
- wallclock_lag_seconds_count : IntCounter ,
495
-
496
- /// State maintaining minimum and maximum wallclock lag.
497
- wallclock_lag_minmax : SlidingMinMax < f32 > ,
498
- }
499
-
500
- impl ReplicaCollectionMetrics {
501
- pub fn observe_wallclock_lag ( & mut self , lag : Duration ) {
502
- let lag_secs = lag. as_secs_f32 ( ) ;
503
-
504
- self . wallclock_lag_minmax . add_sample ( lag_secs) ;
505
-
506
- let ( & min, & max) = self
507
- . wallclock_lag_minmax
508
- . get ( )
509
- . expect ( "just added a sample" ) ;
510
-
511
- self . wallclock_lag_seconds_min . set ( min. into ( ) ) ;
512
- self . wallclock_lag_seconds_max . set ( max. into ( ) ) ;
513
- self . wallclock_lag_seconds_sum . inc_by ( lag_secs. into ( ) ) ;
514
- self . wallclock_lag_seconds_count . inc ( ) ;
515
- }
445
+ /// Metrics tracking dataflow wallclock lag.
446
+ pub wallclock_lag : WallclockLagMetrics ,
516
447
}
517
448
518
449
/// Metrics keyed by `ComputeCommand` type.
0 commit comments