Skip to content

Commit e338e08

Browse files
committed
kvserver: delete per action priority inversion metrics
This commit removes the per-action priority inversion metrics due to their high cardinality. We already have logging in place, which should provide sufficient observability. For now, what we care about most is priority inversion that leads to the consider-rebalance action and to requeuing.
1 parent be668d7 commit e338e08

File tree

3 files changed

+6
-276
lines changed

3 files changed

+6
-276
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 0 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -13919,118 +13919,6 @@ layers:
1391913919
unit: COUNT
1392013920
aggregation: AVG
1392113921
derivative: NONE
13922-
- name: queue.replicate.priority_inversion.addnonvoter
13923-
exported_name: queue_replicate_priority_inversion_addnonvoter
13924-
description: Number of priority inversions in the replicate queue that resulted in add non-voter action during processing
13925-
y_axis_label: Replicas
13926-
type: COUNTER
13927-
unit: COUNT
13928-
aggregation: AVG
13929-
derivative: NON_NEGATIVE_DERIVATIVE
13930-
- name: queue.replicate.priority_inversion.addvoter
13931-
exported_name: queue_replicate_priority_inversion_addvoter
13932-
description: Number of priority inversions in the replicate queue that resulted in add voter action during processing
13933-
y_axis_label: Replicas
13934-
type: COUNTER
13935-
unit: COUNT
13936-
aggregation: AVG
13937-
derivative: NON_NEGATIVE_DERIVATIVE
13938-
- name: queue.replicate.priority_inversion.considerrebalance
13939-
exported_name: queue_replicate_priority_inversion_considerrebalance
13940-
description: Number of priority inversions in the replicate queue that resulted in consider rebalance action during processing
13941-
y_axis_label: Replicas
13942-
type: COUNTER
13943-
unit: COUNT
13944-
aggregation: AVG
13945-
derivative: NON_NEGATIVE_DERIVATIVE
13946-
- name: queue.replicate.priority_inversion.noop
13947-
exported_name: queue_replicate_priority_inversion_noop
13948-
description: Number of priority inversions in the replicate queue that resulted in noop action during processing
13949-
y_axis_label: Replicas
13950-
type: COUNTER
13951-
unit: COUNT
13952-
aggregation: AVG
13953-
derivative: NON_NEGATIVE_DERIVATIVE
13954-
- name: queue.replicate.priority_inversion.rangeunavailable
13955-
exported_name: queue_replicate_priority_inversion_rangeunavailable
13956-
description: Number of priority inversions in the replicate queue that resulted in range unavailable action during processing
13957-
y_axis_label: Replicas
13958-
type: COUNTER
13959-
unit: COUNT
13960-
aggregation: AVG
13961-
derivative: NON_NEGATIVE_DERIVATIVE
13962-
- name: queue.replicate.priority_inversion.removedeadnonvoter
13963-
exported_name: queue_replicate_priority_inversion_removedeadnonvoter
13964-
description: Number of priority inversions in the replicate queue that resulted in remove dead non-voter action during processing
13965-
y_axis_label: Replicas
13966-
type: COUNTER
13967-
unit: COUNT
13968-
aggregation: AVG
13969-
derivative: NON_NEGATIVE_DERIVATIVE
13970-
- name: queue.replicate.priority_inversion.removedeadvoter
13971-
exported_name: queue_replicate_priority_inversion_removedeadvoter
13972-
description: Number of priority inversions in the replicate queue that resulted in remove dead voter action during processing
13973-
y_axis_label: Replicas
13974-
type: COUNTER
13975-
unit: COUNT
13976-
aggregation: AVG
13977-
derivative: NON_NEGATIVE_DERIVATIVE
13978-
- name: queue.replicate.priority_inversion.removedecommissioningnonvoter
13979-
exported_name: queue_replicate_priority_inversion_removedecommissioningnonvoter
13980-
description: Number of priority inversions in the replicate queue that resulted in remove decommissioning non-voter action during processing
13981-
y_axis_label: Replicas
13982-
type: COUNTER
13983-
unit: COUNT
13984-
aggregation: AVG
13985-
derivative: NON_NEGATIVE_DERIVATIVE
13986-
- name: queue.replicate.priority_inversion.removedecommissioningvoter
13987-
exported_name: queue_replicate_priority_inversion_removedecommissioningvoter
13988-
description: Number of priority inversions in the replicate queue that resulted in remove decommissioning voter action during processing
13989-
y_axis_label: Replicas
13990-
type: COUNTER
13991-
unit: COUNT
13992-
aggregation: AVG
13993-
derivative: NON_NEGATIVE_DERIVATIVE
13994-
- name: queue.replicate.priority_inversion.removenonvoter
13995-
exported_name: queue_replicate_priority_inversion_removenonvoter
13996-
description: Number of priority inversions in the replicate queue that resulted in remove non-voter action during processing
13997-
y_axis_label: Replicas
13998-
type: COUNTER
13999-
unit: COUNT
14000-
aggregation: AVG
14001-
derivative: NON_NEGATIVE_DERIVATIVE
14002-
- name: queue.replicate.priority_inversion.removevoter
14003-
exported_name: queue_replicate_priority_inversion_removevoter
14004-
description: Number of priority inversions in the replicate queue that resulted in remove voter action during processing
14005-
y_axis_label: Replicas
14006-
type: COUNTER
14007-
unit: COUNT
14008-
aggregation: AVG
14009-
derivative: NON_NEGATIVE_DERIVATIVE
14010-
- name: queue.replicate.priority_inversion.replacedeadnonvoter
14011-
exported_name: queue_replicate_priority_inversion_replacedeadnonvoter
14012-
description: Number of priority inversions in the replicate queue that resulted in replace dead non-voter action during processing
14013-
y_axis_label: Replicas
14014-
type: COUNTER
14015-
unit: COUNT
14016-
aggregation: AVG
14017-
derivative: NON_NEGATIVE_DERIVATIVE
14018-
- name: queue.replicate.priority_inversion.replacedecommissioningnonvoter
14019-
exported_name: queue_replicate_priority_inversion_replacedecommissioningnonvoter
14020-
description: Number of priority inversions in the replicate queue that resulted in replace decommissioning non-voter action during processing
14021-
y_axis_label: Replicas
14022-
type: COUNTER
14023-
unit: COUNT
14024-
aggregation: AVG
14025-
derivative: NON_NEGATIVE_DERIVATIVE
14026-
- name: queue.replicate.priority_inversion.replacedecommissioningvoter
14027-
exported_name: queue_replicate_priority_inversion_replacedecommissioningvoter
14028-
description: Number of priority inversions in the replicate queue that resulted in replace decommissioning voter action during processing
14029-
y_axis_label: Replicas
14030-
type: COUNTER
14031-
unit: COUNT
14032-
aggregation: AVG
14033-
derivative: NON_NEGATIVE_DERIVATIVE
1403413922
- name: queue.replicate.priority_inversion.requeue
1403513923
exported_name: queue_replicate_priority_inversion_requeue
1403613924
description: Number of priority inversions in the replicate queue that resulted in requeuing of the replicas. A priority inversion occurs when the priority at processing time ends up being lower than at enqueue time. When the priority has changed from a high priority repair action to rebalance, the change is requeued to avoid unfairness.

pkg/kv/kvserver/replicate_queue.go

Lines changed: 5 additions & 161 deletions
Original file line numberDiff line numberDiff line change
@@ -339,90 +339,6 @@ var (
339339
Measurement: "Replicas",
340340
Unit: metric.Unit_COUNT,
341341
}
342-
metaReplicateQueuePriorityInversionForAddVoterCount = metric.Metadata{
343-
Name: "queue.replicate.priority_inversion.addvoter",
344-
Help: "Number of priority inversions in the replicate queue that resulted in add voter action during processing",
345-
Measurement: "Replicas",
346-
Unit: metric.Unit_COUNT,
347-
}
348-
metaReplicateQueuePriorityInversionForReplaceDecommissioningVoterCount = metric.Metadata{
349-
Name: "queue.replicate.priority_inversion.replacedecommissioningvoter",
350-
Help: "Number of priority inversions in the replicate queue that resulted in replace decommissioning voter action during processing",
351-
Measurement: "Replicas",
352-
Unit: metric.Unit_COUNT,
353-
}
354-
metaReplicateQueuePriorityInversionForRemoveDeadVoterCount = metric.Metadata{
355-
Name: "queue.replicate.priority_inversion.removedeadvoter",
356-
Help: "Number of priority inversions in the replicate queue that resulted in remove dead voter action during processing",
357-
Measurement: "Replicas",
358-
Unit: metric.Unit_COUNT,
359-
}
360-
metaReplicateQueuePriorityInversionForRemoveDecommissioningVoterCount = metric.Metadata{
361-
Name: "queue.replicate.priority_inversion.removedecommissioningvoter",
362-
Help: "Number of priority inversions in the replicate queue that resulted in remove decommissioning voter action during processing",
363-
Measurement: "Replicas",
364-
Unit: metric.Unit_COUNT,
365-
}
366-
metaReplicateQueuePriorityInversionForRemoveVoterCount = metric.Metadata{
367-
Name: "queue.replicate.priority_inversion.removevoter",
368-
Help: "Number of priority inversions in the replicate queue that resulted in remove voter action during processing",
369-
Measurement: "Replicas",
370-
Unit: metric.Unit_COUNT,
371-
}
372-
metaReplicateQueuePriorityInversionForReplaceDeadNonVoterCount = metric.Metadata{
373-
Name: "queue.replicate.priority_inversion.replacedeadnonvoter",
374-
Help: "Number of priority inversions in the replicate queue that resulted in replace dead non-voter action during processing",
375-
Measurement: "Replicas",
376-
Unit: metric.Unit_COUNT,
377-
}
378-
metaReplicateQueuePriorityInversionForAddNonVoterCount = metric.Metadata{
379-
Name: "queue.replicate.priority_inversion.addnonvoter",
380-
Help: "Number of priority inversions in the replicate queue that resulted in add non-voter action during processing",
381-
Measurement: "Replicas",
382-
Unit: metric.Unit_COUNT,
383-
}
384-
metaReplicateQueuePriorityInversionForReplaceDecommissioningNonVoterCount = metric.Metadata{
385-
Name: "queue.replicate.priority_inversion.replacedecommissioningnonvoter",
386-
Help: "Number of priority inversions in the replicate queue that resulted in replace decommissioning non-voter action during processing",
387-
Measurement: "Replicas",
388-
Unit: metric.Unit_COUNT,
389-
}
390-
metaReplicateQueuePriorityInversionForRemoveDeadNonVoterCount = metric.Metadata{
391-
Name: "queue.replicate.priority_inversion.removedeadnonvoter",
392-
Help: "Number of priority inversions in the replicate queue that resulted in remove dead non-voter action during processing",
393-
Measurement: "Replicas",
394-
Unit: metric.Unit_COUNT,
395-
}
396-
metaReplicateQueuePriorityInversionForRemoveDecommissioningNonVoterCount = metric.Metadata{
397-
Name: "queue.replicate.priority_inversion.removedecommissioningnonvoter",
398-
Help: "Number of priority inversions in the replicate queue that resulted in remove decommissioning non-voter action during processing",
399-
Measurement: "Replicas",
400-
Unit: metric.Unit_COUNT,
401-
}
402-
metaReplicateQueuePriorityInversionForRemoveNonVoterCount = metric.Metadata{
403-
Name: "queue.replicate.priority_inversion.removenonvoter",
404-
Help: "Number of priority inversions in the replicate queue that resulted in remove non-voter action during processing",
405-
Measurement: "Replicas",
406-
Unit: metric.Unit_COUNT,
407-
}
408-
metaReplicateQueuePriorityInversionForConsiderRebalance = metric.Metadata{
409-
Name: "queue.replicate.priority_inversion.considerrebalance",
410-
Help: "Number of priority inversions in the replicate queue that resulted in consider rebalance action during processing",
411-
Measurement: "Replicas",
412-
Unit: metric.Unit_COUNT,
413-
}
414-
metaReplicateQueuePriorityInversionForRangeUnavailable = metric.Metadata{
415-
Name: "queue.replicate.priority_inversion.rangeunavailable",
416-
Help: "Number of priority inversions in the replicate queue that resulted in range unavailable action during processing",
417-
Measurement: "Replicas",
418-
Unit: metric.Unit_COUNT,
419-
}
420-
metaReplicateQueuePriorityInversionForNoop = metric.Metadata{
421-
Name: "queue.replicate.priority_inversion.noop",
422-
Help: "Number of priority inversions in the replicate queue that resulted in noop action during processing",
423-
Measurement: "Replicas",
424-
Unit: metric.Unit_COUNT,
425-
}
426342
)
427343

428344
// quorumError indicates a retryable error condition which sends replicas being
@@ -484,26 +400,9 @@ type ReplicateQueueMetrics struct {
484400
// AllocatorConsiderRebalance, and AllocatorFinalizeAtomicReplicationChange
485401
// allocator actions.
486402

487-
// Priority Inversion. Not tracked for
488-
// AllocatorFinalizeAtomicReplicationChange, AllocatorRemoveLearner,
489-
// AllocatorReplaceDeadVoter since they are the highest priority actions and
490-
// cannot be inverted. (17 total actions-3=14)
491-
RequeueDueToPriorityInversion *metric.Counter
492-
PriorityInversionTotal *metric.Counter
493-
PriorityInversionForAddVoterCount *metric.Counter
494-
PriorityInversionForReplaceDecommissioningVoterCount *metric.Counter
495-
PriorityInversionForRemoveDeadVoterCount *metric.Counter
496-
PriorityInversionForRemoveDecommissioningVoterCount *metric.Counter
497-
PriorityInversionForRemoveVoterCount *metric.Counter
498-
PriorityInversionForReplaceDeadNonVoterCount *metric.Counter
499-
PriorityInversionForAddNonVoterCount *metric.Counter
500-
PriorityInversionForReplaceDecommissioningNonVoterCount *metric.Counter
501-
PriorityInversionForRemoveDeadNonVoterCount *metric.Counter
502-
PriorityInversionForRemoveDecommissioningNonVoterCount *metric.Counter
503-
PriorityInversionForRemoveNonVoterCount *metric.Counter
504-
PriorityInversionForConsiderRebalance *metric.Counter
505-
PriorityInversionForRangeUnavailable *metric.Counter
506-
PriorityInversionForNoop *metric.Counter
403+
// Priority Inversion.
404+
RequeueDueToPriorityInversion *metric.Counter
405+
PriorityInversionTotal *metric.Counter
507406
}
508407

509408
func makeReplicateQueueMetrics() ReplicateQueueMetrics {
@@ -541,22 +440,8 @@ func makeReplicateQueueMetrics() ReplicateQueueMetrics {
541440
RemoveDecommissioningReplicaSuccessCount: metric.NewCounter(metaReplicateQueueRemoveDecommissioningReplicaSuccessCount),
542441
RemoveDecommissioningReplicaErrorCount: metric.NewCounter(metaReplicateQueueRemoveDecommissioningReplicaErrorCount),
543442

544-
RequeueDueToPriorityInversion: metric.NewCounter(metaReplicateQueueRequeueDueToPriorityInversion),
545-
PriorityInversionTotal: metric.NewCounter(metaReplicateQueuePriorityInversionTotal),
546-
PriorityInversionForAddVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForAddVoterCount),
547-
PriorityInversionForReplaceDecommissioningVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForReplaceDecommissioningVoterCount),
548-
PriorityInversionForRemoveDeadVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveDeadVoterCount),
549-
PriorityInversionForRemoveDecommissioningVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveDecommissioningVoterCount),
550-
PriorityInversionForRemoveVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveVoterCount),
551-
PriorityInversionForReplaceDeadNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForReplaceDeadNonVoterCount),
552-
PriorityInversionForAddNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForAddNonVoterCount),
553-
PriorityInversionForReplaceDecommissioningNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForReplaceDecommissioningNonVoterCount),
554-
PriorityInversionForRemoveDeadNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveDeadNonVoterCount),
555-
PriorityInversionForRemoveDecommissioningNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveDecommissioningNonVoterCount),
556-
PriorityInversionForRemoveNonVoterCount: metric.NewCounter(metaReplicateQueuePriorityInversionForRemoveNonVoterCount),
557-
PriorityInversionForConsiderRebalance: metric.NewCounter(metaReplicateQueuePriorityInversionForConsiderRebalance),
558-
PriorityInversionForRangeUnavailable: metric.NewCounter(metaReplicateQueuePriorityInversionForRangeUnavailable),
559-
PriorityInversionForNoop: metric.NewCounter(metaReplicateQueuePriorityInversionForNoop),
443+
RequeueDueToPriorityInversion: metric.NewCounter(metaReplicateQueueRequeueDueToPriorityInversion),
444+
PriorityInversionTotal: metric.NewCounter(metaReplicateQueuePriorityInversionTotal),
560445
}
561446
}
562447

@@ -680,47 +565,6 @@ func (metrics *ReplicateQueueMetrics) trackErrorByAllocatorAction(
680565

681566
}
682567

683-
// trackPriorityInversion tracks the action that the replicate queue ended up
684-
// processing when the priority at enqueue time was higher than the priority at
685-
// processing time.
686-
func (metrics *ReplicateQueueMetrics) trackPriorityInversion(
687-
actionAtProcessingTime allocatorimpl.AllocatorAction,
688-
) {
689-
metrics.PriorityInversionTotal.Inc(1)
690-
switch actionAtProcessingTime {
691-
case allocatorimpl.AllocatorAddVoter:
692-
metrics.PriorityInversionForAddVoterCount.Inc(1)
693-
case allocatorimpl.AllocatorReplaceDecommissioningVoter:
694-
metrics.PriorityInversionForReplaceDecommissioningVoterCount.Inc(1)
695-
case allocatorimpl.AllocatorRemoveDeadVoter:
696-
metrics.PriorityInversionForRemoveDeadVoterCount.Inc(1)
697-
case allocatorimpl.AllocatorRemoveDecommissioningVoter:
698-
metrics.PriorityInversionForRemoveDecommissioningVoterCount.Inc(1)
699-
case allocatorimpl.AllocatorRemoveVoter:
700-
metrics.PriorityInversionForRemoveVoterCount.Inc(1)
701-
case allocatorimpl.AllocatorReplaceDeadNonVoter:
702-
metrics.PriorityInversionForReplaceDeadNonVoterCount.Inc(1)
703-
case allocatorimpl.AllocatorAddNonVoter:
704-
metrics.PriorityInversionForAddNonVoterCount.Inc(1)
705-
case allocatorimpl.AllocatorReplaceDecommissioningNonVoter:
706-
metrics.PriorityInversionForReplaceDecommissioningNonVoterCount.Inc(1)
707-
case allocatorimpl.AllocatorRemoveDeadNonVoter:
708-
metrics.PriorityInversionForRemoveDeadNonVoterCount.Inc(1)
709-
case allocatorimpl.AllocatorRemoveDecommissioningNonVoter:
710-
metrics.PriorityInversionForRemoveDecommissioningNonVoterCount.Inc(1)
711-
case allocatorimpl.AllocatorRemoveNonVoter:
712-
metrics.PriorityInversionForRemoveNonVoterCount.Inc(1)
713-
case allocatorimpl.AllocatorConsiderRebalance:
714-
metrics.PriorityInversionForConsiderRebalance.Inc(1)
715-
case allocatorimpl.AllocatorRangeUnavailable:
716-
metrics.PriorityInversionForRangeUnavailable.Inc(1)
717-
case allocatorimpl.AllocatorNoop:
718-
metrics.PriorityInversionForNoop.Inc(1)
719-
default:
720-
panic("unhandled default case")
721-
}
722-
}
723-
724568
// trackProcessResult increases the corresponding success/error count metric for
725569
// processing a particular allocator action through the replicate queue.
726570
func (metrics *ReplicateQueueMetrics) trackResultByAllocatorAction(

pkg/kv/kvserver/replicate_queue_test.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2561,6 +2561,7 @@ func TestReplicateQueueDecommissionScannerDisabled(t *testing.T) {
25612561
func TestPriorityInversionRequeue(t *testing.T) {
25622562
defer leaktest.AfterTest(t)()
25632563
defer log.Scope(t).Close(t)
2564+
skip.UnderDuress(t)
25642565

25652566
ctx := context.Background()
25662567
settings := cluster.MakeTestingClusterSettings()
@@ -2640,9 +2641,6 @@ func TestPriorityInversionRequeue(t *testing.T) {
26402641
if c := store.ReplicateQueueMetrics().PriorityInversionTotal.Count(); c == 0 {
26412642
return errors.New("expected non-zero priority inversion total count but got 0")
26422643
}
2643-
if c := store.ReplicateQueueMetrics().PriorityInversionForConsiderRebalance.Count(); c == 0 {
2644-
return errors.New("expected non-zero priority inversion count for consider rebalance but got 0")
2645-
}
26462644
if c := store.ReplicateQueueMetrics().RequeueDueToPriorityInversion.Count(); c == 0 {
26472645
return errors.New("expected to requeue due to priority inversion but got 0")
26482646
}

0 commit comments

Comments
 (0)