Skip to content

Commit b95e1a5

Browse files
committed
kvserver: add decommissioning ranges metric
Introduce the `ranges.decommissioning` gauge metric, which represents the number of ranges with at least one replica on a decommissioning node. The metric is reported by the leaseholder, or if there is no valid leaseholder, the first live replica in the descriptor, similar to (under|over)-replication metrics. The metric can be used to approximately identify the distribution of decommissioning work remaining across nodes, as the leaseholder replica is responsible for triggering the replacement of decommissioning replicas for its own range. Informs: #130085 Release note (ops change): The `ranges.decommissioning` metric is added, representing the number of ranges which have a replica on a decommissioning node.
1 parent 873f392 commit b95e1a5

File tree

6 files changed

+60
-9
lines changed

6 files changed

+60
-9
lines changed

docs/generated/metrics/metrics.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,7 @@
530530
<tr><td>STORAGE</td><td>rangekeybytes</td><td>Number of bytes taken up by range keys (e.g. MVCC range tombstones)</td><td>Storage</td><td>GAUGE</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
531531
<tr><td>STORAGE</td><td>rangekeycount</td><td>Count of all range keys (e.g. MVCC range tombstones)</td><td>Keys</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
532532
<tr><td>STORAGE</td><td>ranges</td><td>Number of ranges</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
533+
<tr><td>STORAGE</td><td>ranges.decommissioning</td><td>Number of ranges with at least one replica on a decommissioning node</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
533534
<tr><td>STORAGE</td><td>ranges.overreplicated</td><td>Number of ranges with more live replicas than the replication target</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
534535
<tr><td>STORAGE</td><td>ranges.unavailable</td><td>Number of ranges with fewer live replicas than needed for quorum</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
535536
<tr><td>STORAGE</td><td>ranges.underreplicated</td><td>Number of ranges with fewer live replicas than the replication target</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>

pkg/kv/kvserver/metrics.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,12 @@ var (
120120
Measurement: "Ranges",
121121
Unit: metric.Unit_COUNT,
122122
}
123+
metaDecommissioningRangeCount = metric.Metadata{
124+
Name: "ranges.decommissioning",
125+
Help: "Number of ranges with at least one replica on a decommissioning node",
126+
Measurement: "Ranges",
127+
Unit: metric.Unit_COUNT,
128+
}
123129

124130
// Lease request metrics.
125131
metaLeaseRequestSuccessCount = metric.Metadata{
@@ -2049,6 +2055,7 @@ type StoreMetrics struct {
20492055
UnavailableRangeCount *metric.Gauge
20502056
UnderReplicatedRangeCount *metric.Gauge
20512057
OverReplicatedRangeCount *metric.Gauge
2058+
DecommissioningRangeCount *metric.Gauge
20522059

20532060
// Lease request metrics for successful and failed lease requests. These
20542061
// count proposals (i.e. it does not matter how many replicas apply the
@@ -2682,6 +2689,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
26822689
UnavailableRangeCount: metric.NewGauge(metaUnavailableRangeCount),
26832690
UnderReplicatedRangeCount: metric.NewGauge(metaUnderReplicatedRangeCount),
26842691
OverReplicatedRangeCount: metric.NewGauge(metaOverReplicatedRangeCount),
2692+
DecommissioningRangeCount: metric.NewGauge(metaDecommissioningRangeCount),
26852693

26862694
// Lease request metrics.
26872695
LeaseRequestSuccessCount: metric.NewCounter(metaLeaseRequestSuccessCount),

pkg/kv/kvserver/replica_metrics.go

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ type ReplicaMetrics struct {
5050
Unavailable bool
5151
Underreplicated bool
5252
Overreplicated bool
53+
Decommissioning bool
5354
RaftLogTooLarge bool
5455
BehindCount int64
5556
PausedFollowerCount int64
@@ -157,7 +158,7 @@ func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics {
157158
}
158159
}
159160

160-
rangeCounter, unavailable, underreplicated, overreplicated := calcRangeCounter(
161+
rangeCounter, unavailable, underreplicated, overreplicated, decommissioning := calcRangeCounter(
161162
d.storeID, d.desc, d.leaseStatus, d.livenessMap, d.conf.GetNumVoters(), d.conf.NumReplicas, d.clusterNodes)
162163

163164
// The raft leader computes the number of raft entries that replicas are
@@ -185,6 +186,7 @@ func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics {
185186
Unavailable: unavailable,
186187
Underreplicated: underreplicated,
187188
Overreplicated: overreplicated,
189+
Decommissioning: decommissioning,
188190
RaftLogTooLarge: d.raftLogSizeTrusted &&
189191
d.raftLogSize > raftLogTooLargeMultiple*d.raftCfg.RaftLogTruncationThreshold,
190192
BehindCount: leaderBehindCount,
@@ -224,7 +226,7 @@ func calcRangeCounter(
224226
livenessMap livenesspb.IsLiveMap,
225227
numVoters, numReplicas int32,
226228
clusterNodes int,
227-
) (rangeCounter, unavailable, underreplicated, overreplicated bool) {
229+
) (rangeCounter, unavailable, underreplicated, overreplicated, decommissioning bool) {
228230
// If there is a live leaseholder (regardless of whether the lease is still
229231
// valid) that leaseholder is responsible for range-level metrics.
230232
if livenessMap[leaseStatus.Lease.Replica.NodeID].IsLive {
@@ -259,6 +261,7 @@ func calcRangeCounter(
259261
} else if neededVoters < liveVoters || neededNonVoters < liveNonVoters {
260262
overreplicated = true
261263
}
264+
decommissioning = calcDecommissioningCount(desc, livenessMap) > 0
262265
}
263266
return
264267
}
@@ -305,6 +308,16 @@ func calcBehindCount(
305308
return behindCount
306309
}
307310

311+
func calcDecommissioningCount(desc *roachpb.RangeDescriptor, livenessMap livenesspb.IsLiveMap) int {
312+
var decommissioningCount int
313+
for _, rd := range desc.Replicas().Descriptors() {
314+
if livenessMap[rd.NodeID].Membership.Decommissioning() {
315+
decommissioningCount++
316+
}
317+
}
318+
return decommissioningCount
319+
}
320+
308321
// LoadStats returns the load statistics for the replica.
309322
func (r *Replica) LoadStats() load.ReplicaLoadStats {
310323
return r.loadStats.Stats()

pkg/kv/kvserver/replica_metrics_test.go

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,18 +55,19 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
5555
}))
5656

5757
{
58-
ctr, down, under, over := calcRangeCounter(1100, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.IsLiveMap{
58+
ctr, down, under, over, decom := calcRangeCounter(1100, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.IsLiveMap{
5959
1000: livenesspb.IsLiveMapEntry{IsLive: true}, // by NodeID
6060
}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */)
6161

6262
require.True(t, ctr)
6363
require.True(t, down)
6464
require.True(t, under)
6565
require.False(t, over)
66+
require.False(t, decom)
6667
}
6768

6869
{
69-
ctr, down, under, over := calcRangeCounter(1000, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.IsLiveMap{
70+
ctr, down, under, over, decom := calcRangeCounter(1000, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.IsLiveMap{
7071
1000: livenesspb.IsLiveMapEntry{IsLive: false},
7172
}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */)
7273

@@ -76,10 +77,11 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
7677
require.False(t, down)
7778
require.False(t, under)
7879
require.False(t, over)
80+
require.False(t, decom)
7981
}
8082

8183
{
82-
ctr, down, under, over := calcRangeCounter(11, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.IsLiveMap{
84+
ctr, down, under, over, decom := calcRangeCounter(11, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.IsLiveMap{
8385
10: livenesspb.IsLiveMapEntry{IsLive: true},
8486
100: livenesspb.IsLiveMapEntry{IsLive: true},
8587
1000: livenesspb.IsLiveMapEntry{IsLive: true},
@@ -90,11 +92,12 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
9092
require.False(t, down)
9193
require.False(t, under)
9294
require.False(t, over)
95+
require.False(t, decom)
9396
}
9497

9598
{
9699
// Single non-voter dead
97-
ctr, down, under, over := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.IsLiveMap{
100+
ctr, down, under, over, decom := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.IsLiveMap{
98101
10: livenesspb.IsLiveMapEntry{IsLive: true},
99102
100: livenesspb.IsLiveMapEntry{IsLive: true},
100103
1000: livenesspb.IsLiveMapEntry{IsLive: false},
@@ -105,11 +108,12 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
105108
require.False(t, down)
106109
require.True(t, under)
107110
require.False(t, over)
111+
require.False(t, decom)
108112
}
109113

110114
{
111115
// All non-voters are dead, but range is not unavailable
112-
ctr, down, under, over := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.IsLiveMap{
116+
ctr, down, under, over, decom := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.IsLiveMap{
113117
10: livenesspb.IsLiveMapEntry{IsLive: true},
114118
100: livenesspb.IsLiveMapEntry{IsLive: false},
115119
1000: livenesspb.IsLiveMapEntry{IsLive: false},
@@ -120,11 +124,12 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
120124
require.False(t, down)
121125
require.True(t, under)
122126
require.False(t, over)
127+
require.False(t, decom)
123128
}
124129

125130
{
126131
// More non-voters than needed
127-
ctr, down, under, over := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.IsLiveMap{
132+
ctr, down, under, over, decom := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.IsLiveMap{
128133
10: livenesspb.IsLiveMapEntry{IsLive: true},
129134
100: livenesspb.IsLiveMapEntry{IsLive: true},
130135
1000: livenesspb.IsLiveMapEntry{IsLive: true},
@@ -135,6 +140,24 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
135140
require.False(t, down)
136141
require.False(t, under)
137142
require.True(t, over)
143+
require.False(t, decom)
144+
}
145+
146+
{
147+
// Decommissioning node.
148+
ctr, down, under, over, decom := calcRangeCounter(11, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.IsLiveMap{
149+
10: livenesspb.IsLiveMapEntry{IsLive: true},
150+
100: livenesspb.IsLiveMapEntry{IsLive: true,
151+
Liveness: livenesspb.Liveness{Membership: livenesspb.MembershipStatus_DECOMMISSIONING}},
152+
1000: livenesspb.IsLiveMapEntry{IsLive: true},
153+
2000: livenesspb.IsLiveMapEntry{IsLive: true},
154+
}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */)
155+
156+
require.True(t, ctr)
157+
require.False(t, down)
158+
require.False(t, under)
159+
require.False(t, over)
160+
require.True(t, decom)
138161
}
139162
}
140163

@@ -242,7 +265,7 @@ func TestCalcRangeCounterLeaseHolder(t *testing.T) {
242265
for _, nodeID := range tc.liveNodes {
243266
livenessMap[nodeID] = livenesspb.IsLiveMapEntry{IsLive: true}
244267
}
245-
ctr, _, _, _ := calcRangeCounter(tc.storeID, rangeDesc, tc.leaseStatus, livenessMap,
268+
ctr, _, _, _, _ := calcRangeCounter(tc.storeID, rangeDesc, tc.leaseStatus, livenessMap,
246269
3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */)
247270
require.Equal(t, tc.expectCounter, ctr)
248271
})

pkg/kv/kvserver/store.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2987,6 +2987,7 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
29872987
unavailableRangeCount int64
29882988
underreplicatedRangeCount int64
29892989
overreplicatedRangeCount int64
2990+
decommissioningRangeCount int64
29902991
behindCount int64
29912992
pausedFollowerCount int64
29922993
ioOverload float64
@@ -3067,6 +3068,9 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
30673068
if metrics.Overreplicated {
30683069
overreplicatedRangeCount++
30693070
}
3071+
if metrics.Decommissioning {
3072+
decommissioningRangeCount++
3073+
}
30703074
}
30713075
pausedFollowerCount += metrics.PausedFollowerCount
30723076
slowRaftProposalCount += metrics.SlowRaftProposalCount
@@ -3128,6 +3132,7 @@ func (s *Store) updateReplicationGauges(ctx context.Context) error {
31283132
s.metrics.UnavailableRangeCount.Update(unavailableRangeCount)
31293133
s.metrics.UnderReplicatedRangeCount.Update(underreplicatedRangeCount)
31303134
s.metrics.OverReplicatedRangeCount.Update(overreplicatedRangeCount)
3135+
s.metrics.DecommissioningRangeCount.Update(decommissioningRangeCount)
31313136
s.metrics.RaftLogFollowerBehindCount.Update(behindCount)
31323137
s.metrics.RaftPausedFollowerCount.Update(pausedFollowerCount)
31333138
s.metrics.IOOverload.Update(ioOverload)

pkg/ts/catalog/chart_catalog.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,7 @@ var charts = []sectionDescription{
600600
"ranges.unavailable",
601601
"ranges.underreplicated",
602602
"ranges.overreplicated",
603+
"ranges.decommissioning",
603604
},
604605
},
605606
{

0 commit comments

Comments
 (0)