
Commit b175bad

craig[bot] and kvoli committed
Merge #130117
130117: kvserver: enqueue decom ranges at an interval behind a setting r=arulajmani a=kvoli

Introduce the `ranges.decommissioning` gauge metric, which represents the number of ranges with at least one replica on a decommissioning node. The metric is reported by the leaseholder, or, if there is no valid leaseholder, by the first live replica in the descriptor, similar to the (under|over)-replication metrics. The metric can be used to approximately identify the distribution of remaining decommissioning work across nodes, as the leaseholder replica is responsible for triggering the replacement of decommissioning replicas for its own range.

Informs: #130085

Release note (ops change): The `ranges.decommissioning` metric is added, representing the number of ranges which have a replica on a decommissioning node.

---

When `kv.enqueue_in_replicate_queue_on_problem.interval` is set to a positive non-zero value, leaseholder replicas of ranges which are underreplicated or have a replica on a decommissioning store will be enqueued into the replicate queue at most once per interval. When the setting is 0, no such enqueueing takes place outside of the regular replica scanner. For users enabling the enqueueing (any non-zero value), a recommended interval is 15 minutes, e.g.:

```
SET CLUSTER SETTING kv.enqueue_in_replicate_queue_on_problem.interval='900s'
```

---

Resolves: #130085

Release note (ops change): The `ranges.decommissioning` metric is added, representing the number of ranges which have a replica on a decommissioning node.

Co-authored-by: Austen McClernon <[email protected]>
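
The replica.go hunk further down carries the actual implementation of this enqueueing; as a rough, self-contained illustration of the throttling pattern it relies on (an atomically stored last-enqueue timestamp advanced via CompareAndSwap, so a racing caller simply skips), here is a minimal Go sketch. The names are hypothetical, and the interval would normally come from the cluster setting rather than a constant:

```
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// problemRangeThrottle gates how often a "problem" range may be re-enqueued.
// All names here are illustrative, not the kvserver API.
type problemRangeThrottle struct {
	lastEnqueue atomic.Value // holds a time.Time
}

// shouldEnqueue reports whether the range may be enqueued now. An interval of
// 0 disables the behavior, mirroring the cluster setting's semantics. On
// success the stored timestamp is advanced with CompareAndSwap, so a
// concurrent caller that loses the race skips the enqueue.
func (p *problemRangeThrottle) shouldEnqueue(now time.Time, interval time.Duration) bool {
	if interval == 0 {
		return false // setting disabled
	}
	last := p.lastEnqueue.Load().(time.Time)
	if last.Add(interval).After(now) {
		return false // enqueued too recently
	}
	return p.lastEnqueue.CompareAndSwap(last, now)
}

func main() {
	var p problemRangeThrottle
	start := time.Now()
	p.lastEnqueue.Store(start)

	interval := 15 * time.Minute // the recommended '900s'
	fmt.Println(p.shouldEnqueue(start.Add(1*time.Minute), interval))  // false: too soon
	fmt.Println(p.shouldEnqueue(start.Add(16*time.Minute), interval)) // true: interval elapsed
	fmt.Println(p.shouldEnqueue(start.Add(17*time.Minute), interval)) // false: just enqueued
}
```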
2 parents 080e478 + 637cd5d commit b175bad

File tree

9 files changed: +209 -11 lines changed

docs/generated/metrics/metrics.html

Lines changed: 1 addition & 0 deletions
@@ -567,6 +567,7 @@
 <tr><td>STORAGE</td><td>rangekeybytes</td><td>Number of bytes taken up by range keys (e.g. MVCC range tombstones)</td><td>Storage</td><td>GAUGE</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>rangekeycount</td><td>Count of all range keys (e.g. MVCC range tombstones)</td><td>Keys</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>ranges</td><td>Number of ranges</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>ranges.decommissioning</td><td>Number of ranges with at least one replica on a decommissioning node</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>ranges.overreplicated</td><td>Number of ranges with more live replicas than the replication target</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>ranges.unavailable</td><td>Number of ranges with fewer live replicas than needed for quorum</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>ranges.underreplicated</td><td>Number of ranges with fewer live replicas than the replication target</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>

pkg/kv/kvserver/metrics.go

Lines changed: 8 additions & 0 deletions
@@ -120,6 +120,12 @@ var (
 		Measurement: "Ranges",
 		Unit: metric.Unit_COUNT,
 	}
+	metaDecommissioningRangeCount = metric.Metadata{
+		Name: "ranges.decommissioning",
+		Help: "Number of ranges with at least one replica on a decommissioning node",
+		Measurement: "Ranges",
+		Unit: metric.Unit_COUNT,
+	}

 	// Lease request metrics.
 	metaLeaseRequestSuccessCount = metric.Metadata{

@@ -2566,6 +2572,7 @@ type StoreMetrics struct {
 	UnavailableRangeCount *metric.Gauge
 	UnderReplicatedRangeCount *metric.Gauge
 	OverReplicatedRangeCount *metric.Gauge
+	DecommissioningRangeCount *metric.Gauge

 	// Lease request metrics for successful and failed lease requests. These
 	// count proposals (i.e. it does not matter how many replicas apply the

@@ -3263,6 +3270,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
 		UnavailableRangeCount: metric.NewGauge(metaUnavailableRangeCount),
 		UnderReplicatedRangeCount: metric.NewGauge(metaUnderReplicatedRangeCount),
 		OverReplicatedRangeCount: metric.NewGauge(metaOverReplicatedRangeCount),
+		DecommissioningRangeCount: metric.NewGauge(metaDecommissioningRangeCount),

 		// Lease request metrics.
 		LeaseRequestSuccessCount: metric.NewCounter(metaLeaseRequestSuccessCount),
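
The hunks above add the store-level gauge; the aggregation site that feeds it is not part of this diff. Presumably the store's periodic metrics update counts replicas whose ReplicaMetrics report Decommissioning (added in replica_metrics.go below) and sets the gauge, in the same way the under-/over-replicated gauges are maintained. A minimal sketch of that tally, with simplified stand-in types rather than the real kvserver ones:

```
package main

import "fmt"

// replicaMetrics is a trimmed-down stand-in for the per-replica metrics
// struct; only the flags relevant to this tally are kept.
type replicaMetrics struct {
	Underreplicated bool
	Overreplicated  bool
	Decommissioning bool
}

// countDecommissioningRanges returns the value a store would report via the
// ranges.decommissioning gauge: one per range whose metrics-reporting replica
// saw at least one replica on a decommissioning node.
func countDecommissioningRanges(all []replicaMetrics) (n int64) {
	for _, m := range all {
		if m.Decommissioning {
			n++
		}
	}
	return n
}

func main() {
	perRange := []replicaMetrics{
		{Decommissioning: true},
		{Underreplicated: true},
		{Underreplicated: true, Decommissioning: true},
	}
	fmt.Println(countDecommissioningRanges(perRange)) // 2
}
```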

pkg/kv/kvserver/replica.go

Lines changed: 61 additions & 0 deletions
@@ -24,6 +24,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/abortspan"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator"
+	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/allocatorimpl"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/plan"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency"

@@ -927,6 +928,12 @@ type Replica struct {
 	// changes for a range on the leaseholder.
 	allocatorToken *plan.AllocatorToken

+	// lastProblemRangeReplicateEnqueueTime is the last time this replica was
+	// eagerly enqueued into the replicate queue due to being underreplicated
+	// or having a decommissioning replica. This is used to throttle enqueue
+	// attempts.
+	lastProblemRangeReplicateEnqueueTime atomic.Value
+
 	// unreachablesMu contains a set of remote ReplicaIDs that are to be reported
 	// as unreachable on the next raft tick.
 	unreachablesMu struct {

@@ -2557,3 +2564,57 @@ func racV2EnabledWhenLeaderLevel(
 	// TODO(sumeer): implement fully, once all the dependencies are implemented.
 	return replica_rac2.NotEnabledWhenLeader
 }
+
+// maybeEnqueueProblemRange will enqueue the replica for processing into the
+// replicate queue iff:
+//
+// - The replica is the holder of a valid lease.
+// - EnqueueProblemRangeInReplicateQueueInterval is enabled (set to a
+//   non-zero value).
+// - The last time the replica was enqueued is longer than
+//   EnqueueProblemRangeInReplicateQueueInterval ago.
+//
+// The replica is enqueued at a decommissioning priority. Note that by default,
+// this behavior is disabled (zero interval). Also note that this method should
+// NOT be called unless the range is known to require action, e.g.,
+// decommissioning|underreplicated.
+//
+// NOTE: This method is motivated by a bug where decommissioning stalls because
+// a decommissioning range is not enqueued in the replicate queue in a timely
+// manner via the replica scanner, see #130199. This functionality is disabled
+// by default for this reason.
+func (r *Replica) maybeEnqueueProblemRange(
+	ctx context.Context, now time.Time, leaseValid, isLeaseholder bool,
+) {
+	// The method expects the caller to provide whether the lease is valid and
+	// the replica is the leaseholder for the range, so that it can avoid
+	// unnecessary work. We expect this method to be called in the context of
+	// updating metrics.
+	if !isLeaseholder || !leaseValid {
+		// The replicate queue will not process the replica without a valid lease.
+		// Nothing to do.
+		return
+	}
+
+	interval := EnqueueProblemRangeInReplicateQueueInterval.Get(&r.store.cfg.Settings.SV)
+	if interval == 0 {
+		// The setting is disabled.
+		return
+	}
+	lastTime := r.lastProblemRangeReplicateEnqueueTime.Load().(time.Time)
+	if lastTime.Add(interval).After(now) {
+		// The last time the replica was enqueued is less than the interval ago,
+		// nothing to do.
+		return
+	}
+	// The replica is the leaseholder for a range which requires action and it
+	// has been longer than EnqueueProblemRangeInReplicateQueueInterval since the
+	// last time it was enqueued. Try to swap the last time with now. We don't
+	// expect a race, however if the value changed underneath us we won't enqueue
+	// the replica as we lost the race.
+	if !r.lastProblemRangeReplicateEnqueueTime.CompareAndSwap(lastTime, now) {
+		return
+	}
+	r.store.replicateQueue.AddAsync(ctx, r,
+		allocatorimpl.AllocatorReplaceDecommissioningVoter.Priority())
+}

pkg/kv/kvserver/replica_init.go

Lines changed: 1 addition & 0 deletions
@@ -184,6 +184,7 @@ func newUninitializedReplicaWithoutRaftGroup(
 			store.rebalanceObjManager.Objective().ToSplitObjective(),
 		)
 	}
+	r.lastProblemRangeReplicateEnqueueTime.Store(store.Clock().PhysicalTime())

 	// NB: the state will be loaded when the replica gets initialized.
 	r.mu.state = uninitState
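
Seeding the atomic here matters because maybeEnqueueProblemRange (added in replica.go above) type-asserts the loaded value to time.Time; a never-stored atomic.Value yields nil and the assertion would panic. A small standalone demonstration of that Go behavior (not CockroachDB code):

```
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

func main() {
	var v atomic.Value // analogous to lastProblemRangeReplicateEnqueueTime before seeding

	func() {
		defer func() { fmt.Println("recovered:", recover()) }()
		_ = v.Load().(time.Time) // Load returns nil, so this assertion panics
	}()

	// Storing a real timestamp up front, as replica_init.go does with the store
	// clock's physical time, makes later Load().(time.Time) calls safe.
	v.Store(time.Now())
	fmt.Println(v.Load().(time.Time).IsZero()) // false
}
```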

pkg/kv/kvserver/replica_metrics.go

Lines changed: 17 additions & 2 deletions
@@ -51,6 +51,7 @@ type ReplicaMetrics struct {
 	Unavailable bool
 	Underreplicated bool
 	Overreplicated bool
+	Decommissioning bool
 	RaftLogTooLarge bool
 	RangeTooLarge bool
 	BehindCount int64

@@ -172,7 +173,7 @@ func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics {
 		rangeTooLargeMultiple = 2
 	)
 	largeRangeThreshold := rangeTooLargeMultiple * d.conf.RangeMaxBytes
-	rangeCounter, unavailable, underreplicated, overreplicated, tooLarge := calcRangeCounter(
+	rangeCounter, unavailable, underreplicated, overreplicated, tooLarge, decommissioning := calcRangeCounter(
 		d.storeID, d.desc, d.leaseStatus, d.vitalityMap, d.conf.GetNumVoters(), d.conf.NumReplicas,
 		d.clusterNodes, largeRangeThreshold, d.rangeSize)

@@ -200,6 +201,7 @@ func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics {
 		Unavailable: unavailable,
 		Underreplicated: underreplicated,
 		Overreplicated: overreplicated,
+		Decommissioning: decommissioning,
 		RaftLogTooLarge: d.raftLogSizeTrusted &&
 			d.raftLogSize > raftLogTooLargeMultiple*d.raftCfg.RaftLogTruncationThreshold,
 		RangeTooLarge: tooLarge,

@@ -243,7 +245,7 @@ func calcRangeCounter(
 	numVoters, numReplicas int32,
 	clusterNodes int,
 	rangeTooLargeThreshold, rangeSize int64,
-) (rangeCounter, unavailable, underreplicated, overreplicated, tooLarge bool) {
+) (rangeCounter, unavailable, underreplicated, overreplicated, tooLarge, decommissioning bool) {
 	// If there is a live leaseholder (regardless of whether the lease is still
 	// valid) that leaseholder is responsible for range-level metrics.
 	if vitalityMap[leaseStatus.Lease.Replica.NodeID].IsLive(livenesspb.Metrics) {

@@ -279,6 +281,7 @@ func calcRangeCounter(
 			overreplicated = true
 		}
 		tooLarge = rangeSize > rangeTooLargeThreshold
+		decommissioning = calcDecommissioningCount(desc, vitalityMap) > 0
 	}
 	return
 }

@@ -333,6 +336,18 @@ func calcBehindCount(
 	return behindCount
 }

+func calcDecommissioningCount(
+	desc *roachpb.RangeDescriptor, vitalityMap livenesspb.NodeVitalityMap,
+) int {
+	var decommissioningCount int
+	for _, rd := range desc.Replicas().Descriptors() {
+		if vitalityMap[rd.NodeID].IsDecommissioning() {
+			decommissioningCount++
+		}
+	}
+	return decommissioningCount
+}
+
 // LoadStats returns the load statistics for the replica.
 func (r *Replica) LoadStats() load.ReplicaLoadStats {
 	return r.loadStats.Stats()
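
The calcRangeCounter context above covers the first half of the reporting rule (a live leaseholder's store reports range-level metrics); per the commit message, when there is no live leaseholder the store holding the first live replica in the descriptor reports instead. A simplified, self-contained sketch of that rule, using made-up types rather than the real kvserver signatures:

```
package main

import "fmt"

type replica struct {
	NodeID  int32
	StoreID int32
}

// reportsRangeMetrics sketches the responsibility rule: the leaseholder's
// store reports if the leaseholder's node is live; otherwise the store of the
// first live replica in the descriptor reports.
func reportsRangeMetrics(storeID int32, replicas []replica, leaseholder replica, liveNodes map[int32]bool) bool {
	if liveNodes[leaseholder.NodeID] {
		return storeID == leaseholder.StoreID
	}
	for _, r := range replicas {
		if liveNodes[r.NodeID] {
			return storeID == r.StoreID // first live replica reports
		}
	}
	return false
}

func main() {
	replicas := []replica{{NodeID: 1, StoreID: 10}, {NodeID: 2, StoreID: 20}, {NodeID: 3, StoreID: 30}}
	leaseholder := replicas[0]
	live := map[int32]bool{1: false, 2: true, 3: true} // leaseholder's node is down

	fmt.Println(reportsRangeMetrics(20, replicas, leaseholder, live)) // true: first live replica
	fmt.Println(reportsRangeMetrics(30, replicas, leaseholder, live)) // false
}
```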

pkg/kv/kvserver/replica_metrics_test.go

Lines changed: 29 additions & 9 deletions
@@ -55,18 +55,19 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
 	}))

 	{
-		ctr, down, under, over, _ := calcRangeCounter(1100, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
+		ctr, down, under, over, _, decom := calcRangeCounter(1100, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
 			1000: livenesspb.FakeNodeVitality(true), // by NodeID
 		}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 0 /* rangeTooLargeThreshold */, 0 /* rangeSize */)

 		require.True(t, ctr)
 		require.True(t, down)
 		require.True(t, under)
 		require.False(t, over)
+		require.False(t, decom)
 	}

 	{
-		ctr, down, under, over, _ := calcRangeCounter(1000, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
+		ctr, down, under, over, _, decom := calcRangeCounter(1000, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
 			1000: livenesspb.FakeNodeVitality(false),
 		}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 0 /* rangeTooLargeThreshold */, 0 /* rangeSize */)

@@ -76,10 +77,11 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
 		require.False(t, down)
 		require.False(t, under)
 		require.False(t, over)
+		require.False(t, decom)
 	}

 	{
-		ctr, down, under, over, _ := calcRangeCounter(11, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
+		ctr, down, under, over, _, decom := calcRangeCounter(11, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
 			10: livenesspb.FakeNodeVitality(true),
 			100: livenesspb.FakeNodeVitality(true),
 			1000: livenesspb.FakeNodeVitality(true),

@@ -90,11 +92,12 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
 		require.False(t, down)
 		require.False(t, under)
 		require.False(t, over)
+		require.False(t, decom)
 	}

 	{
 		// Single non-voter dead
-		ctr, down, under, over, _ := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{
+		ctr, down, under, over, _, decom := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{
 			10: livenesspb.FakeNodeVitality(true),
 			100: livenesspb.FakeNodeVitality(true),
 			1000: livenesspb.FakeNodeVitality(false),

@@ -105,11 +108,12 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
 		require.False(t, down)
 		require.True(t, under)
 		require.False(t, over)
+		require.False(t, decom)
 	}

 	{
 		// All non-voters are dead, but range is not unavailable
-		ctr, down, under, over, _ := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{
+		ctr, down, under, over, _, decom := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{
 			10: livenesspb.FakeNodeVitality(true),
 			100: livenesspb.FakeNodeVitality(false),
 			1000: livenesspb.FakeNodeVitality(false),

@@ -120,11 +124,12 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
 		require.False(t, down)
 		require.True(t, under)
 		require.False(t, over)
+		require.False(t, decom)
 	}

 	{
 		// More non-voters than needed
-		ctr, down, under, over, _ := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{
+		ctr, down, under, over, _, decom := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{
 			10: livenesspb.FakeNodeVitality(true),
 			100: livenesspb.FakeNodeVitality(true),
 			1000: livenesspb.FakeNodeVitality(true),

@@ -135,11 +140,12 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
 		require.False(t, down)
 		require.False(t, under)
 		require.True(t, over)
+		require.False(t, decom)
 	}

 	{
 		// Range larger than the threshold.
-		ctr, _, _, _, large := calcRangeCounter(1100, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
+		ctr, _, _, _, large, _ := calcRangeCounter(1100, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
 			1000: livenesspb.FakeNodeVitality(true), // by NodeID
 		}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 1000 /* rangeTooLargeThreshold */, 2000 /* rangeSize */)

@@ -148,14 +154,28 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
 	}

 	{
-		ctr, _, _, _, large := calcRangeCounter(1000, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
+		ctr, _, _, _, large, _ := calcRangeCounter(1000, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
 			1000: livenesspb.FakeNodeVitality(false),
 		}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 1000 /* rangeTooLargeThreshold */, 2000 /* rangeSize */)
 		require.False(t, ctr)
 		// Only the node responsible for the range can report if the range is too
 		// large.
 		require.False(t, large)
 	}
+
+	{
+		// Decommissioning node.
+		vitality := livenesspb.TestCreateNodeVitality(10, 100, 1000, 2000)
+		vitality.Decommissioning(100, true /* alive */)
+		ctr, down, under, over, _, decom := calcRangeCounter(11, threeVotersAndSingleNonVoter, leaseStatus, vitality.ScanNodeVitalityFromCache(),
+			3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 0 /* rangeTooLargeThreshold */, 0 /* rangeSize */)
+
+		require.True(t, ctr)
+		require.False(t, down)
+		require.False(t, under)
+		require.False(t, over)
+		require.True(t, decom)
+	}
 }

 func TestCalcRangeCounterLeaseHolder(t *testing.T) {

@@ -262,7 +282,7 @@ func TestCalcRangeCounterLeaseHolder(t *testing.T) {
 			for _, nodeID := range tc.liveNodes {
 				livenessMap[nodeID] = livenesspb.FakeNodeVitality(true)
 			}
-			ctr, _, _, _, _ := calcRangeCounter(tc.storeID, rangeDesc, tc.leaseStatus, livenessMap,
+			ctr, _, _, _, _, _ := calcRangeCounter(tc.storeID, rangeDesc, tc.leaseStatus, livenessMap,
 				3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 0 /* rangeTooLargeThreshold */, 0 /* rangeSize */)
 			require.Equal(t, tc.expectCounter, ctr)
 		})

pkg/kv/kvserver/replicate_queue.go

Lines changed: 16 additions & 0 deletions
@@ -89,6 +89,22 @@ var EnqueueInReplicateQueueOnSpanConfigUpdateEnabled = settings.RegisterBoolSett
 	true,
 )

+// EnqueueProblemRangeInReplicateQueueInterval controls the interval at which
+// problem ranges are enqueued into the replicate queue for processing, outside
+// of the normal scanner interval. A problem range is one which is
+// underreplicated or has a replica on a decommissioning store. The setting is
+// disabled when set to 0. By default, the setting is disabled.
+var EnqueueProblemRangeInReplicateQueueInterval = settings.RegisterDurationSetting(
+	settings.SystemOnly,
+	"kv.enqueue_in_replicate_queue_on_problem.interval",
+	"interval at which problem ranges are enqueued into the replicate queue for "+
+		"processing, outside of the normal scanner interval; a problem range is "+
+		"one which is underreplicated or has a replica on a decommissioning store, "+
+		"disabled when set to 0",
+	0,
+	settings.NonNegativeDuration,
+)
+
 var (
 	metaReplicateQueueAddReplicaCount = metric.Metadata{
 		Name: "queue.replicate.addreplica",
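
For completeness, the setting can also be flipped programmatically in tests. This is a sketch under the assumption that the usual cluster-settings Override pattern applies to this DurationSetting; it mirrors the SET CLUSTER SETTING statement from the commit message and is not part of this diff:

```
package kvserver_test

import (
	"context"
	"time"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
)

// enableProblemRangeEnqueueing overrides the new setting to the recommended 15
// minutes on a testing Settings object.
func enableProblemRangeEnqueueing(ctx context.Context) *cluster.Settings {
	st := cluster.MakeTestingClusterSettings()
	kvserver.EnqueueProblemRangeInReplicateQueueInterval.Override(ctx, &st.SV, 15*time.Minute)
	return st
}
```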
