Skip to content

Commit

Permalink
raft: don't propose config changes with regressed lead support
Browse files Browse the repository at this point in the history
Fixes #125355.

If the lead support has not caught up from the previous configuration,
we must not propose another configuration change. Doing so would
compromise the lead support promise made by the previous configuration
and used as an expiration of a leader lease. Instead, we wait for the
lead support under the current configuration to catch up to the maximum
lead support reached under the previous config. If the lead support is
never able to catch up, the leader will eventually step down due to
CheckQuorum.

As a reminder, if we reject a config change, it will be reproposed again
shortly, at which point the lead support will likely have caught up.

As a second reminder, we fortify both voters and learners. We also
always add new replicas as learners before promoting them to voters. The
combination here means that a leader will commonly already be supported
a voter when it is promoted added to the lead support computation.

Release note: None
  • Loading branch information
nvanbenschoten committed Oct 16, 2024
1 parent eb02f38 commit 0b6cb2a
Show file tree
Hide file tree
Showing 6 changed files with 638 additions and 3 deletions.
12 changes: 12 additions & 0 deletions pkg/raft/confchange/validate.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ type ValidationContext struct {
CurConfig *quorum.Config
Applied uint64
PendingConfIndex uint64
LeadSupportSafe bool
DisableValidationAgainstCurConfig bool
}

Expand All @@ -35,6 +36,17 @@ func ValidateProp(ctx ValidationContext, cc pb.ConfChangeV2) error {
ctx.PendingConfIndex, ctx.Applied)
}

// If the lead support has not caught up from the previous configuration, we
// must not propose another configuration change. Doing so would compromise
// the lead support promise made by the previous configuration and used as an
// expiration of a leader lease. Instead, we wait for the lead support under
// the current configuration to catch up to the maximum lead support reached
// under the previous config. If the lead support is never able to catch up,
// the leader will eventually step down due to CheckQuorum.
if !ctx.LeadSupportSafe {
return errors.Errorf("lead support has not caught up to previous configuration")
}

// The DisableValidationAgainstCurConfig flag allows disabling config change
// constraints that are guaranteed by the upper state machine layer (incorrect
// ones will apply as no-ops). See raft.Config.DisableConfChangeValidation for
Expand Down
24 changes: 24 additions & 0 deletions pkg/raft/confchange/validate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ func TestValidateProp(t *testing.T) {
CurConfig: configNormal,
Applied: 10,
PendingConfIndex: 5,
LeadSupportSafe: true,
},
cc: changeNormal,
},
Expand All @@ -49,6 +50,7 @@ func TestValidateProp(t *testing.T) {
CurConfig: configNormal,
Applied: 10,
PendingConfIndex: 10,
LeadSupportSafe: true,
},
cc: changeNormal,
},
Expand All @@ -58,16 +60,29 @@ func TestValidateProp(t *testing.T) {
CurConfig: configNormal,
Applied: 10,
PendingConfIndex: 15,
LeadSupportSafe: true,
},
cc: changeNormal,
expErr: "possible unapplied conf change at index 15 \\(applied to 10\\)",
},
{
name: "invalid, unsafe lead support",
ctx: ValidationContext{
CurConfig: configNormal,
Applied: 10,
PendingConfIndex: 5,
LeadSupportSafe: false,
},
cc: changeNormal,
expErr: "lead support has not caught up to previous configuration",
},
{
name: "invalid, already in joint state",
ctx: ValidationContext{
CurConfig: configJoint,
Applied: 10,
PendingConfIndex: 5,
LeadSupportSafe: true,
},
cc: changeNormal,
expErr: "must transition out of joint config first",
Expand All @@ -78,6 +93,7 @@ func TestValidateProp(t *testing.T) {
CurConfig: configNormal,
Applied: 10,
PendingConfIndex: 5,
LeadSupportSafe: true,
},
cc: changeLeaveJoint,
expErr: "not in joint state; refusing empty conf change",
Expand All @@ -88,6 +104,7 @@ func TestValidateProp(t *testing.T) {
CurConfig: configJoint,
Applied: 10,
PendingConfIndex: 5,
LeadSupportSafe: true,
DisableValidationAgainstCurConfig: true,
},
cc: changeNormal,
Expand All @@ -98,6 +115,7 @@ func TestValidateProp(t *testing.T) {
CurConfig: configNormal,
Applied: 10,
PendingConfIndex: 5,
LeadSupportSafe: true,
DisableValidationAgainstCurConfig: true,
},
cc: changeLeaveJoint,
Expand All @@ -108,6 +126,7 @@ func TestValidateProp(t *testing.T) {
CurConfig: configNormal,
Applied: 10,
PendingConfIndex: 5,
LeadSupportSafe: true,
},
cc: pb.ConfChangeV2{Changes: []pb.ConfChangeSingle{
{Type: pb.ConfChangeRemoveNode, NodeID: 3},
Expand All @@ -121,6 +140,7 @@ func TestValidateProp(t *testing.T) {
CurConfig: configNormal,
Applied: 10,
PendingConfIndex: 5,
LeadSupportSafe: true,
},
cc: pb.ConfChangeV2{Changes: []pb.ConfChangeSingle{
{Type: pb.ConfChangeRemoveNode, NodeID: 3},
Expand All @@ -133,6 +153,7 @@ func TestValidateProp(t *testing.T) {
CurConfig: configNormal,
Applied: 10,
PendingConfIndex: 5,
LeadSupportSafe: true,
},
cc: pb.ConfChangeV2{Changes: []pb.ConfChangeSingle{
{Type: pb.ConfChangeRemoveNode, NodeID: 3},
Expand All @@ -146,6 +167,7 @@ func TestValidateProp(t *testing.T) {
CurConfig: configNormal,
Applied: 10,
PendingConfIndex: 5,
LeadSupportSafe: true,
},
cc: pb.ConfChangeV2{Changes: []pb.ConfChangeSingle{
{Type: pb.ConfChangeAddLearnerNode, NodeID: 3},
Expand All @@ -159,6 +181,7 @@ func TestValidateProp(t *testing.T) {
CurConfig: configNormal,
Applied: 10,
PendingConfIndex: 5,
LeadSupportSafe: true,
DisableValidationAgainstCurConfig: true,
},
cc: pb.ConfChangeV2{Changes: []pb.ConfChangeSingle{
Expand All @@ -172,6 +195,7 @@ func TestValidateProp(t *testing.T) {
CurConfig: configNormal,
Applied: 10,
PendingConfIndex: 5,
LeadSupportSafe: true,
DisableValidationAgainstCurConfig: true,
},
cc: pb.ConfChangeV2{Changes: []pb.ConfChangeSingle{
Expand Down
1 change: 1 addition & 0 deletions pkg/raft/raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -1634,6 +1634,7 @@ func stepLeader(r *raft, m pb.Message) error {
CurConfig: &r.config,
Applied: r.raftLog.applied,
PendingConfIndex: r.pendingConfIndex,
LeadSupportSafe: r.fortificationTracker.ConfigChangeSafe(),
DisableValidationAgainstCurConfig: r.disableConfChangeValidation,
}
if err := confchange.ValidateProp(ccCtx, cc.AsV2()); err != nil {
Expand Down
Loading

0 comments on commit 0b6cb2a

Please sign in to comment.