Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix race with dead server cleanup when adding new raft nodes #20986

Merged
merged 2 commits into from
Jun 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions changelog/20986.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
storage/raft: Fix a race where a newly joining follower could be pruned by dead server cleanup.
```
12 changes: 8 additions & 4 deletions physical/raft/raft_autopilot.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,13 +215,15 @@ func NewFollowerStates() *FollowerStates {
}
}

// Update the peer information in the follower states. Note that this function runs on the active node.
func (s *FollowerStates) Update(req *EchoRequestUpdate) {
// Update the peer information in the follower states. Note that this function
// runs on the active node. Returns true if a new entry was added, as opposed
// to modifying one already present.
func (s *FollowerStates) Update(req *EchoRequestUpdate) bool {
s.l.Lock()
defer s.l.Unlock()

state, ok := s.followers[req.NodeID]
if !ok {
state, present := s.followers[req.NodeID]
if !present {
state = &FollowerState{
IsDead: atomic.NewBool(false),
}
Expand All @@ -236,6 +238,8 @@ func (s *FollowerStates) Update(req *EchoRequestUpdate) {
state.Version = req.SDKVersion
state.UpgradeVersion = req.UpgradeVersion
state.RedundancyZone = req.RedundancyZone

return !present
}

// Clear wipes all the information regarding peers in the follower states.
Expand Down
34 changes: 17 additions & 17 deletions vault/logical_system_raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -248,9 +248,8 @@ func (b *SystemBackend) handleRaftRemovePeerUpdate() framework.OperationFunc {
if err := raftBackend.RemovePeer(ctx, serverID); err != nil {
return nil, err
}
if b.Core.raftFollowerStates != nil {
b.Core.raftFollowerStates.Delete(serverID)
}

b.Core.raftFollowerStates.Delete(serverID)

return nil, nil
}
Expand Down Expand Up @@ -351,29 +350,30 @@ func (b *SystemBackend) handleRaftBootstrapAnswerWrite() framework.OperationFunc
return nil, errors.New("could not decode raft TLS configuration")
}

var desiredSuffrage string
switch nonVoter {
case true:
err = raftBackend.AddNonVotingPeer(ctx, serverID, clusterAddr)
desiredSuffrage = "non-voter"
default:
err = raftBackend.AddPeer(ctx, serverID, clusterAddr)
}
if err != nil {
return nil, err
desiredSuffrage = "voter"
}

var desiredSuffrage string
added := b.Core.raftFollowerStates.Update(&raft.EchoRequestUpdate{
NodeID: serverID,
DesiredSuffrage: desiredSuffrage,
})

switch nonVoter {
case true:
desiredSuffrage = "non-voter"
err = raftBackend.AddNonVotingPeer(ctx, serverID, clusterAddr)
default:
desiredSuffrage = "voter"
err = raftBackend.AddPeer(ctx, serverID, clusterAddr)
}

if b.Core.raftFollowerStates != nil {
b.Core.raftFollowerStates.Update(&raft.EchoRequestUpdate{
NodeID: serverID,
DesiredSuffrage: desiredSuffrage,
})
if err != nil {
if added {
b.Core.raftFollowerStates.Delete(serverID)
}
return nil, err
}

peers, err := raftBackend.Peers(ctx)
Expand Down