Skip to content

Commit

Permalink
raft: clean up bootstrap
Browse files Browse the repository at this point in the history
This is the first (maybe not last) step in cleaning up the bootstrap
code around StartNode.

Initializing a Raft group for the first time is awkward, since a
configuration has to be pulled from thin air. The way this is solved
today is unclean: The app is supposed to pass peers to StartNode(),
we add configuration changes for them to the log, immediately pretend
that they are applied, but actually leave them unapplied (to give the
app a chance to observe them, though if the app did decide to not apply
them things would really go off the rails), and then return control to
the app. The app will then process the initial Readys and as a result
the configuration will be persisted to disk; restarts of the node then
use RestartNode which doesn't take any peers.

The code that did this lived awkwardly in two places fairly deep down
the callstack, though it was really only necessary in StartNode(). This
commit refactors things to make this more obvious: only StartNode does
this dance now. In particular, RawNode does not support this at all any
more; it expects the app to set up its Storage correctly.

Future work may provide helpers to make this "preseeding" of the Storage
more user-friendly. It isn't entirely straightforward to do so since
the Storage interface doesn't provide the right accessors for this
purpose. Briefly speaking, we want to make sure that a non-bootstrapped
node can never catch up via the log so that we can implicitly use one
of the "skipped" log entries to represent the configuration change into
the bootstrap configuration. This is an invasive change that affects
all consumers of raft, and it is of lower urgency since the code (post
this commit) already encapsulates the complexity sufficiently.
  • Loading branch information
tbg committed Jul 18, 2019
1 parent 6a9d89d commit 069125b
Show file tree
Hide file tree
Showing 6 changed files with 245 additions and 180 deletions.
6 changes: 5 additions & 1 deletion etcdserver/raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -479,7 +479,11 @@ func startNode(cfg ServerConfig, cl *membership.RaftCluster, ids []types.ID) (id
}
}

n = raft.StartNode(c, peers)
if len(peers) == 0 {
n = raft.RestartNode(c)
} else {
n = raft.StartNode(c, peers)
}
raftStatusMu.Lock()
raftStatus = n.Status
raftStatusMu.Unlock()
Expand Down
62 changes: 60 additions & 2 deletions raft/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,12 +197,63 @@ type Peer struct {

// StartNode returns a new Node given configuration and a list of raft peers.
// It appends a ConfChangeAddNode entry for each given peer to the initial log.
//
// Peers must not be zero length; call RestartNode in that case.
func StartNode(c *Config, peers []Peer) Node {
rn, err := NewRawNode(c, peers)
if len(peers) == 0 {
panic("no peers given; use RestartNode instead")
}
rn, err := NewRawNode(c)
if err != nil {
panic(err)
}

lastIndex, err := rn.raft.raftLog.storage.LastIndex()
if err != nil {
panic(err)
}

if lastIndex != 0 {
panic("can't StartNode on a nonempty Storage")
}

// We've faked out initial entries above, but nothing has been
// persisted. Start with an empty HardState (thus the first Ready will
// emit a HardState update for the app to persist).
rn.prevHardSt = emptyState

// TODO(tbg): remove StartNode and give the application the right tools to
// bootstrap the initial membership in a cleaner way.
rn.raft.becomeFollower(1, None)
ents := make([]pb.Entry, len(peers))
for i, peer := range peers {
cc := pb.ConfChange{Type: pb.ConfChangeAddNode, NodeID: peer.ID, Context: peer.Context}
data, err := cc.Marshal()
if err != nil {
panic(err)
}

ents[i] = pb.Entry{Type: pb.EntryConfChange, Term: 1, Index: uint64(i + 1), Data: data}
}
rn.raft.raftLog.append(ents...)

// Now apply them, mainly so that the application can call Campaign
// immediately after StartNode in tests. Note that these nodes will
// be added to raft twice: here and when the application's Ready
// loop calls ApplyConfChange. The calls to addNode must come after
// all calls to raftLog.append so progress.next is set after these
// bootstrapping entries (it is an error if we try to append these
// entries since they have already been committed).
// We do not set raftLog.applied so the application will be able
// to observe all conf changes via Ready.CommittedEntries.
//
// TODO(bdarnell): These entries are still unstable; do we need to preserve
// the invariant that committed < unstable?
rn.raft.raftLog.committed = uint64(len(ents))
for _, peer := range peers {
rn.raft.applyConfChange(pb.ConfChange{NodeID: peer.ID, Type: pb.ConfChangeAddNode})
}

n := newNode()
n.logger = c.Logger

Expand All @@ -215,7 +266,14 @@ func StartNode(c *Config, peers []Peer) Node {
// If the caller has an existing state machine, pass in the last log index that
// has been applied to it; otherwise use zero.
func RestartNode(c *Config) Node {
return StartNode(c, nil)
rn, err := NewRawNode(c)
if err != nil {
panic(err)
}
n := newNode()
n.logger = c.Logger
go n.run(rn)
return &n
}

type msgWithResult struct {
Expand Down
6 changes: 3 additions & 3 deletions raft/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -910,7 +910,7 @@ func TestCommitPagination(t *testing.T) {
s := NewMemoryStorage()
cfg := newTestConfig(1, []uint64{1}, 10, 1, s)
cfg.MaxCommittedSizePerReady = 2048
rn, err := NewRawNode(cfg, nil)
rn, err := NewRawNode(cfg)
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -1002,7 +1002,7 @@ func TestNodeCommitPaginationAfterRestart(t *testing.T) {
// this and *will* return it (which is how the Commit index ended up being 10 initially).
cfg.MaxSizePerMsg = size - uint64(s.ents[len(s.ents)-1].Size()) - 1

rn, err := NewRawNode(cfg, nil)
rn, err := NewRawNode(cfg)
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -1032,7 +1032,7 @@ func TestNodeBoundedLogGrowthWithPartition(t *testing.T) {
s := NewMemoryStorage()
cfg := newTestConfig(1, []uint64{1}, 10, 1, s)
cfg.MaxUncommittedEntriesSize = maxEntrySize
rn, err := NewRawNode(cfg, nil)
rn, err := NewRawNode(cfg)
if err != nil {
t.Fatal(err)
}
Expand Down
4 changes: 3 additions & 1 deletion raft/raft_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4271,9 +4271,11 @@ func newTestLearnerRaft(id uint64, peers []uint64, learners []uint64, election,
return newRaft(cfg)
}

// newTestRawNode sets up a RawNode with the given peers. The configuration will
// not be reflected in the Storage.
func newTestRawNode(id uint64, peers []uint64, election, heartbeat int, storage Storage) *RawNode {
cfg := newTestConfig(id, peers, election, heartbeat, storage)
rn, err := NewRawNode(cfg, nil)
rn, err := NewRawNode(cfg)
if err != nil {
panic(err)
}
Expand Down
62 changes: 4 additions & 58 deletions raft/rawnode.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,72 +37,18 @@ type RawNode struct {
prevHardSt pb.HardState
}

// NewRawNode returns a new RawNode given configuration and a list of raft peers.
func NewRawNode(config *Config, peers []Peer) (*RawNode, error) {
// NewRawNode instantiates a RawNode from the given configuration.
func NewRawNode(config *Config) (*RawNode, error) {
if config.ID == 0 {
panic("config.ID must not be zero")
}
r := newRaft(config)
rn := &RawNode{
raft: r,
}
if err := rn.init(peers); err != nil {
return nil, err
}
return rn, nil
}

func (rn *RawNode) init(peers []Peer) error {
r := rn.raft
lastIndex, err := rn.raft.raftLog.storage.LastIndex()
if err != nil {
return err
}
// If the log is empty, this is a new RawNode (like StartNode); otherwise it's
// restoring an existing RawNode (like RestartNode).
// TODO(bdarnell): rethink RawNode initialization and whether the application needs
// to be able to tell us when it expects the RawNode to exist.
if lastIndex == 0 {
rn.raft.becomeFollower(1, None)
ents := make([]pb.Entry, len(peers))
for i, peer := range peers {
cc := pb.ConfChange{Type: pb.ConfChangeAddNode, NodeID: peer.ID, Context: peer.Context}
data, err := cc.Marshal()
if err != nil {
return err
}

ents[i] = pb.Entry{Type: pb.EntryConfChange, Term: 1, Index: uint64(i + 1), Data: data}
}
rn.raft.raftLog.append(ents...)

// Now apply them, mainly so that the application can call Campaign
// immediately after StartNode in tests. Note that these nodes will
// be added to raft twice: here and when the application's Ready
// loop calls ApplyConfChange. The calls to addNode must come after
// all calls to raftLog.append so progress.next is set after these
// bootstrapping entries (it is an error if we try to append these
// entries since they have already been committed).
// We do not set raftLog.applied so the application will be able
// to observe all conf changes via Ready.CommittedEntries.
//
// TODO(bdarnell): These entries are still unstable; do we need to preserve
// the invariant that committed < unstable?
r.raftLog.committed = uint64(len(ents))
for _, peer := range peers {
r.applyConfChange(pb.ConfChange{NodeID: peer.ID, Type: pb.ConfChangeAddNode})
}
}

// Set the initial hard and soft states after performing all initialization.
rn.prevSoftSt = r.softState()
if lastIndex == 0 {
rn.prevHardSt = emptyState
} else {
rn.prevHardSt = r.hardState()
}

return nil
rn.prevHardSt = r.hardState()
return rn, nil
}

// Tick advances the internal logical clock by a single tick.
Expand Down
Loading

0 comments on commit 069125b

Please sign in to comment.