From f1d34aeecb6fd30761fd12f745e367ae2e8297ae Mon Sep 17 00:00:00 2001 From: Manish R Jain Date: Tue, 15 Sep 2020 16:41:39 -0700 Subject: [PATCH] fix(Alpha): Immediately take a snapshot if we don't have one (#6458) This PR would cause the Alpha leader to immediately trigger a snapshot creation if it doesn't already have one. This is useful when you bring a new cluster from bulk loader. If you remove an Alpha instance and add a new Alpha instance, the new follower won't get a snapshot if the leader doesn't have one. This PR avoids that by causing the leader to get a snapshot asap. Fixes DGRAPH-2445 (cherry picked from commit 6bd7f8e7d96d48108b4b524bdd8a2ea80fd077a0) --- worker/draft.go | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/worker/draft.go b/worker/draft.go index b200b3fa4b3..06bba941dba 100644 --- a/worker/draft.go +++ b/worker/draft.go @@ -933,12 +933,22 @@ func (n *node) checkpointAndClose(done chan struct{}) { } if n.AmLeader() { - var calculate bool + // If leader doesn't have a snapshot, we should create one immediately. This is very + // useful when you bring up the cluster from bulk loader. If you remove an alpha and + // add a new alpha, the new follower won't get a snapshot if the leader doesn't have + // one. + snap, err := n.Store.Snapshot() + if err != nil { + glog.Errorf("While retrieving snapshot from Store: %v\n", err) + continue + } + calculate := raft.IsEmptySnap(snap) // If no snapshot, then calculate one immediately. + if chk, err := n.Store.Checkpoint(); err == nil { if first, err := n.Store.FirstIndex(); err == nil { // Save some cycles by only calculating snapshot if the checkpoint has gone // quite a bit further than the first index. - calculate = chk >= first+uint64(x.WorkerConfig.SnapshotAfter) + calculate = calculate || chk >= first+uint64(x.WorkerConfig.SnapshotAfter) glog.V(3).Infof("Evaluating snapshot first:%d chk:%d (chk-first:%d) "+ "snapshotAfter:%d snap:%v", first, chk, chk-first, x.WorkerConfig.SnapshotAfter, calculate) @@ -956,7 +966,10 @@ func (n *node) checkpointAndClose(done chan struct{}) { // snapshotting. We just need to do enough, so that we don't have a huge backlog of // entries to process on a restart. if calculate { - if err := n.proposeSnapshot(x.WorkerConfig.SnapshotAfter); err != nil { + // We can set discardN argument to zero, because we already know that calculate + // would be true if either we absolutely needed to calculate the snapshot, + // or our checkpoint already crossed the SnapshotAfter threshold. + if err := n.proposeSnapshot(0); err != nil { glog.Errorf("While calculating and proposing snapshot: %v", err) } }