Skip to content

Commit

Permalink
Bug Fix: Ensure that followers wait before deleting tablet (#4602)
Browse files Browse the repository at this point in the history
In #4496, we introduced the concept of waiting for latest membership status, before the source Alpha proposes deletion of the tablet (/predicate). However, that fix only applied that check on the source Alpha group's leader. It should have been applied on each member of the Alpha group.

This PR fixes that by making the expected membership checksum part of the `CleanPredicate` proposal. That way, each Alpha can block until they see the latest membership update which says that they no longer serve that tablet, before deleting it. This fixes a bunch of `bank/move-tablet` failures, but we still see very rare `bank/move-tablet` violations.

Another fix this PR contains is to avoid doing posting list splits, which were causing `uid-set/move-tablet` failures, fixing #4538.

Changes:
* Martin's Split code has a bug. Let's not split posting lists.
* Better messaging around turning off split
* Make ExpectedChecksum part of the CleanPredicate proposal, so all followers also block deletion of the predicate until they have seen the latest membership update.

(cherry picked from commit 2540a93)
  • Loading branch information
manishrjain authored and danielmai committed Jan 27, 2020
1 parent 79be69d commit d61eb69
Show file tree
Hide file tree
Showing 6 changed files with 324 additions and 281 deletions.
1 change: 1 addition & 0 deletions protos/pb.proto
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ message Proposal {
OracleDelta delta = 8;
Snapshot snapshot = 9; // Used to tell the group when to take snapshot.
uint64 index = 10; // Used to store Raft index, in raft.Ready.
uint64 expected_checksum = 11; // Block an operation until membership reaches this checksum.
}

message KVS {
Expand Down
559 changes: 298 additions & 261 deletions protos/pb/pb.pb.go

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions worker/draft.go
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,22 @@ func (n *node) applyCommitted(proposal *pb.Proposal) error {

case len(proposal.CleanPredicate) > 0:
n.elog.Printf("Cleaning predicate: %s", proposal.CleanPredicate)
end := time.Now().Add(10 * time.Second)
for proposal.ExpectedChecksum > 0 && time.Now().Before(end) {
cur := atomic.LoadUint64(&groups().membershipChecksum)
if proposal.ExpectedChecksum == cur {
break
}
time.Sleep(100 * time.Millisecond)
glog.Infof("Waiting for checksums to match. Expected: %d. Current: %d\n",
proposal.ExpectedChecksum, cur)
}
if time.Now().After(end) {
glog.Warningf(
"Giving up on predicate deletion: %q due to timeout. Wanted checksum: %d.",
proposal.CleanPredicate, proposal.ExpectedChecksum)
return nil
}
return posting.DeletePredicate(ctx, proposal.CleanPredicate)

case proposal.Delta != nil:
Expand Down
1 change: 1 addition & 0 deletions worker/groups.go
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,7 @@ func (g *groupi) applyState(state *pb.MembershipState) {
// removing a freshly added node.

for _, member := range g.state.GetRemoved() {
// TODO: This leader check can be done once instead of repeatedly.
if member.GetGroupId() == g.Node.gid && g.Node.AmLeader() {
go func() {
// Don't try to remove a member if it's already marked as removed in
Expand Down
25 changes: 6 additions & 19 deletions worker/predicate_move.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@ import (
"fmt"
"io"
"strconv"
"sync/atomic"
"time"

"github.com/golang/glog"
"github.com/pkg/errors"
Expand Down Expand Up @@ -193,25 +191,14 @@ func (w *grpcWorker) MovePredicate(ctx context.Context,
return &emptyPayload, errEmptyPredicate
}

// This loop ensures that we have seen the latest membership update following the move where
// this predicate now belongs to another group. Without this check, this group could end up
// serving transactions asking for this predicate, even after this tablet has been deleted. This
// issue is known to have caused Jepsen failures.
for in.ExpectedChecksum > 0 {
cur := atomic.LoadUint64(&groups().membershipChecksum)
if in.ExpectedChecksum == cur {
break
}
if ctx.Err() != nil {
return &emptyPayload, ctx.Err()
}
glog.Infof("Waiting for checksums to match. Expected: %d. Current: %d\n",
in.ExpectedChecksum, cur)
time.Sleep(time.Second)
}
if in.DestGid == 0 {
glog.Infof("Was instructed to delete tablet: %v", in.Predicate)
p := &pb.Proposal{CleanPredicate: in.Predicate}
// Expected Checksum ensures that all the members of this group would block until they get
// the latest membership status where this predicate now belongs to another group. So they
// know that they are no longer serving this predicate, before they delete it from their
// state. Without this checksum, the members could end up deleting the predicate and then
// serve a request asking for that predicate, causing Jepsen failures.
p := &pb.Proposal{CleanPredicate: in.Predicate, ExpectedChecksum: in.ExpectedChecksum}
return &emptyPayload, groups().Node.proposeAndWait(ctx, p)
}
if err := posting.Oracle().WaitForTs(ctx, in.TxnTs); err != nil {
Expand Down
3 changes: 2 additions & 1 deletion worker/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -796,7 +796,8 @@ func processTask(ctx context.Context, q *pb.Query, gid uint32) (*pb.Result, erro
stop := x.SpanTimer(span, "processTask"+q.Attr)
defer stop()

span.Annotatef(nil, "Waiting for startTs: %d", q.ReadTs)
span.Annotatef(nil, "Waiting for startTs: %d at node: %d, gid: %d",
q.ReadTs, groups().Node.Id, gid)
if err := posting.Oracle().WaitForTs(ctx, q.ReadTs); err != nil {
return &pb.Result{}, err
}
Expand Down

0 comments on commit d61eb69

Please sign in to comment.