Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 30 additions & 3 deletions go/vt/vtctl/reparentutil/emergency_reparenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,17 +351,44 @@ func (erp *EmergencyReparenter) waitForAllRelayLogsToApply(
groupCtx, groupCancel := context.WithTimeout(ctx, opts.WaitReplicasTimeout)
defer groupCancel()

waiterCount := 0

for candidate := range validCandidates {
// When we called StopReplicationAndBuildStatusMaps, we got back two
// maps: (1) the StopReplicationStatus of any replicas that actually
// stopped replication; and (2) the MasterStatus of anything that
// returned ErrNotReplica, which is a tablet that is either the current
// primary or is stuck thinking it is a MASTER but is not in actuality.
//
// If we have a tablet in the validCandidates map that does not appear
// in the statusMap, then we have either (a) the current primary, which
// is not replicating, so it is not applying relay logs; or (b) a tablet
// that is stuck thinking it is MASTER but is not in actuality. In that
// second case - (b) - we will most likely find that the stuck MASTER
// does not have a winning position, and fail the ERS. If, on the other
// hand, it does have a winning position, we are trusting the operator
// to know what they are doing by emergency-reparenting onto that
// tablet. In either case, it does not make sense to wait for relay logs
// to apply on a tablet that was never applying relay logs in the first
// place, so we skip it, and log that we did.
status, ok := statusMap[candidate]
if !ok {
erp.logger.Infof("EmergencyReparent candidate %v not in replica status map; this means it was not running replication (because it was formerly MASTER), so skipping WaitForRelayLogsToApply step for this candidate", candidate)
continue
}

go func(alias string) {
var err error
defer func() { errCh <- err }()
err = WaitForRelayLogsToApply(groupCtx, erp.tmc, tabletMap[alias], statusMap[alias])
err = WaitForRelayLogsToApply(groupCtx, erp.tmc, tabletMap[alias], status)
}(candidate)

waiterCount++
}

errgroup := concurrency.ErrorGroup{
NumGoroutines: len(validCandidates),
NumRequiredSuccesses: len(validCandidates),
NumGoroutines: waiterCount,
NumRequiredSuccesses: waiterCount,
NumAllowedErrors: 0,
}
rec := errgroup.Wait(groupCancel, errCh)
Expand Down
105 changes: 105 additions & 0 deletions go/vt/vtctl/reparentutil/emergency_reparenter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,111 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) {
},
shouldErr: false,
},
{
name: "success with existing primary",
ts: memorytopo.NewServer("zone1"),
tmc: &emergencyReparenterTestTMClient{
DemoteMasterResults: map[string]struct {
Status *replicationdatapb.MasterStatus
Error error
}{
"zone1-0000000100": {
Status: &replicationdatapb.MasterStatus{
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21",
},
},
},
PopulateReparentJournalResults: map[string]error{
"zone1-0000000102": nil,
},
PromoteReplicaResults: map[string]struct {
Result string
Error error
}{
"zone1-0000000102": {
Result: "ok",
Error: nil,
},
},
SetMasterResults: map[string]error{
"zone1-0000000100": nil,
"zone1-0000000101": nil,
},
StopReplicationAndGetStatusResults: map[string]struct {
Status *replicationdatapb.Status
StopStatus *replicationdatapb.StopReplicationStatus
Error error
}{
"zone1-0000000100": { // This tablet claims MASTER, so is not running replication.
Error: mysql.ErrNotReplica,
},
"zone1-0000000101": {
StopStatus: &replicationdatapb.StopReplicationStatus{
Before: &replicationdatapb.Status{},
After: &replicationdatapb.Status{
MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562",
RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21",
},
},
},
"zone1-0000000102": {
StopStatus: &replicationdatapb.StopReplicationStatus{
Before: &replicationdatapb.Status{},
After: &replicationdatapb.Status{
MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562",
RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-26",
},
},
},
},
WaitForPositionResults: map[string]map[string]error{
"zone1-0000000101": {
"MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21": nil,
},
"zone1-0000000102": {
"MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-26": nil,
},
},
},
shards: []*vtctldatapb.Shard{
{
Keyspace: "testkeyspace",
Name: "-",
},
},
tablets: []*topodatapb.Tablet{
{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
Keyspace: "testkeyspace",
Shard: "-",
Type: topodatapb.TabletType_MASTER,
},
{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
Keyspace: "testkeyspace",
Shard: "-",
},
{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 102,
},
Keyspace: "testkeyspace",
Shard: "-",
Hostname: "most up-to-date position, wins election",
},
},
keyspace: "testkeyspace",
shard: "-",
opts: EmergencyReparentOptions{},
shouldErr: false,
},
{
name: "shard not found",
ts: memorytopo.NewServer("zone1"),
Expand Down