From c5bbcc7865d9edf481ad7911904caa561db2f47b Mon Sep 17 00:00:00 2001 From: Andrew Mason Date: Fri, 19 Feb 2021 16:34:11 -0500 Subject: [PATCH 1/4] ERS should remove currently-serving shard primary from consideration This was in the old implementation, and was overlooked in the port to an encapsulated struct. I've added tests as penance. Signed-off-by: Andrew Mason --- .../reparentutil/emergency_reparenter.go | 42 +++ .../reparentutil/emergency_reparenter_test.go | 263 ++++++++++++++++++ 2 files changed, 305 insertions(+) diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter.go b/go/vt/vtctl/reparentutil/emergency_reparenter.go index 1ad55715985..5616e8d10fa 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter.go @@ -30,6 +30,7 @@ import ( "vitess.io/vitess/go/vt/logutil" "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topo/topoproto" + "vitess.io/vitess/go/vt/topotools" "vitess.io/vitess/go/vt/topotools/events" "vitess.io/vitess/go/vt/vterrors" "vitess.io/vitess/go/vt/vttablet/tmclient" @@ -277,6 +278,47 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve return vterrors.Wrapf(err, "failed to get tablet map for %v/%v: %v", keyspace, shard, err) } + if opts.NewPrimaryAlias != nil { + primaryElectAliasStr := topoproto.TabletAliasString(opts.NewPrimaryAlias) + primaryElectTabletInfo, ok := tabletMap[primaryElectAliasStr] + if !ok { + return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v is not in the shard", primaryElectAliasStr) + } + + ev.NewMaster = *primaryElectTabletInfo.Tablet + + if topoproto.TabletAliasEqual(shardInfo.MasterAlias, opts.NewPrimaryAlias) { + return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v is already the shard primary", primaryElectAliasStr) + } + } + + if shardInfo.HasMaster() { + deleteOldPrimary := true + shardPrimaryAliasStr := topoproto.TabletAliasString(shardInfo.MasterAlias) + shardPrimaryTabletInfo, ok := tabletMap[shardPrimaryAliasStr] + if ok { + delete(tabletMap, shardPrimaryAliasStr) + } else { + shardPrimaryTabletInfo, err = erp.ts.GetTablet(ctx, shardInfo.MasterAlias) + if err != nil { + erp.logger.Warningf("cannot read old primary tablet %v, won't touch it: %v", shardPrimaryAliasStr, err) + deleteOldPrimary = false + } + } + + if deleteOldPrimary { + ev.OldMaster = *shardPrimaryTabletInfo.Tablet + erp.logger.Infof("deleting old primary", shardPrimaryAliasStr) + + ctx, cancel := context.WithTimeout(ctx, opts.WaitReplicasTimeout) + defer cancel() + + if err := topotools.DeleteTablet(ctx, erp.ts, shardPrimaryTabletInfo.Tablet); err != nil { + erp.logger.Warningf("failed to delete old primary tablet %v: %v", shardPrimaryAliasStr, err) + } + } + } + statusMap, primaryStatusMap, err := StopReplicationAndBuildStatusMaps(ctx, erp.tmc, ev, tabletMap, opts.WaitReplicasTimeout, opts.IgnoreReplicas, erp.logger) if err != nil { return vterrors.Wrapf(err, "failed to stop replication and build status maps: %v", err) diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go index 95ba0b9e885..40154989e87 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go @@ -37,6 +37,7 @@ import ( replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata" topodatapb "vitess.io/vitess/go/vt/proto/topodata" vtctldatapb "vitess.io/vitess/go/vt/proto/vtctldata" + "vitess.io/vitess/go/vt/proto/vttime" ) func TestNewEmergencyReparenter(t *testing.T) { @@ -338,6 +339,268 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) { }, shouldErr: false, }, + { + name: "requested primary-elect is already shard primary", + ts: memorytopo.NewServer("zone1"), + shards: []*vtctldatapb.Shard{ + { + Keyspace: "testkeyspace", + Name: "-", + Shard: &topodatapb.Shard{ + IsMasterServing: true, + MasterAlias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + MasterTermStartTime: &vttime.Time{ + Seconds: 100, + }, + }, + }, + }, + tablets: []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Keyspace: "testkeyspace", + Shard: "-", + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + Keyspace: "testkeyspace", + Shard: "-", + Type: topodatapb.TabletType_MASTER, + MasterTermStartTime: &vttime.Time{ + Seconds: 100, + }, + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 102, + }, + Keyspace: "testkeyspace", + Shard: "-", + }, + }, + keyspace: "testkeyspace", + shard: "-", + opts: EmergencyReparentOptions{ + NewPrimaryAlias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + }, + shouldErr: true, + }, + { + name: "shard primary is not in tablet map and ignored", + ts: memorytopo.NewServer("zone1"), + tmc: &emergencyReparenterTestTMClient{ + PopulateReparentJournalResults: map[string]error{ + "zone1-0000000102": nil, + }, + PromoteReplicaResults: map[string]struct { + Result string + Error error + }{ + "zone1-0000000102": { + Result: "ok", + Error: nil, + }, + }, + SetMasterResults: map[string]error{ + "zone1-0000000100": nil, + "zone1-0000000101": nil, + }, + StopReplicationAndGetStatusResults: map[string]struct { + Status *replicationdatapb.Status + StopStatus *replicationdatapb.StopReplicationStatus + Error error + }{ + "zone1-0000000100": { + StopStatus: &replicationdatapb.StopReplicationStatus{ + Before: &replicationdatapb.Status{}, + After: &replicationdatapb.Status{ + MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562", + RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21", + }, + }, + }, + "zone1-0000000101": { + StopStatus: &replicationdatapb.StopReplicationStatus{ + Before: &replicationdatapb.Status{}, + After: &replicationdatapb.Status{ + MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562", + RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21", + }, + }, + }, + "zone1-0000000102": { + StopStatus: &replicationdatapb.StopReplicationStatus{ + Before: &replicationdatapb.Status{}, + After: &replicationdatapb.Status{ + MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562", + RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-26", + }, + }, + }, + }, + WaitForPositionResults: map[string]map[string]error{ + "zone1-0000000100": { + "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21": nil, + }, + "zone1-0000000101": { + "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21": nil, + }, + "zone1-0000000102": { + "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-26": nil, + }, + }, + }, + shards: []*vtctldatapb.Shard{ + { + Keyspace: "testkeyspace", + Name: "-", + Shard: &topodatapb.Shard{ + IsMasterServing: true, + MasterAlias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 404, + }, + MasterTermStartTime: &vttime.Time{ + Seconds: 100, + }, + }, + }, + }, + tablets: []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Keyspace: "testkeyspace", + Shard: "-", + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + Keyspace: "testkeyspace", + Shard: "-", + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 102, + }, + Keyspace: "testkeyspace", + Shard: "-", + }, + }, + keyspace: "testkeyspace", + shard: "-", + opts: EmergencyReparentOptions{}, + shouldErr: false, + }, + { + // Here, all our (2) tablets are tied, but the current primary is + // ignored, and we promote zone1-101. + name: "shard primary is deleted from tablet map", + ts: memorytopo.NewServer("zone1"), + tmc: &emergencyReparenterTestTMClient{ + PopulateReparentJournalResults: map[string]error{ + "zone1-0000000101": nil, + }, + PromoteReplicaResults: map[string]struct { + Result string + Error error + }{ + "zone1-0000000101": { + Result: "ok", + Error: nil, + }, + }, + SetMasterResults: map[string]error{ + "zone1-0000000100": nil, + }, + StopReplicationAndGetStatusResults: map[string]struct { + Status *replicationdatapb.Status + StopStatus *replicationdatapb.StopReplicationStatus + Error error + }{ + "zone1-0000000100": { + StopStatus: &replicationdatapb.StopReplicationStatus{ + Before: &replicationdatapb.Status{}, + After: &replicationdatapb.Status{ + MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562", + RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21", + }, + }, + }, + "zone1-0000000101": { + StopStatus: &replicationdatapb.StopReplicationStatus{ + Before: &replicationdatapb.Status{}, + After: &replicationdatapb.Status{ + MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562", + RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21", + }, + }, + }, + }, + WaitForPositionResults: map[string]map[string]error{ + "zone1-0000000100": { + "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21": nil, + }, + "zone1-0000000101": { + "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21": nil, + }, + }, + }, + shards: []*vtctldatapb.Shard{ + { + Keyspace: "testkeyspace", + Name: "-", + Shard: &topodatapb.Shard{ + IsMasterServing: true, + MasterAlias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + }, + }, + }, + tablets: []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Keyspace: "testkeyspace", + Shard: "-", + Type: topodatapb.TabletType_MASTER, + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + Keyspace: "testkeyspace", + Shard: "-", + }, + }, + keyspace: "testkeyspace", + shard: "-", + opts: EmergencyReparentOptions{}, + shouldErr: false, + }, { name: "shard not found", ts: memorytopo.NewServer("zone1"), From 949fe73676976cf58de6846563ebf6c359dd697e Mon Sep 17 00:00:00 2001 From: Andrew Mason Date: Sat, 20 Feb 2021 11:40:58 -0500 Subject: [PATCH 2/4] Revert "ERS should remove currently-serving shard primary from consideration" This reverts commit c5bbcc7865d9edf481ad7911904caa561db2f47b. Signed-off-by: Andrew Mason --- .../reparentutil/emergency_reparenter.go | 42 --- .../reparentutil/emergency_reparenter_test.go | 263 ------------------ 2 files changed, 305 deletions(-) diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter.go b/go/vt/vtctl/reparentutil/emergency_reparenter.go index 5616e8d10fa..1ad55715985 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter.go @@ -30,7 +30,6 @@ import ( "vitess.io/vitess/go/vt/logutil" "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topo/topoproto" - "vitess.io/vitess/go/vt/topotools" "vitess.io/vitess/go/vt/topotools/events" "vitess.io/vitess/go/vt/vterrors" "vitess.io/vitess/go/vt/vttablet/tmclient" @@ -278,47 +277,6 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve return vterrors.Wrapf(err, "failed to get tablet map for %v/%v: %v", keyspace, shard, err) } - if opts.NewPrimaryAlias != nil { - primaryElectAliasStr := topoproto.TabletAliasString(opts.NewPrimaryAlias) - primaryElectTabletInfo, ok := tabletMap[primaryElectAliasStr] - if !ok { - return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v is not in the shard", primaryElectAliasStr) - } - - ev.NewMaster = *primaryElectTabletInfo.Tablet - - if topoproto.TabletAliasEqual(shardInfo.MasterAlias, opts.NewPrimaryAlias) { - return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v is already the shard primary", primaryElectAliasStr) - } - } - - if shardInfo.HasMaster() { - deleteOldPrimary := true - shardPrimaryAliasStr := topoproto.TabletAliasString(shardInfo.MasterAlias) - shardPrimaryTabletInfo, ok := tabletMap[shardPrimaryAliasStr] - if ok { - delete(tabletMap, shardPrimaryAliasStr) - } else { - shardPrimaryTabletInfo, err = erp.ts.GetTablet(ctx, shardInfo.MasterAlias) - if err != nil { - erp.logger.Warningf("cannot read old primary tablet %v, won't touch it: %v", shardPrimaryAliasStr, err) - deleteOldPrimary = false - } - } - - if deleteOldPrimary { - ev.OldMaster = *shardPrimaryTabletInfo.Tablet - erp.logger.Infof("deleting old primary", shardPrimaryAliasStr) - - ctx, cancel := context.WithTimeout(ctx, opts.WaitReplicasTimeout) - defer cancel() - - if err := topotools.DeleteTablet(ctx, erp.ts, shardPrimaryTabletInfo.Tablet); err != nil { - erp.logger.Warningf("failed to delete old primary tablet %v: %v", shardPrimaryAliasStr, err) - } - } - } - statusMap, primaryStatusMap, err := StopReplicationAndBuildStatusMaps(ctx, erp.tmc, ev, tabletMap, opts.WaitReplicasTimeout, opts.IgnoreReplicas, erp.logger) if err != nil { return vterrors.Wrapf(err, "failed to stop replication and build status maps: %v", err) diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go index 40154989e87..95ba0b9e885 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go @@ -37,7 +37,6 @@ import ( replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata" topodatapb "vitess.io/vitess/go/vt/proto/topodata" vtctldatapb "vitess.io/vitess/go/vt/proto/vtctldata" - "vitess.io/vitess/go/vt/proto/vttime" ) func TestNewEmergencyReparenter(t *testing.T) { @@ -339,268 +338,6 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) { }, shouldErr: false, }, - { - name: "requested primary-elect is already shard primary", - ts: memorytopo.NewServer("zone1"), - shards: []*vtctldatapb.Shard{ - { - Keyspace: "testkeyspace", - Name: "-", - Shard: &topodatapb.Shard{ - IsMasterServing: true, - MasterAlias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 101, - }, - MasterTermStartTime: &vttime.Time{ - Seconds: 100, - }, - }, - }, - }, - tablets: []*topodatapb.Tablet{ - { - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 100, - }, - Keyspace: "testkeyspace", - Shard: "-", - }, - { - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 101, - }, - Keyspace: "testkeyspace", - Shard: "-", - Type: topodatapb.TabletType_MASTER, - MasterTermStartTime: &vttime.Time{ - Seconds: 100, - }, - }, - { - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 102, - }, - Keyspace: "testkeyspace", - Shard: "-", - }, - }, - keyspace: "testkeyspace", - shard: "-", - opts: EmergencyReparentOptions{ - NewPrimaryAlias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 101, - }, - }, - shouldErr: true, - }, - { - name: "shard primary is not in tablet map and ignored", - ts: memorytopo.NewServer("zone1"), - tmc: &emergencyReparenterTestTMClient{ - PopulateReparentJournalResults: map[string]error{ - "zone1-0000000102": nil, - }, - PromoteReplicaResults: map[string]struct { - Result string - Error error - }{ - "zone1-0000000102": { - Result: "ok", - Error: nil, - }, - }, - SetMasterResults: map[string]error{ - "zone1-0000000100": nil, - "zone1-0000000101": nil, - }, - StopReplicationAndGetStatusResults: map[string]struct { - Status *replicationdatapb.Status - StopStatus *replicationdatapb.StopReplicationStatus - Error error - }{ - "zone1-0000000100": { - StopStatus: &replicationdatapb.StopReplicationStatus{ - Before: &replicationdatapb.Status{}, - After: &replicationdatapb.Status{ - MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562", - RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21", - }, - }, - }, - "zone1-0000000101": { - StopStatus: &replicationdatapb.StopReplicationStatus{ - Before: &replicationdatapb.Status{}, - After: &replicationdatapb.Status{ - MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562", - RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21", - }, - }, - }, - "zone1-0000000102": { - StopStatus: &replicationdatapb.StopReplicationStatus{ - Before: &replicationdatapb.Status{}, - After: &replicationdatapb.Status{ - MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562", - RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-26", - }, - }, - }, - }, - WaitForPositionResults: map[string]map[string]error{ - "zone1-0000000100": { - "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21": nil, - }, - "zone1-0000000101": { - "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21": nil, - }, - "zone1-0000000102": { - "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-26": nil, - }, - }, - }, - shards: []*vtctldatapb.Shard{ - { - Keyspace: "testkeyspace", - Name: "-", - Shard: &topodatapb.Shard{ - IsMasterServing: true, - MasterAlias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 404, - }, - MasterTermStartTime: &vttime.Time{ - Seconds: 100, - }, - }, - }, - }, - tablets: []*topodatapb.Tablet{ - { - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 100, - }, - Keyspace: "testkeyspace", - Shard: "-", - }, - { - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 101, - }, - Keyspace: "testkeyspace", - Shard: "-", - }, - { - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 102, - }, - Keyspace: "testkeyspace", - Shard: "-", - }, - }, - keyspace: "testkeyspace", - shard: "-", - opts: EmergencyReparentOptions{}, - shouldErr: false, - }, - { - // Here, all our (2) tablets are tied, but the current primary is - // ignored, and we promote zone1-101. - name: "shard primary is deleted from tablet map", - ts: memorytopo.NewServer("zone1"), - tmc: &emergencyReparenterTestTMClient{ - PopulateReparentJournalResults: map[string]error{ - "zone1-0000000101": nil, - }, - PromoteReplicaResults: map[string]struct { - Result string - Error error - }{ - "zone1-0000000101": { - Result: "ok", - Error: nil, - }, - }, - SetMasterResults: map[string]error{ - "zone1-0000000100": nil, - }, - StopReplicationAndGetStatusResults: map[string]struct { - Status *replicationdatapb.Status - StopStatus *replicationdatapb.StopReplicationStatus - Error error - }{ - "zone1-0000000100": { - StopStatus: &replicationdatapb.StopReplicationStatus{ - Before: &replicationdatapb.Status{}, - After: &replicationdatapb.Status{ - MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562", - RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21", - }, - }, - }, - "zone1-0000000101": { - StopStatus: &replicationdatapb.StopReplicationStatus{ - Before: &replicationdatapb.Status{}, - After: &replicationdatapb.Status{ - MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562", - RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21", - }, - }, - }, - }, - WaitForPositionResults: map[string]map[string]error{ - "zone1-0000000100": { - "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21": nil, - }, - "zone1-0000000101": { - "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21": nil, - }, - }, - }, - shards: []*vtctldatapb.Shard{ - { - Keyspace: "testkeyspace", - Name: "-", - Shard: &topodatapb.Shard{ - IsMasterServing: true, - MasterAlias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 100, - }, - }, - }, - }, - tablets: []*topodatapb.Tablet{ - { - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 100, - }, - Keyspace: "testkeyspace", - Shard: "-", - Type: topodatapb.TabletType_MASTER, - }, - { - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 101, - }, - Keyspace: "testkeyspace", - Shard: "-", - }, - }, - keyspace: "testkeyspace", - shard: "-", - opts: EmergencyReparentOptions{}, - shouldErr: false, - }, { name: "shard not found", ts: memorytopo.NewServer("zone1"), From 3df1501536dab76e76ed9970b8419065843bdc19 Mon Sep 17 00:00:00 2001 From: Andrew Mason Date: Sat, 20 Feb 2021 15:33:48 -0500 Subject: [PATCH 3/4] Add minimal test case to cause panic Signed-off-by: Andrew Mason --- .../reparentutil/emergency_reparenter_test.go | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go index 95ba0b9e885..4f1d7170cab 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go @@ -338,6 +338,111 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) { }, shouldErr: false, }, + { + name: "success with existing primary", + ts: memorytopo.NewServer("zone1"), + tmc: &emergencyReparenterTestTMClient{ + DemoteMasterResults: map[string]struct { + Status *replicationdatapb.MasterStatus + Error error + }{ + "zone1-0000000100": { + Status: &replicationdatapb.MasterStatus{ + Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21", + }, + }, + }, + PopulateReparentJournalResults: map[string]error{ + "zone1-0000000102": nil, + }, + PromoteReplicaResults: map[string]struct { + Result string + Error error + }{ + "zone1-0000000102": { + Result: "ok", + Error: nil, + }, + }, + SetMasterResults: map[string]error{ + "zone1-0000000100": nil, + "zone1-0000000101": nil, + }, + StopReplicationAndGetStatusResults: map[string]struct { + Status *replicationdatapb.Status + StopStatus *replicationdatapb.StopReplicationStatus + Error error + }{ + "zone1-0000000100": { // This tablet claims MASTER, so is not running replication. + Error: mysql.ErrNotReplica, + }, + "zone1-0000000101": { + StopStatus: &replicationdatapb.StopReplicationStatus{ + Before: &replicationdatapb.Status{}, + After: &replicationdatapb.Status{ + MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562", + RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21", + }, + }, + }, + "zone1-0000000102": { + StopStatus: &replicationdatapb.StopReplicationStatus{ + Before: &replicationdatapb.Status{}, + After: &replicationdatapb.Status{ + MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562", + RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-26", + }, + }, + }, + }, + WaitForPositionResults: map[string]map[string]error{ + "zone1-0000000101": { + "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21": nil, + }, + "zone1-0000000102": { + "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-26": nil, + }, + }, + }, + shards: []*vtctldatapb.Shard{ + { + Keyspace: "testkeyspace", + Name: "-", + }, + }, + tablets: []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Keyspace: "testkeyspace", + Shard: "-", + Type: topodatapb.TabletType_MASTER, + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + Keyspace: "testkeyspace", + Shard: "-", + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 102, + }, + Keyspace: "testkeyspace", + Shard: "-", + Hostname: "most up-to-date position, wins election", + }, + }, + keyspace: "testkeyspace", + shard: "-", + opts: EmergencyReparentOptions{}, + shouldErr: false, + }, { name: "shard not found", ts: memorytopo.NewServer("zone1"), From 2400c5bba7c2f26598920c6a3564d9e26f231c5d Mon Sep 17 00:00:00 2001 From: Andrew Mason Date: Sat, 20 Feb 2021 15:49:57 -0500 Subject: [PATCH 4/4] Fix panic by skipping any candidates that returned ErrNotReplica during StopReplication phase Signed-off-by: Andrew Mason --- .../reparentutil/emergency_reparenter.go | 33 +++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter.go b/go/vt/vtctl/reparentutil/emergency_reparenter.go index 1ad55715985..ca4d0dcc3c2 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter.go @@ -351,17 +351,44 @@ func (erp *EmergencyReparenter) waitForAllRelayLogsToApply( groupCtx, groupCancel := context.WithTimeout(ctx, opts.WaitReplicasTimeout) defer groupCancel() + waiterCount := 0 + for candidate := range validCandidates { + // When we called StopReplicationAndBuildStatusMaps, we got back two + // maps: (1) the StopReplicationStatus of any replicas that actually + // stopped replication; and (2) the MasterStatus of anything that + // returned ErrNotReplica, which is a tablet that is either the current + // primary or is stuck thinking it is a MASTER but is not in actuality. + // + // If we have a tablet in the validCandidates map that does not appear + // in the statusMap, then we have either (a) the current primary, which + // is not replicating, so it is not applying relay logs; or (b) a tablet + // that is stuck thinking it is MASTER but is not in actuality. In that + // second case - (b) - we will most likely find that the stuck MASTER + // does not have a winning position, and fail the ERS. If, on the other + // hand, it does have a winning position, we are trusting the operator + // to know what they are doing by emergency-reparenting onto that + // tablet. In either case, it does not make sense to wait for relay logs + // to apply on a tablet that was never applying relay logs in the first + // place, so we skip it, and log that we did. + status, ok := statusMap[candidate] + if !ok { + erp.logger.Infof("EmergencyReparent candidate %v not in replica status map; this means it was not running replication (because it was formerly MASTER), so skipping WaitForRelayLogsToApply step for this candidate", candidate) + continue + } + go func(alias string) { var err error defer func() { errCh <- err }() - err = WaitForRelayLogsToApply(groupCtx, erp.tmc, tabletMap[alias], statusMap[alias]) + err = WaitForRelayLogsToApply(groupCtx, erp.tmc, tabletMap[alias], status) }(candidate) + + waiterCount++ } errgroup := concurrency.ErrorGroup{ - NumGoroutines: len(validCandidates), - NumRequiredSuccesses: len(validCandidates), + NumGoroutines: waiterCount, + NumRequiredSuccesses: waiterCount, NumAllowedErrors: 0, } rec := errgroup.Wait(groupCancel, errCh)