From 46f0302b202c1d6a4f13d9c377d870e62b69c769 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Mon, 26 Jun 2023 08:31:53 +0000 Subject: [PATCH 1/3] feat: remove reset replication code in setreplicationsource Signed-off-by: Manan Gupta --- go/vt/mysqlctl/fakemysqldaemon.go | 1 - go/vt/mysqlctl/replication.go | 8 --- .../endtoend/init_shard_primary_test.go | 6 -- .../vttablet/tabletmanager/rpc_replication.go | 10 +-- go/vt/vttablet/tabletmanager/tm_init_test.go | 1 - go/vt/wrangler/testlib/backup_test.go | 25 ++----- .../testlib/copy_schema_shard_test.go | 3 +- .../testlib/emergency_reparent_shard_test.go | 13 +--- .../testlib/external_reparent_test.go | 14 +--- go/vt/wrangler/testlib/permissions_test.go | 3 +- .../testlib/planned_reparent_shard_test.go | 68 ++++--------------- go/vt/wrangler/testlib/reparent_utils_test.go | 7 +- go/vt/wrangler/testlib/version_test.go | 3 +- 13 files changed, 32 insertions(+), 130 deletions(-) diff --git a/go/vt/mysqlctl/fakemysqldaemon.go b/go/vt/mysqlctl/fakemysqldaemon.go index 9cf64914f67..10877a4d6d1 100644 --- a/go/vt/mysqlctl/fakemysqldaemon.go +++ b/go/vt/mysqlctl/fakemysqldaemon.go @@ -458,7 +458,6 @@ func (fmd *FakeMysqlDaemon) SetReplicationSource(ctx context.Context, host strin if stopReplicationBefore { cmds = append(cmds, "STOP SLAVE") } - cmds = append(cmds, "RESET SLAVE ALL") cmds = append(cmds, "FAKE SET MASTER") if startReplicationAfter { cmds = append(cmds, "START SLAVE") diff --git a/go/vt/mysqlctl/replication.go b/go/vt/mysqlctl/replication.go index 86317980d05..e180cd1e9c9 100644 --- a/go/vt/mysqlctl/replication.go +++ b/go/vt/mysqlctl/replication.go @@ -461,14 +461,6 @@ func (mysqld *Mysqld) SetReplicationSource(ctx context.Context, host string, por if stopReplicationBefore { cmds = append(cmds, conn.StopReplicationCommand()) } - // Reset replication parameters commands makes the instance forget the source host port - // This is required because sometimes MySQL gets stuck due to improper initialization of - // master info structure or related failures and throws errors like - // ERROR 1201 (HY000): Could not initialize master info structure; more error messages can be found in the MySQL error log - // These errors can only be resolved by resetting the replication parameters, otherwise START SLAVE fails. - // Therefore, we have elected to always reset the replication parameters whenever we try to set the source host port - // Since there is no real overhead, but it makes this function robust enough to also handle failures like these. - cmds = append(cmds, conn.ResetReplicationParametersCommands()...) smc := conn.SetReplicationSourceCommand(params, host, port, int(replicationConnectRetry.Seconds())) cmds = append(cmds, smc) if startReplicationAfter { diff --git a/go/vt/vtctl/grpcvtctldserver/endtoend/init_shard_primary_test.go b/go/vt/vtctl/grpcvtctldserver/endtoend/init_shard_primary_test.go index c1e5899aac6..fb1f1f5b40e 100644 --- a/go/vt/vtctl/grpcvtctldserver/endtoend/init_shard_primary_test.go +++ b/go/vt/vtctl/grpcvtctldserver/endtoend/init_shard_primary_test.go @@ -61,13 +61,11 @@ func TestInitShardPrimary(t *testing.T) { tablet2.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ // These come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", // These come from InitShardPrimary "FAKE RESET ALL REPLICATION", "FAKE SET SLAVE POSITION", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -75,12 +73,10 @@ func TestInitShardPrimary(t *testing.T) { tablet3.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "FAKE RESET ALL REPLICATION", "FAKE SET SLAVE POSITION", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -125,7 +121,6 @@ func TestInitShardPrimaryNoFormerPrimary(t *testing.T) { tablet2.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "FAKE RESET ALL REPLICATION", "FAKE SET SLAVE POSITION", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -134,7 +129,6 @@ func TestInitShardPrimaryNoFormerPrimary(t *testing.T) { tablet3.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "FAKE RESET ALL REPLICATION", "FAKE SET SLAVE POSITION", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 2ffccd89289..7b06bffd1f1 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -697,14 +697,8 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA } host := parent.Tablet.MysqlHostname port := parent.Tablet.MysqlPort - // We want to reset the replication parameters and set replication source again when forceStartReplication is provided - // because sometimes MySQL gets stuck due to improper initialization of master info structure or related failures and throws errors like - // ERROR 1201 (HY000): Could not initialize master info structure; more error messages can be found in the MySQL error log - // These errors can only be resolved by resetting the replication parameters, otherwise START SLAVE fails. So when this RPC - // gets called from VTOrc or replication manager to fix the replication in these cases with forceStartReplication, we should also - // reset the replication parameters and set the source port information again. - if status.SourceHost != host || status.SourcePort != port || forceStartReplication { - // This handles reseting the replication parameters, changing the address and then starting the replication. + if status.SourceHost != host || status.SourcePort != port { + // This handles both changing the address and starting replication. if err := tm.MysqlDaemon.SetReplicationSource(ctx, host, port, wasReplicating, shouldbeReplicating); err != nil { if err := tm.handleRelayLogError(err); err != nil { return err diff --git a/go/vt/vttablet/tabletmanager/tm_init_test.go b/go/vt/vttablet/tabletmanager/tm_init_test.go index 5812d1ddd17..ccaedf0c272 100644 --- a/go/vt/vttablet/tabletmanager/tm_init_test.go +++ b/go/vt/vttablet/tabletmanager/tm_init_test.go @@ -374,7 +374,6 @@ func TestCheckPrimaryShip(t *testing.T) { fakeMysql.SetReplicationSourceInputs = append(fakeMysql.SetReplicationSourceInputs, fmt.Sprintf("%v:%v", otherTablet.MysqlHostname, otherTablet.MysqlPort)) fakeMysql.ExpectedExecuteSuperQueryList = []string{ "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } diff --git a/go/vt/wrangler/testlib/backup_test.go b/go/vt/wrangler/testlib/backup_test.go index b845a6d7104..f65ba2f6df3 100644 --- a/go/vt/wrangler/testlib/backup_test.go +++ b/go/vt/wrangler/testlib/backup_test.go @@ -175,9 +175,8 @@ func testBackupRestore(t *testing.T, cDetails *compressionDetails) error { }, } sourceTablet.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", // This first set of STOP and START commands come from @@ -188,7 +187,6 @@ func testBackupRestore(t *testing.T, cDetails *compressionDetails) error { // These commands come from SetReplicationSource RPC called // to set the correct primary and semi-sync after Backup has concluded "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -230,16 +228,14 @@ func testBackupRestore(t *testing.T, cDetails *compressionDetails) error { }, } destTablet.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "STOP SLAVE", "RESET SLAVE ALL", "FAKE SET SLAVE POSITION", "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -296,7 +292,6 @@ func testBackupRestore(t *testing.T, cDetails *compressionDetails) error { "STOP SLAVE", "RESET SLAVE ALL", "FAKE SET SLAVE POSITION", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -417,9 +412,8 @@ func TestBackupRestoreLagged(t *testing.T) { } sourceTablet.FakeMysqlDaemon.SetReplicationSourceInputs = []string{fmt.Sprintf("%s:%d", primary.Tablet.MysqlHostname, primary.Tablet.MysqlPort)} sourceTablet.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", // This first set of STOP and START commands come from @@ -430,7 +424,6 @@ func TestBackupRestoreLagged(t *testing.T) { // These commands come from SetReplicationSource RPC called // to set the correct primary and semi-sync after Backup has concluded "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -488,16 +481,14 @@ func TestBackupRestoreLagged(t *testing.T) { }, } destTablet.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "STOP SLAVE", "RESET SLAVE ALL", "FAKE SET SLAVE POSITION", "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -638,9 +629,8 @@ func TestRestoreUnreachablePrimary(t *testing.T) { } sourceTablet.FakeMysqlDaemon.SetReplicationSourceInputs = []string{fmt.Sprintf("%s:%d", primary.Tablet.MysqlHostname, primary.Tablet.MysqlPort)} sourceTablet.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", // This first set of STOP and START commands come from @@ -651,7 +641,6 @@ func TestRestoreUnreachablePrimary(t *testing.T) { // These commands come from SetReplicationSource RPC called // to set the correct primary and semi-sync after Backup has concluded "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -681,16 +670,14 @@ func TestRestoreUnreachablePrimary(t *testing.T) { }, } destTablet.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "STOP SLAVE", "RESET SLAVE ALL", "FAKE SET SLAVE POSITION", "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } diff --git a/go/vt/wrangler/testlib/copy_schema_shard_test.go b/go/vt/wrangler/testlib/copy_schema_shard_test.go index b32e2c3d937..c3e574fea42 100644 --- a/go/vt/wrangler/testlib/copy_schema_shard_test.go +++ b/go/vt/wrangler/testlib/copy_schema_shard_test.go @@ -72,9 +72,8 @@ func copySchema(t *testing.T, useShardAsSource bool) { sourceRdonly := NewFakeTablet(t, wr, "cell1", 1, topodatapb.TabletType_RDONLY, sourceRdonlyDb, TabletKeyspaceShard(t, "ks", "-80")) sourceRdonly.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } diff --git a/go/vt/wrangler/testlib/emergency_reparent_shard_test.go b/go/vt/wrangler/testlib/emergency_reparent_shard_test.go index a68f0299c94..859cd606fdb 100644 --- a/go/vt/wrangler/testlib/emergency_reparent_shard_test.go +++ b/go/vt/wrangler/testlib/emergency_reparent_shard_test.go @@ -134,14 +134,12 @@ func TestEmergencyReparentShard(t *testing.T) { goodReplica1.FakeMysqlDaemon.WaitPrimaryPositions = append(goodReplica1.FakeMysqlDaemon.WaitPrimaryPositions, goodReplica1.FakeMysqlDaemon.CurrentSourceFilePosition) goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet), topoproto.MysqlAddr(oldPrimary.Tablet)) goodReplica1.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "STOP SLAVE IO_THREAD", "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -167,12 +165,10 @@ func TestEmergencyReparentShard(t *testing.T) { goodReplica2.FakeMysqlDaemon.WaitPrimaryPositions = append(goodReplica2.FakeMysqlDaemon.WaitPrimaryPositions, goodReplica2.FakeMysqlDaemon.CurrentSourceFilePosition) goodReplica2.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica2.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet), topoproto.MysqlAddr(oldPrimary.Tablet)) goodReplica2.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", } goodReplica2.StartActionLoop(t, wr) @@ -234,7 +230,6 @@ func TestEmergencyReparentShardPrimaryElectNotBest(t *testing.T) { newPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "STOP SLAVE IO_THREAD", "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "SUBINSERT INTO _vt.reparent_journal (time_created_ns, action_name, primary_alias, replication_position) VALUES", @@ -267,14 +262,12 @@ func TestEmergencyReparentShardPrimaryElectNotBest(t *testing.T) { moreAdvancedReplica.FakeMysqlDaemon.WaitPrimaryPositions = append(moreAdvancedReplica.FakeMysqlDaemon.WaitPrimaryPositions, moreAdvancedReplica.FakeMysqlDaemon.CurrentSourceFilePosition) newPrimary.FakeMysqlDaemon.WaitPrimaryPositions = append(newPrimary.FakeMysqlDaemon.WaitPrimaryPositions, moreAdvancedReplica.FakeMysqlDaemon.CurrentPrimaryPosition) moreAdvancedReplica.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "STOP SLAVE IO_THREAD", "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } diff --git a/go/vt/wrangler/testlib/external_reparent_test.go b/go/vt/wrangler/testlib/external_reparent_test.go index 5d47ca5724e..955515ac4b4 100644 --- a/go/vt/wrangler/testlib/external_reparent_test.go +++ b/go/vt/wrangler/testlib/external_reparent_test.go @@ -90,7 +90,6 @@ func TestTabletExternallyReparentedBasic(t *testing.T) { oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs = append(oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet)) oldPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - "RESET SLAVE ALL", "FAKE SET MASTER", "START Replica", } @@ -170,7 +169,6 @@ func TestTabletExternallyReparentedToReplica(t *testing.T) { // primary is still good to go. oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs = append(oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet)) oldPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - "RESET SLAVE ALL", "FAKE SET MASTER", "START Replica", } @@ -249,7 +247,6 @@ func TestTabletExternallyReparentedWithDifferentMysqlPort(t *testing.T) { oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs = append(oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet)) oldPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - "RESET SLAVE ALL", "FAKE SET MASTER", "START Replica", } @@ -262,9 +259,8 @@ func TestTabletExternallyReparentedWithDifferentMysqlPort(t *testing.T) { // TabletActionReplicaWasRestarted and point to the new mysql port goodReplica.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(oldPrimary.Tablet)) goodReplica.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -339,7 +335,6 @@ func TestTabletExternallyReparentedContinueOnUnexpectedPrimary(t *testing.T) { oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs = append(oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet)) oldPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - "RESET SLAVE ALL", "FAKE SET MASTER", "START Replica", } @@ -352,9 +347,8 @@ func TestTabletExternallyReparentedContinueOnUnexpectedPrimary(t *testing.T) { // TabletActionReplicaWasRestarted and point to a bad host goodReplica.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(oldPrimary.Tablet)) goodReplica.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -425,7 +419,6 @@ func TestTabletExternallyReparentedRerun(t *testing.T) { oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs = append(oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet)) oldPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - "RESET SLAVE ALL", "FAKE SET MASTER", "START Replica", } @@ -438,9 +431,8 @@ func TestTabletExternallyReparentedRerun(t *testing.T) { // On the good replica, we will respond to // TabletActionReplicaWasRestarted. goodReplica.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } diff --git a/go/vt/wrangler/testlib/permissions_test.go b/go/vt/wrangler/testlib/permissions_test.go index fec85ca3124..9d2a950c8e3 100644 --- a/go/vt/wrangler/testlib/permissions_test.go +++ b/go/vt/wrangler/testlib/permissions_test.go @@ -563,9 +563,8 @@ func TestPermissions(t *testing.T) { } replica.FakeMysqlDaemon.SetReplicationSourceInputs = append(replica.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(primary.Tablet)) replica.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } diff --git a/go/vt/wrangler/testlib/planned_reparent_shard_test.go b/go/vt/wrangler/testlib/planned_reparent_shard_test.go index b0f2cf54f4a..010c1dc2301 100644 --- a/go/vt/wrangler/testlib/planned_reparent_shard_test.go +++ b/go/vt/wrangler/testlib/planned_reparent_shard_test.go @@ -81,7 +81,6 @@ func TestPlannedReparentShardNoPrimaryProvided(t *testing.T) { } newPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "SUBINSERT INTO _vt.reparent_journal (time_created_ns, action_name, primary_alias, replication_position) VALUES", @@ -96,11 +95,9 @@ func TestPlannedReparentShardNoPrimaryProvided(t *testing.T) { oldPrimary.FakeMysqlDaemon.CurrentPrimaryPosition = newPrimary.FakeMysqlDaemon.WaitPrimaryPositions[0] oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs = append(oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet)) oldPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", // we end up calling SetReplicationSource twice on the old primary - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -116,13 +113,11 @@ func TestPlannedReparentShardNoPrimaryProvided(t *testing.T) { goodReplica1.FakeMysqlDaemon.Replicating = true goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet), topoproto.MysqlAddr(oldPrimary.Tablet)) goodReplica1.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -198,7 +193,6 @@ func TestPlannedReparentShardNoError(t *testing.T) { } newPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "SUBINSERT INTO _vt.reparent_journal (time_created_ns, action_name, primary_alias, replication_position) VALUES", @@ -213,11 +207,9 @@ func TestPlannedReparentShardNoError(t *testing.T) { oldPrimary.FakeMysqlDaemon.CurrentPrimaryPosition = newPrimary.FakeMysqlDaemon.WaitPrimaryPositions[0] oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs = append(oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet)) oldPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", // we end up calling SetReplicationSource twice on the old primary - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -233,13 +225,11 @@ func TestPlannedReparentShardNoError(t *testing.T) { goodReplica1.FakeMysqlDaemon.Replicating = true goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet), topoproto.MysqlAddr(oldPrimary.Tablet)) goodReplica1.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -250,12 +240,10 @@ func TestPlannedReparentShardNoError(t *testing.T) { goodReplica2.FakeMysqlDaemon.ReadOnly = true goodReplica2.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica2.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet), topoproto.MysqlAddr(oldPrimary.Tablet)) goodReplica2.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", } goodReplica2.StartActionLoop(t, wr) @@ -337,7 +325,6 @@ func TestPlannedReparentInitialization(t *testing.T) { goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet)) goodReplica1.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -350,7 +337,6 @@ func TestPlannedReparentInitialization(t *testing.T) { goodReplica2.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica2.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet)) goodReplica2.StartActionLoop(t, wr) goodReplica2.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - "RESET SLAVE ALL", "FAKE SET MASTER", } defer goodReplica2.StopActionLoop(t) @@ -421,7 +407,6 @@ func TestPlannedReparentShardWaitForPositionFail(t *testing.T) { } newPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "SUBINSERT INTO _vt.reparent_journal (time_created_ns, action_name, primary_alias, replication_position) VALUES", @@ -436,7 +421,6 @@ func TestPlannedReparentShardWaitForPositionFail(t *testing.T) { oldPrimary.FakeMysqlDaemon.CurrentPrimaryPosition = newPrimary.FakeMysqlDaemon.PromoteResult oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs = append(oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet)) oldPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -451,13 +435,11 @@ func TestPlannedReparentShardWaitForPositionFail(t *testing.T) { goodReplica1.FakeMysqlDaemon.Replicating = true goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet), topoproto.MysqlAddr(oldPrimary.Tablet)) goodReplica1.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -468,12 +450,10 @@ func TestPlannedReparentShardWaitForPositionFail(t *testing.T) { goodReplica2.FakeMysqlDaemon.ReadOnly = true goodReplica2.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica2.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet), topoproto.MysqlAddr(oldPrimary.Tablet)) goodReplica2.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", } goodReplica2.StartActionLoop(t, wr) @@ -534,7 +514,6 @@ func TestPlannedReparentShardWaitForPositionTimeout(t *testing.T) { } newPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "SUBINSERT INTO _vt.reparent_journal (time_created_ns, action_name, primary_alias, replication_position) VALUES", @@ -548,7 +527,6 @@ func TestPlannedReparentShardWaitForPositionTimeout(t *testing.T) { oldPrimary.FakeMysqlDaemon.CurrentPrimaryPosition = newPrimary.FakeMysqlDaemon.WaitPrimaryPositions[0] oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs = append(oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet)) oldPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -563,13 +541,11 @@ func TestPlannedReparentShardWaitForPositionTimeout(t *testing.T) { goodReplica1.FakeMysqlDaemon.Replicating = true goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet), topoproto.MysqlAddr(oldPrimary.Tablet)) goodReplica1.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -580,12 +556,10 @@ func TestPlannedReparentShardWaitForPositionTimeout(t *testing.T) { goodReplica2.FakeMysqlDaemon.ReadOnly = true goodReplica2.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica2.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet), topoproto.MysqlAddr(oldPrimary.Tablet)) goodReplica2.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", } goodReplica2.StartActionLoop(t, wr) @@ -643,9 +617,8 @@ func TestPlannedReparentShardRelayLogError(t *testing.T) { goodReplica1.FakeMysqlDaemon.Replicating = true goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(primary.Tablet)) goodReplica1.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", // simulate error that will trigger a call to RestartReplication @@ -726,9 +699,8 @@ func TestPlannedReparentShardRelayLogErrorStartReplication(t *testing.T) { goodReplica1.FakeMysqlDaemon.CurrentSourcePort = primary.Tablet.MysqlPort goodReplica1.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ // simulate error that will trigger a call to RestartReplication - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", // In SetReplicationSource, we find that the source host and port was already set correctly, @@ -807,7 +779,6 @@ func TestPlannedReparentShardPromoteReplicaFail(t *testing.T) { } newPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "SUBINSERT INTO _vt.reparent_journal (time_created_ns, action_name, primary_alias, replication_position) VALUES", @@ -822,7 +793,6 @@ func TestPlannedReparentShardPromoteReplicaFail(t *testing.T) { oldPrimary.FakeMysqlDaemon.CurrentPrimaryPosition = newPrimary.FakeMysqlDaemon.WaitPrimaryPositions[0] oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs = append(oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet)) oldPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -837,13 +807,11 @@ func TestPlannedReparentShardPromoteReplicaFail(t *testing.T) { goodReplica1.FakeMysqlDaemon.Replicating = true goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet), topoproto.MysqlAddr(oldPrimary.Tablet)) goodReplica1.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -854,12 +822,10 @@ func TestPlannedReparentShardPromoteReplicaFail(t *testing.T) { goodReplica2.FakeMysqlDaemon.ReadOnly = true goodReplica2.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica2.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet), topoproto.MysqlAddr(oldPrimary.Tablet)) goodReplica2.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", } goodReplica2.StartActionLoop(t, wr) @@ -880,22 +846,18 @@ func TestPlannedReparentShardPromoteReplicaFail(t *testing.T) { newPrimary.FakeMysqlDaemon.PromoteError = nil newPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", // extra commands because of retry "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "SUBINSERT INTO _vt.reparent_journal (time_created_ns, action_name, primary_alias, replication_position) VALUES", } oldPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", // extra commands because of retry - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -954,13 +916,11 @@ func TestPlannedReparentShardSamePrimary(t *testing.T) { goodReplica1.FakeMysqlDaemon.Replicating = true goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(oldPrimary.Tablet)) goodReplica1.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -971,12 +931,10 @@ func TestPlannedReparentShardSamePrimary(t *testing.T) { goodReplica2.FakeMysqlDaemon.ReadOnly = true goodReplica2.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica2.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(oldPrimary.Tablet)) goodReplica2.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", } goodReplica2.StartActionLoop(t, wr) diff --git a/go/vt/wrangler/testlib/reparent_utils_test.go b/go/vt/wrangler/testlib/reparent_utils_test.go index 5596d120e8c..f362f8736cf 100644 --- a/go/vt/wrangler/testlib/reparent_utils_test.go +++ b/go/vt/wrangler/testlib/reparent_utils_test.go @@ -90,9 +90,8 @@ func TestShardReplicationStatuses(t *testing.T) { replica.FakeMysqlDaemon.CurrentSourcePort = primary.Tablet.MysqlPort replica.FakeMysqlDaemon.SetReplicationSourceInputs = append(replica.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(primary.Tablet)) replica.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } @@ -160,13 +159,11 @@ func TestReparentTablet(t *testing.T) { replica.FakeMysqlDaemon.IOThreadRunning = true replica.FakeMysqlDaemon.SetReplicationSourceInputs = append(replica.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(primary.Tablet)) replica.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } diff --git a/go/vt/wrangler/testlib/version_test.go b/go/vt/wrangler/testlib/version_test.go index 18b29d61ac2..c54a0811948 100644 --- a/go/vt/wrangler/testlib/version_test.go +++ b/go/vt/wrangler/testlib/version_test.go @@ -93,9 +93,8 @@ func TestVersion(t *testing.T) { sourceReplicaGitRev := "fake git rev" sourceReplica.FakeMysqlDaemon.SetReplicationSourceInputs = append(sourceReplica.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(sourcePrimary.Tablet)) sourceReplica.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 4 statements come from tablet startup + // These 3 statements come from tablet startup "STOP SLAVE", - "RESET SLAVE ALL", "FAKE SET MASTER", "START SLAVE", } From b9735fa5c166660a9bb158641c367ed92264c321 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Mon, 26 Jun 2023 08:50:03 +0000 Subject: [PATCH 2/3] feat: fix relay log errors properly Signed-off-by: Manan Gupta --- .../vttablet/tabletmanager/rpc_replication.go | 8 ++- go/vt/wrangler/testlib/reparent_utils_test.go | 61 +++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 7b06bffd1f1..df7a50d04b0 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -954,11 +954,17 @@ func (tm *TabletManager) fixSemiSyncAndReplication(tabletType topodatapb.TabletT return nil } +// handleRelayLogError resets replication of the instance. +// This is required because sometimes MySQL gets stuck due to improper initialization of +// master info structure or related failures and throws errors like +// ERROR 1201 (HY000): Could not initialize master info structure; more error messages can be found in the MySQL error log +// These errors can only be resolved by resetting the replication, otherwise START SLAVE fails. func (tm *TabletManager) handleRelayLogError(err error) error { // attempt to fix this error: // Slave failed to initialize relay log info structure from the repository (errno 1872) (sqlstate HY000) during query: START SLAVE // see https://bugs.mysql.com/bug.php?id=83713 or https://github.com/vitessio/vitess/issues/5067 - if strings.Contains(err.Error(), "Slave failed to initialize relay log info structure from the repository") { + // The same fix also works for https://github.com/vitessio/vitess/issues/10955. + if strings.Contains(err.Error(), "Slave failed to initialize relay log info structure from the repository") || strings.Contains(err.Error(), "Could not initialize master info structure") { // Stop, reset and start replication again to resolve this error if err := tm.MysqlDaemon.RestartReplication(tm.hookExtraEnv()); err != nil { return err diff --git a/go/vt/wrangler/testlib/reparent_utils_test.go b/go/vt/wrangler/testlib/reparent_utils_test.go index f362f8736cf..edd10f6dfdd 100644 --- a/go/vt/wrangler/testlib/reparent_utils_test.go +++ b/go/vt/wrangler/testlib/reparent_utils_test.go @@ -18,6 +18,7 @@ package testlib import ( "context" + "errors" "testing" "time" @@ -181,3 +182,63 @@ func TestReparentTablet(t *testing.T) { } checkSemiSyncEnabled(t, false, true, replica) } + +// TestSetReplicationSourceRelayLogError tests that SetReplicationSource works as intended when we receive a relay log error while starting replication. +func TestSetReplicationSourceRelayLogError(t *testing.T) { + ctx := context.Background() + ts := memorytopo.NewServer("cell1", "cell2") + wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient()) + + // create shard and tablets + if _, err := ts.GetOrCreateShard(ctx, "test_keyspace", "0"); err != nil { + t.Fatalf("CreateShard failed: %v", err) + } + primary := NewFakeTablet(t, wr, "cell1", 1, topodatapb.TabletType_PRIMARY, nil) + replica := NewFakeTablet(t, wr, "cell1", 2, topodatapb.TabletType_REPLICA, nil) + reparenttestutil.SetKeyspaceDurability(context.Background(), t, ts, "test_keyspace", "semi_sync") + + // mark the primary inside the shard + if _, err := ts.UpdateShardFields(ctx, "test_keyspace", "0", func(si *topo.ShardInfo) error { + si.PrimaryAlias = primary.Tablet.Alias + return nil + }); err != nil { + t.Fatalf("UpdateShardFields failed: %v", err) + } + + // primary action loop (to initialize host and port) + primary.StartActionLoop(t, wr) + defer primary.StopActionLoop(t) + + // replica loop + // We have to set the settings as replicating. Otherwise, + // the replication manager intervenes and tries to fix replication, + // which ends up making this test unpredictable. + replica.FakeMysqlDaemon.Replicating = true + replica.FakeMysqlDaemon.IOThreadRunning = true + replica.FakeMysqlDaemon.SetReplicationSourceInputs = append(replica.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(primary.Tablet)) + replica.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + // These 3 statements come from tablet startup + "STOP SLAVE", + "FAKE SET MASTER", + "START SLAVE", + // We stop and reset the replication parameters because of relay log issues. + "STOP SLAVE", + "RESET SLAVE", + "START SLAVE", + } + replica.StartActionLoop(t, wr) + defer replica.StopActionLoop(t) + + // Set the correct error message that indicates we have received a relay log error. + replica.FakeMysqlDaemon.SetReplicationSourceError = errors.New("ERROR 1201 (HY000): Could not initialize master info structure; more error messages can be found in the MySQL error log") + // run ReparentTablet + if err := wr.SetReplicationSource(ctx, replica.Tablet); err != nil { + t.Fatalf("SetReplicationSource failed: %v", err) + } + + // check what was run + if err := replica.FakeMysqlDaemon.CheckSuperQueryList(); err != nil { + t.Fatalf("replica.FakeMysqlDaemon.CheckSuperQueryList failed: %v", err) + } + checkSemiSyncEnabled(t, false, true, replica) +} From 2f9f9c9d24bcd892851bca1f589762a6103b2ead Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Wed, 28 Jun 2023 21:30:44 +0530 Subject: [PATCH 3/3] feat: don't run any reparent commands if the host is empty (#13396) Signed-off-by: Manan Gupta --- .../vttablet/tabletmanager/rpc_replication.go | 4 + go/vt/wrangler/testlib/fake_tablet.go | 2 +- go/vt/wrangler/testlib/reparent_utils_test.go | 121 ++++++++++++------ 3 files changed, 84 insertions(+), 43 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index df7a50d04b0..62cf93e6247 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -697,6 +697,10 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA } host := parent.Tablet.MysqlHostname port := parent.Tablet.MysqlPort + // If host is empty, then we shouldn't even attempt the reparent. That tablet has already shutdown. + if host == "" { + return vterrors.New(vtrpc.Code_FAILED_PRECONDITION, "Shard primary has empty mysql hostname") + } if status.SourceHost != host || status.SourcePort != port { // This handles both changing the address and starting replication. if err := tm.MysqlDaemon.SetReplicationSource(ctx, host, port, wasReplicating, shouldbeReplicating); err != nil { diff --git a/go/vt/wrangler/testlib/fake_tablet.go b/go/vt/wrangler/testlib/fake_tablet.go index 0e4d38f59fc..7d840e13c16 100644 --- a/go/vt/wrangler/testlib/fake_tablet.go +++ b/go/vt/wrangler/testlib/fake_tablet.go @@ -245,7 +245,7 @@ func (ft *FakeTablet) StartActionLoop(t *testing.T, wr *wrangler.Wrangler) { // StopActionLoop will stop the Action Loop for the given FakeTablet func (ft *FakeTablet) StopActionLoop(t *testing.T) { if ft.TM == nil { - t.Fatalf("TM for %v is not running", ft.Tablet.Alias) + return } if ft.StartHTTPServer { ft.HTTPListener.Close() diff --git a/go/vt/wrangler/testlib/reparent_utils_test.go b/go/vt/wrangler/testlib/reparent_utils_test.go index edd10f6dfdd..6cb67714411 100644 --- a/go/vt/wrangler/testlib/reparent_utils_test.go +++ b/go/vt/wrangler/testlib/reparent_utils_test.go @@ -22,6 +22,8 @@ import ( "testing" "time" + "github.com/stretchr/testify/require" + "vitess.io/vitess/go/vt/vtctl/reparentutil/reparenttestutil" "vitess.io/vitess/go/vt/discovery" @@ -183,62 +185,97 @@ func TestReparentTablet(t *testing.T) { checkSemiSyncEnabled(t, false, true, replica) } -// TestSetReplicationSourceRelayLogError tests that SetReplicationSource works as intended when we receive a relay log error while starting replication. -func TestSetReplicationSourceRelayLogError(t *testing.T) { +// TestSetReplicationSource tests that SetReplicationSource works as intended under various circumstances. +func TestSetReplicationSource(t *testing.T) { ctx := context.Background() ts := memorytopo.NewServer("cell1", "cell2") wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient()) // create shard and tablets - if _, err := ts.GetOrCreateShard(ctx, "test_keyspace", "0"); err != nil { - t.Fatalf("CreateShard failed: %v", err) - } + _, err := ts.GetOrCreateShard(ctx, "test_keyspace", "0") + require.NoError(t, err, "CreateShard failed") + primary := NewFakeTablet(t, wr, "cell1", 1, topodatapb.TabletType_PRIMARY, nil) - replica := NewFakeTablet(t, wr, "cell1", 2, topodatapb.TabletType_REPLICA, nil) reparenttestutil.SetKeyspaceDurability(context.Background(), t, ts, "test_keyspace", "semi_sync") // mark the primary inside the shard - if _, err := ts.UpdateShardFields(ctx, "test_keyspace", "0", func(si *topo.ShardInfo) error { + _, err = ts.UpdateShardFields(ctx, "test_keyspace", "0", func(si *topo.ShardInfo) error { si.PrimaryAlias = primary.Tablet.Alias return nil - }); err != nil { - t.Fatalf("UpdateShardFields failed: %v", err) - } + }) + require.NoError(t, err, "UpdateShardFields failed") // primary action loop (to initialize host and port) primary.StartActionLoop(t, wr) defer primary.StopActionLoop(t) - // replica loop - // We have to set the settings as replicating. Otherwise, - // the replication manager intervenes and tries to fix replication, - // which ends up making this test unpredictable. - replica.FakeMysqlDaemon.Replicating = true - replica.FakeMysqlDaemon.IOThreadRunning = true - replica.FakeMysqlDaemon.SetReplicationSourceInputs = append(replica.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(primary.Tablet)) - replica.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 3 statements come from tablet startup - "STOP SLAVE", - "FAKE SET MASTER", - "START SLAVE", - // We stop and reset the replication parameters because of relay log issues. - "STOP SLAVE", - "RESET SLAVE", - "START SLAVE", - } - replica.StartActionLoop(t, wr) - defer replica.StopActionLoop(t) - - // Set the correct error message that indicates we have received a relay log error. - replica.FakeMysqlDaemon.SetReplicationSourceError = errors.New("ERROR 1201 (HY000): Could not initialize master info structure; more error messages can be found in the MySQL error log") - // run ReparentTablet - if err := wr.SetReplicationSource(ctx, replica.Tablet); err != nil { - t.Fatalf("SetReplicationSource failed: %v", err) - } - - // check what was run - if err := replica.FakeMysqlDaemon.CheckSuperQueryList(); err != nil { - t.Fatalf("replica.FakeMysqlDaemon.CheckSuperQueryList failed: %v", err) - } - checkSemiSyncEnabled(t, false, true, replica) + // test when we receive a relay log error while starting replication + t.Run("Relay log error", func(t *testing.T) { + replica := NewFakeTablet(t, wr, "cell1", 2, topodatapb.TabletType_REPLICA, nil) + // replica loop + // We have to set the settings as replicating. Otherwise, + // the replication manager intervenes and tries to fix replication, + // which ends up making this test unpredictable. + replica.FakeMysqlDaemon.Replicating = true + replica.FakeMysqlDaemon.IOThreadRunning = true + replica.FakeMysqlDaemon.SetReplicationSourceInputs = append(replica.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(primary.Tablet)) + replica.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + // These 3 statements come from tablet startup + "STOP SLAVE", + "FAKE SET MASTER", + "START SLAVE", + // We stop and reset the replication parameters because of relay log issues. + "STOP SLAVE", + "RESET SLAVE", + "START SLAVE", + } + replica.StartActionLoop(t, wr) + defer replica.StopActionLoop(t) + + // Set the correct error message that indicates we have received a relay log error. + replica.FakeMysqlDaemon.SetReplicationSourceError = errors.New("ERROR 1201 (HY000): Could not initialize master info structure; more error messages can be found in the MySQL error log") + // run ReparentTablet + err = wr.SetReplicationSource(ctx, replica.Tablet) + require.NoError(t, err, "SetReplicationSource failed") + + // check what was run + err = replica.FakeMysqlDaemon.CheckSuperQueryList() + require.NoError(t, err, "CheckSuperQueryList failed") + checkSemiSyncEnabled(t, false, true, replica) + }) + + // test setting an empty hostname because of primary shutdown + t.Run("Primary tablet already shutdown", func(t *testing.T) { + replica := NewFakeTablet(t, wr, "cell1", 3, topodatapb.TabletType_REPLICA, nil) + // replica loop + replica.FakeMysqlDaemon.Replicating = true + replica.FakeMysqlDaemon.IOThreadRunning = true + replica.FakeMysqlDaemon.SetReplicationSourceInputs = append(replica.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(primary.Tablet)) + replica.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + // These 3 statements come from tablet startup + "STOP SLAVE", + "FAKE SET MASTER", + "START SLAVE", + // For the SetReplicationSource call, we shouldn't get any queries at all! + } + replica.StartActionLoop(t, wr) + defer replica.StopActionLoop(t) + + // stop the primary + primary.StopActionLoop(t) + // update the primary topo record + wr.TopoServer().UpdateTabletFields(ctx, primary.Tablet.Alias, func(tablet *topodatapb.Tablet) error { + tablet.MysqlHostname = "" + return nil + }) + + // run SetReplicationSource + err = wr.SetReplicationSource(ctx, replica.Tablet) + require.ErrorContains(t, err, "Shard primary has empty mysql hostname") + + // check what was run + err = replica.FakeMysqlDaemon.CheckSuperQueryList() + require.NoError(t, err, "CheckSuperQueryList failed") + checkSemiSyncEnabled(t, false, true, replica) + }) }