diff --git a/go/vt/wrangler/reparent.go b/go/vt/wrangler/reparent.go index ccb56dd388d..b19e9e267fe 100644 --- a/go/vt/wrangler/reparent.go +++ b/go/vt/wrangler/reparent.go @@ -203,6 +203,11 @@ func (wr *Wrangler) initShardMasterLocked(ctx context.Context, ev *events.Repare // we stop. It is probably because it is unreachable, and may leave // an unstable database process in the mix, with a database daemon // at a wrong replication spot. + + // Create a context for the following RPCs that respects waitSlaveTimeout + resetCtx, resetCancel := context.WithTimeout(ctx, waitSlaveTimeout) + defer resetCancel() + event.DispatchUpdate(ev, "resetting replication on all tablets") wg := sync.WaitGroup{} rec := concurrency.AllErrorRecorder{} @@ -211,13 +216,14 @@ func (wr *Wrangler) initShardMasterLocked(ctx context.Context, ev *events.Repare go func(alias string, tabletInfo *topo.TabletInfo) { defer wg.Done() wr.logger.Infof("resetting replication on tablet %v", alias) - if err := wr.tmc.ResetReplication(ctx, tabletInfo.Tablet); err != nil { + if err := wr.tmc.ResetReplication(resetCtx, tabletInfo.Tablet); err != nil { rec.RecordError(fmt.Errorf("Tablet %v ResetReplication failed (either fix it, or Scrap it): %v", alias, err)) } }(alias, tabletInfo) } wg.Wait() if err := rec.Error(); err != nil { + // stop here if any of the slaves failed return err } @@ -242,7 +248,7 @@ func (wr *Wrangler) initShardMasterLocked(ctx context.Context, ev *events.Repare // Create a cancelable context for the following RPCs. // If error conditions happen, we can cancel all outgoing RPCs. - replCtx, replCancel := context.WithCancel(ctx) + replCtx, replCancel := context.WithTimeout(ctx, waitSlaveTimeout) defer replCancel() // Now tell the new master to insert the reparent_journal row, @@ -430,7 +436,7 @@ func (wr *Wrangler) plannedReparentShardLocked(ctx context.Context, ev *events.R // Create a cancelable context for the following RPCs. // If error conditions happen, we can cancel all outgoing RPCs. 
- replCtx, replCancel := context.WithCancel(ctx) + replCtx, replCancel := context.WithTimeout(ctx, waitSlaveTimeout) defer replCancel() // Go through all the tablets: diff --git a/test/initial_sharding.py b/test/initial_sharding.py index 645ed257159..0a101192ae0 100755 --- a/test/initial_sharding.py +++ b/test/initial_sharding.py @@ -240,16 +240,27 @@ def test_resharding(self): for t in [shard_master, shard_replica, shard_rdonly1]: t.create_db('vt_test_keyspace') + # replica is not started, InitShardMaster should time out shard_master.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) - shard_replica.start_vttablet(wait_for_state=None, - binlog_use_v3_resharding_mode=False) shard_rdonly1.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) - for t in [shard_master, shard_replica, shard_rdonly1]: + for t in [shard_master, shard_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') + # reparent to make the tablets work - expected to fail + # because the replica tablet is not up + _, stderr = utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0', + shard_master.tablet_alias], auto_log=True, expect_fail=True) + + self.assertIn('Tablet test_nj-0000062345 ResetReplication failed', stderr) + # start replica + shard_replica.start_vttablet(wait_for_state=None, + binlog_use_v3_resharding_mode=False) + + shard_replica.wait_for_vttablet_state('NOT_SERVING') + # reparent to make the tablets work utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0', shard_master.tablet_alias], auto_log=True) diff --git a/test/reparent.py b/test/reparent.py index c4957465e81..5f1f07a30b5 100755 --- a/test/reparent.py +++ b/test/reparent.py @@ -339,8 +339,8 @@ def _test_reparent_graceful(self, shard_id): tablet_62044.kill_vttablet() - # This is a manual test to check error formatting. 
- def _test_reparent_slave_offline(self, shard_id='0'): + # Reparenting should return an error if a replica vttablet is down + def test_reparent_slave_offline(self, shard_id='0'): utils.run_vtctl(['CreateKeyspace', 'test_keyspace']) # create the database so vttablets start, as they are serving @@ -377,9 +377,11 @@ def _test_reparent_slave_offline(self, shard_id='0'): tablet_31981.kill_vttablet() # Perform a graceful reparent operation. - utils.run_vtctl(['PlannedReparentShard', + _, stderr = utils.run_vtctl(['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/' + shard_id, - '-new_master', tablet_62044.tablet_alias]) + '-new_master', tablet_62044.tablet_alias], expect_fail=True) + self.assertIn('Tablet test_ny-0000031981 SetMaster failed', stderr) + self._check_master_tablet(tablet_62044) tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983])