Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go/vt/vtctld/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ func TestAPI(t *testing.T) {
{"GET", "shards/ks1/", "", `["-80","80-"]`},
{"GET", "shards/ks1/-80", "", `{
"master_alias": null,
"master_term_start_time":null,
"key_range": {
"start": null,
"end":"gA=="
Expand Down
9 changes: 4 additions & 5 deletions go/vt/vttablet/tabletmanager/rpc_replication.go
Original file line number Diff line number Diff line change
Expand Up @@ -576,11 +576,10 @@ func (agent *ActionAgent) setMasterLocked(ctx context.Context, parentAlias *topo

// if needed, wait until we get the replicated row, or our
// context times out
if !shouldbeReplicating || timeCreatedNS == 0 {
return nil
}
if err := agent.MysqlDaemon.WaitForReparentJournal(ctx, timeCreatedNS); err != nil {
return err
if shouldbeReplicating && timeCreatedNS != 0 {
if err := agent.MysqlDaemon.WaitForReparentJournal(ctx, timeCreatedNS); err != nil {
return err
}
}
if typeChanged {
if err := agent.refreshTablet(ctx, "SetMaster"); err != nil {
Expand Down
3 changes: 3 additions & 0 deletions go/vt/vttablet/tabletmanager/shard_sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,15 @@ func (agent *ActionAgent) shardSyncLoop(ctx context.Context) {
select {
case <-notifyChan:
// Something may have changed in the tablet state.
log.Info("Change to tablet state")
case <-retryChan:
// It's time to retry a previous failed sync attempt.
log.Info("Retry sync")
case event := <-shardWatch.watchChan:
// Something may have changed in the shard record.
// We don't use the watch event except to know that we should
// re-read the shard record, and to know if the watch dies.
log.Info("Change in shard record")
if event.Err != nil {
// The watch failed. Stop it so we start a new one if needed.
log.Errorf("Shard watch failed: %v", event.Err)
Expand Down
19 changes: 3 additions & 16 deletions go/vt/wrangler/reparent.go
Original file line number Diff line number Diff line change
Expand Up @@ -438,11 +438,13 @@ func (wr *Wrangler) plannedReparentShardLocked(ctx context.Context, ev *events.R
return fmt.Errorf("old master tablet %v DemoteMaster failed: %v", topoproto.TabletAliasString(shardInfo.MasterAlias), err)
}

promoteCtx, promoteCancel := context.WithTimeout(ctx, waitSlaveTimeout)
defer promoteCancel()
// Wait on the master-elect tablet until it reaches that position,
// then promote it.
wr.logger.Infof("promote replica %v", masterElectTabletAliasStr)
event.DispatchUpdate(ev, "promoting replica")
rp, err = wr.tmc.PromoteSlaveWhenCaughtUp(remoteCtx, masterElectTabletInfo.Tablet, rp)
rp, err = wr.tmc.PromoteSlaveWhenCaughtUp(promoteCtx, masterElectTabletInfo.Tablet, rp)
if err != nil || (ctx.Err() != nil && ctx.Err() == context.DeadlineExceeded) {
remoteCancel()
// If we fail to promote the new master, try to roll back to the
Expand All @@ -457,9 +459,6 @@ func (wr *Wrangler) plannedReparentShardLocked(ctx context.Context, ev *events.R
reparentJournalPos = rp
}

remoteCtx, remoteCancel = context.WithTimeout(ctx, waitSlaveTimeout)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was this unused? I don't see remoteCtx used below either, but just verifying the reason for removal.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it was unused. Not sure when references to it were removed.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It turns out that when code was moved around in #5226, we ended up calling PromoteSlaveWhenCaughtUp with an already-used context, i.e. one whose deadline had been counting down since it was created for the earlier calls. That effectively reduced the timeout to some indeterminate value and also partly undid the changes in #4850. However, this is all still on the feature branch, so no harm done. I have added the necessary lines of code (a fresh context with its own timeout for the promote call) to restore the intended behavior.

defer remoteCancel()

// Check we still have the topology lock.
if err := topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
return fmt.Errorf("lost topology lock, aborting: %v", err)
Expand Down Expand Up @@ -520,18 +519,6 @@ func (wr *Wrangler) plannedReparentShardLocked(ctx context.Context, ev *events.R
return fmt.Errorf("failed to PopulateReparentJournal on master: %v", err)
}

// After the master is done, we can update the shard record.
// TODO(deepthi): Remove this when we make the master tablet responsible for
// updating the shard record.
wr.logger.Infof("updating shard record with new master %v", masterElectTabletAlias)
if _, err := wr.ts.UpdateShardFields(ctx, keyspace, shard, func(si *topo.ShardInfo) error {
si.MasterAlias = masterElectTabletAlias
return nil
}); err != nil {
wgReplicas.Wait()
return fmt.Errorf("failed to update shard master record: %v", err)
}

// Wait for the replicas to complete.
wgReplicas.Wait()
if err := rec.Error(); err != nil {
Expand Down
12 changes: 9 additions & 3 deletions go/vt/wrangler/testlib/planned_reparent_shard_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ func TestPlannedReparentShardNoMasterProvided(t *testing.T) {
oldMaster.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
"FAKE SET MASTER",
"START SLAVE",
// we end up calling SetMaster twice on the old master
"FAKE SET MASTER",
"START SLAVE",
}
oldMaster.StartActionLoop(t, wr)
defer oldMaster.StopActionLoop(t)
Expand All @@ -103,7 +106,7 @@ func TestPlannedReparentShardNoMasterProvided(t *testing.T) {
t.Fatalf("PlannedReparentShard failed: %v", err)
}

// // check what was run
// check what was run
if err := newMaster.FakeMysqlDaemon.CheckSuperQueryList(); err != nil {
t.Errorf("newMaster.FakeMysqlDaemon.CheckSuperQueryList failed: %v", err)
}
Expand All @@ -126,8 +129,8 @@ func TestPlannedReparentShardNoMasterProvided(t *testing.T) {
t.Errorf("oldMaster...QueryServiceControl not serving")
}

// // verify the old master was told to start replicating (and not
// // the slave that wasn't replicating in the first place)
// verify the old master was told to start replicating (and not
// the slave that wasn't replicating in the first place)
if !oldMaster.FakeMysqlDaemon.Replicating {
t.Errorf("oldMaster.FakeMysqlDaemon.Replicating not set")
}
Expand Down Expand Up @@ -188,6 +191,9 @@ func TestPlannedReparentShard(t *testing.T) {
oldMaster.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
"FAKE SET MASTER",
"START SLAVE",
// we end up calling SetMaster twice on the old master
"FAKE SET MASTER",
"START SLAVE",
}
oldMaster.StartActionLoop(t, wr)
defer oldMaster.StopActionLoop(t)
Expand Down