Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions go/test/endtoend/backup/vtbackup/backup_only_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ func firstBackupTest(t *testing.T, tabletType string) {
// check that the restored replica has the right local_metadata
result, err := replica2.VttabletProcess.QueryTabletWithDB("select * from local_metadata", "_vt")
require.Nil(t, err)
require.NotNil(t, result)
require.NotEmpty(t, result.Rows)
assert.Equal(t, replica2.Alias, result.Rows[0][1].ToString(), "Alias")
assert.Equal(t, "ks.0", result.Rows[1][1].ToString(), "ClusterAlias")
assert.Equal(t, cell, result.Rows[2][1].ToString(), "DataCenter")
Expand Down
22 changes: 22 additions & 0 deletions go/vt/mysqlctl/backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,28 @@ func removeExistingFiles(cnf *Mycnf) error {
return nil
}

// ShouldRestore checks whether a database with tables already exists
// and returns whether a restore action should be performed
func ShouldRestore(ctx context.Context, params RestoreParams) bool {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd feel better if we returned (bool, error) so the caller can distinguish between "we should restore" and "we failed to determine whether we should restore". That's how Restore() itself currently works. Is there a reason you thought this one should be different?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also would it make sense for Restore() to call ShouldRestore()?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I went back and forth between returning an error or not. Since you feel that is better, I'll change it.

if !params.DeleteBeforeRestore {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that this is in a function, it might be easier to read like:

if params.DeleteBeforeRestore || RestoreWasInterrupted(params.Cnf) {
  return true
}

// check other stuff

if !RestoreWasInterrupted(params.Cnf) {
params.Logger.Infof("Restore: No %v file found, checking no existing data is present", RestoreState)
// Wait for mysqld to be ready, in case it was launched in parallel with us.
// If this doesn't succeed, assume we should attempt a restore
if err := params.Mysqld.Wait(ctx, params.Cnf); err != nil {
return true
}
ok, _ := checkNoDB(ctx, params.Mysqld, params.DbName)
if !ok {
params.Logger.Infof("Auto-restore is enabled, but mysqld already contains data. Assuming vttablet was just restarted.")
_ = PopulateMetadataTables(params.Mysqld, params.LocalMetadata, params.DbName)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems unexpected that a function called ShouldRestore would have side effects like this. Unless ShouldRestore is called from many places, I'd suggest moving the call to PopulateMetadataTables to the call site so the either/or logic is clearer: either we restore, or we merely populate metadata.

}
return ok
}
}
return true
}

// Restore is the main entry point for backup restore. If there is no
// appropriate backup on the BackupStorage, Restore logs an error
// and returns ErrNoBackup. Any other error is returned.
Expand Down
127 changes: 67 additions & 60 deletions go/vt/vttablet/tabletmanager/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,9 @@ func (tm *TabletManager) RestoreData(ctx context.Context, logger logutil.Logger,
}

func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool) error {

tablet := tm.Tablet()
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sougou said yesterday that we shouldn't be using tm.Tablet() (display state) for programmatic decision making. Did we change our minds about that principle, or is this more of a temporary workaround until we can do a more thorough audit of all uses?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We found that it is being used in too many places, so it is better to make sure that is returns the most up-to-date state.

originalType := tablet.Type
if err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_RESTORE); err != nil {
return err
}

// Try to restore. Depending on the reason for failure, we may be ok.
// If we're not ok, return an error and the tm will log.Fatalf,
// causing the process to be restarted and the restore retried.
Expand Down Expand Up @@ -117,74 +114,84 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L
StartTime: logutil.ProtoToTime(keyspaceInfo.SnapshotTime),
}

// Loop until a backup exists, unless we were told to give up immediately.
var backupManifest *mysqlctl.BackupManifest
for {
backupManifest, err = mysqlctl.Restore(ctx, params)
if waitForBackupInterval == 0 {
break
if mysqlctl.ShouldRestore(ctx, params) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you see if this will read better if the check was moved to handleRestore in tm_init.go?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to keep it here so that calls from other call-sites also benefit from this change.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like this would read better as an early return since we really want to skip the rest of the function.

if !mysqlctl.ShouldRestore(ctx, params) {
  // Just populate metadata and then return.
}

// Do actual restore.

// should not become master after restore
if originalType == topodatapb.TabletType_MASTER {
originalType = tm.baseTabletType
}
// We only retry a specific set of errors. The rest we return immediately.
if err != mysqlctl.ErrNoBackup && err != mysqlctl.ErrNoCompleteBackup {
break
if err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_RESTORE); err != nil {
return err
}
// Loop until a backup exists, unless we were told to give up immediately.
var backupManifest *mysqlctl.BackupManifest
for {
backupManifest, err = mysqlctl.Restore(ctx, params)
if waitForBackupInterval == 0 {
break
}
// We only retry a specific set of errors. The rest we return immediately.
if err != mysqlctl.ErrNoBackup && err != mysqlctl.ErrNoCompleteBackup {
break
}

log.Infof("No backup found. Waiting %v (from -wait_for_backup_interval flag) to check again.", waitForBackupInterval)
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(waitForBackupInterval):
log.Infof("No backup found. Waiting %v (from -wait_for_backup_interval flag) to check again.", waitForBackupInterval)
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(waitForBackupInterval):
}
}
}

var pos mysql.Position
if backupManifest != nil {
pos = backupManifest.Position
}
// If SnapshotTime is set , then apply the incremental change
if keyspaceInfo.SnapshotTime != nil {
err = tm.restoreToTimeFromBinlog(ctx, pos, keyspaceInfo.SnapshotTime)
if err != nil {
log.Errorf("unable to restore to the specified time %s, error : %v", keyspaceInfo.SnapshotTime.String(), err)
return nil
var pos mysql.Position
if backupManifest != nil {
pos = backupManifest.Position
}
}
switch err {
case nil:
// Starting from here we won't be able to recover if we get stopped by a cancelled
// context. Thus we use the background context to get through to the finish.
if keyspaceInfo.KeyspaceType == topodatapb.KeyspaceType_NORMAL {
// Reconnect to master only for "NORMAL" keyspaces
if err := tm.startReplication(context.Background(), pos, originalType); err != nil {
return err
// If SnapshotTime is set , then apply the incremental change
if keyspaceInfo.SnapshotTime != nil {
err = tm.restoreToTimeFromBinlog(ctx, pos, keyspaceInfo.SnapshotTime)
if err != nil {
log.Errorf("unable to restore to the specified time %s, error : %v", keyspaceInfo.SnapshotTime.String(), err)
return nil
}
}
case mysqlctl.ErrNoBackup:
// No-op, starting with empty database.
case mysqlctl.ErrExistingDB:
// No-op, assuming we've just restarted. Note the
// replication reporter may restart replication at the
// next health check if it thinks it should. We do not
// alter replication here.
default:
// If anything failed, we should reset the original tablet type
if err := tm.tmState.ChangeTabletType(ctx, originalType); err != nil {
log.Errorf("Could not change back to original tablet type %v: %v", originalType, err)
switch err {
case nil:
// Starting from here we won't be able to recover if we get stopped by a cancelled
// context. Thus we use the background context to get through to the finish.
if keyspaceInfo.KeyspaceType == topodatapb.KeyspaceType_NORMAL {
// Reconnect to master only for "NORMAL" keyspaces
if err := tm.startReplication(context.Background(), pos, originalType); err != nil {
return err
}
}
case mysqlctl.ErrNoBackup:
// No-op, starting with empty database.
case mysqlctl.ErrExistingDB:
// No-op, assuming we've just restarted. Note the
// replication reporter may restart replication at the
// next health check if it thinks it should. We do not
// alter replication here.
default:
// If anything failed, we should reset the original tablet type
if err := tm.tmState.ChangeTabletType(ctx, originalType); err != nil {
log.Errorf("Could not change back to original tablet type %v: %v", originalType, err)
}
return vterrors.Wrap(err, "Can't restore backup")
}
return vterrors.Wrap(err, "Can't restore backup")
}

// If we had type BACKUP or RESTORE it's better to set our type to the init_tablet_type to make result of the restore
// similar to completely clean start from scratch.
if (originalType == topodatapb.TabletType_BACKUP || originalType == topodatapb.TabletType_RESTORE) && *initTabletType != "" {
initType, err := topoproto.ParseTabletType(*initTabletType)
if err == nil {
originalType = initType
// If we had type BACKUP or RESTORE it's better to set our type to the init_tablet_type to make result of the restore
// similar to completely clean start from scratch.
if (originalType == topodatapb.TabletType_BACKUP || originalType == topodatapb.TabletType_RESTORE) && *initTabletType != "" {
initType, err := topoproto.ParseTabletType(*initTabletType)
if err == nil {
originalType = initType
}
}
}

// Change type back to original type if we're ok to serve.
return tm.tmState.ChangeTabletType(ctx, originalType)
// Change type back to original type if we're ok to serve.
return tm.tmState.ChangeTabletType(ctx, originalType)
}
return nil
}

// restoreToTimeFromBinlog restores to the snapshot time of the keyspace
Expand Down
6 changes: 3 additions & 3 deletions go/vt/vttablet/tabletmanager/tm_init.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ func (tm *TabletManager) Start(tablet *topodatapb.Tablet, healthCheckInterval ti
return nil
}

tm.tmState.Open(tm.BatchCtx)
tm.tmState.Open()
return nil
}

Expand Down Expand Up @@ -473,7 +473,7 @@ func (tm *TabletManager) checkMastership(ctx context.Context, si *topo.ShardInfo
tablet.MasterTermStartTime = oldTablet.MasterTermStartTime
})
} else {
log.Warningf("Shard master alias matches, but existing tablet is not master. Switching to master with the shard's master term start time: %v", oldTablet.MasterTermStartTime)
log.Warningf("Shard master alias matches, but existing tablet is not master. Switching from %v to master with the shard's master term start time: %v", oldTablet.Type, si.MasterTermStartTime)
tm.tmState.UpdateTablet(func(tablet *topodatapb.Tablet) {
tablet.Type = topodatapb.TabletType_MASTER
tablet.MasterTermStartTime = si.MasterTermStartTime
Expand Down Expand Up @@ -599,7 +599,7 @@ func (tm *TabletManager) handleRestore(ctx context.Context) (bool, error) {
if *restoreFromBackup {
go func() {
// Open the state manager after restore is done.
defer tm.tmState.Open(ctx)
defer tm.tmState.Open()

// restoreFromBackup will just be a regular action
// (same as if it was triggered remotely)
Expand Down
65 changes: 65 additions & 0 deletions go/vt/vttablet/tabletmanager/tm_init_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,71 @@ func TestStartDoesNotUpdateReplicationDataForTabletInWrongShard(t *testing.T) {
assert.Equal(t, 0, len(tablets))
}

func TestCheckTabletTypeResets(t *testing.T) {
defer func(saved time.Duration) { rebuildKeyspaceRetryInterval = saved }(rebuildKeyspaceRetryInterval)
rebuildKeyspaceRetryInterval = 10 * time.Millisecond

ctx := context.Background()
cell := "cell1"
ts := memorytopo.NewServer(cell)
alias := &topodatapb.TabletAlias{
Cell: "cell1",
Uid: 1,
}

// 1. Initialize the tablet as REPLICA.
// This will create the respective topology records.
tm := newTestTM(t, ts, 1, "ks", "0")
tablet := tm.Tablet()
ensureSrvKeyspace(t, ts, cell, "ks")
ti, err := ts.GetTablet(ctx, alias)
require.NoError(t, err)
assert.Equal(t, topodatapb.TabletType_REPLICA, ti.Type)
tm.Stop()

// 2. Update tablet record with tabletType RESTORE
_, err = ts.UpdateTabletFields(ctx, alias, func(t *topodatapb.Tablet) error {
t.Type = topodatapb.TabletType_RESTORE
return nil
})
require.NoError(t, err)
err = tm.Start(tablet, 0)
require.NoError(t, err)
assert.Equal(t, tm.tmState.tablet.Type, tm.tmState.displayState.tablet.Type)
ti, err = ts.GetTablet(ctx, alias)
require.NoError(t, err)
// Verify that it changes back to initTabletType
assert.Equal(t, topodatapb.TabletType_REPLICA, ti.Type)

// 3. Update shard's master to our alias, then try to init again.
// (This simulates the case where the MasterAlias in the shard record says
// that we are the master but the tablet record says otherwise. In that case,
// we become master by inheriting the shard record's timestamp.)
now := time.Now()
_, err = ts.UpdateShardFields(ctx, "ks", "0", func(si *topo.ShardInfo) error {
si.MasterAlias = alias
si.MasterTermStartTime = logutil.TimeToProto(now)
// Reassign to now for easier comparison.
now = si.GetMasterTermStartTime()
return nil
})
require.NoError(t, err)
si, err := tm.createKeyspaceShard(ctx)
require.NoError(t, err)
err = tm.checkMastership(ctx, si)
require.NoError(t, err)
assert.Equal(t, tm.tmState.tablet.Type, tm.tmState.displayState.tablet.Type)
err = tm.initTablet(ctx)
require.NoError(t, err)
assert.Equal(t, tm.tmState.tablet.Type, tm.tmState.displayState.tablet.Type)
ti, err = ts.GetTablet(ctx, alias)
require.NoError(t, err)
assert.Equal(t, topodatapb.TabletType_MASTER, ti.Type)
ter0 := ti.GetMasterTermStartTime()
assert.Equal(t, now, ter0)
tm.Stop()
}

func newTestTM(t *testing.T, ts *topo.Server, uid int, keyspace, shard string) *TabletManager {
t.Helper()
ctx := context.Background()
Expand Down
11 changes: 7 additions & 4 deletions go/vt/vttablet/tabletmanager/tm_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,26 +68,28 @@ type tmState struct {
}

func newTMState(tm *TabletManager, tablet *topodatapb.Tablet) *tmState {
ctx, cancel := context.WithCancel(tm.BatchCtx)
return &tmState{
tm: tm,
displayState: displayState{
tablet: proto.Clone(tablet).(*topodatapb.Tablet),
},
tablet: tablet,
ctx: ctx,
cancel: cancel,
}
}

func (ts *tmState) Open(ctx context.Context) {
func (ts *tmState) Open() {
ts.mu.Lock()
defer ts.mu.Unlock()
if ts.isOpen {
return
}

ts.ctx, ts.cancel = context.WithCancel(ctx)
ts.isOpen = true
ts.updateLocked(ts.ctx)
ts.publishStateLocked(ctx)
ts.publishStateLocked(ts.ctx)
}

func (ts *tmState) Close() {
Expand Down Expand Up @@ -192,18 +194,19 @@ func (ts *tmState) UpdateTablet(update func(tablet *topodatapb.Tablet)) {
ts.mu.Lock()
defer ts.mu.Unlock()
update(ts.tablet)
ts.publishForDisplay()
}

func (ts *tmState) updateLocked(ctx context.Context) {
span, ctx := trace.NewSpan(ctx, "tmState.update")
defer span.Finish()
ts.publishForDisplay()

if !ts.isOpen {
return
}

terTime := logutil.ProtoToTime(ts.tablet.MasterTermStartTime)
ts.publishForDisplay()

// Disable TabletServer first so the nonserving state gets advertised
// before other services are shutdown.
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vttablet/tabletmanager/tm_state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ func TestStateOpenClose(t *testing.T) {
savedCtx := tm.tmState.ctx
tm.tmState.mu.Unlock()

tm.tmState.Open(context.Background())
tm.tmState.Open()

tm.tmState.mu.Lock()
assert.Equal(t, savedCtx, tm.tmState.ctx)
Expand Down
Loading