diff --git a/go/vt/mysqlctl/backupengine.go b/go/vt/mysqlctl/backupengine.go index 932fe2c1593..edb898c2ce9 100644 --- a/go/vt/mysqlctl/backupengine.go +++ b/go/vt/mysqlctl/backupengine.go @@ -217,8 +217,8 @@ func FindBackupToRestore(ctx context.Context, params RestoreParams, bhs []backup continue } } - if !checkBackupTime /* not snapshot */ || backupTime.Equal(params.StartTime) || backupTime.Before(params.StartTime) { - params.Logger.Infof("Restore: found backup %v %v to restore", bh.Directory(), bh.Name()) + if !checkBackupTime || backupTime.Equal(params.StartTime) || backupTime.Before(params.StartTime) { + params.Logger.Infof("Restore: found backup %v %v to restore using as start timestamp %v", bh.Directory(), bh.Name(), params.StartTime.Format(BackupTimestampFormat)) break } } diff --git a/go/vt/vttablet/tabletmanager/restore.go b/go/vt/vttablet/tabletmanager/restore.go index f0ad6f5448c..bef21b4d81a 100644 --- a/go/vt/vttablet/tabletmanager/restore.go +++ b/go/vt/vttablet/tabletmanager/restore.go @@ -45,7 +45,9 @@ import ( // It is only enabled if restore_from_backup is set. var ( - restoreFromBackup = flag.Bool("restore_from_backup", false, "(init restore parameter) will check BackupStorage for a recent backup at startup and start there") + restoreFromBackup = flag.Bool("restore_from_backup", false, "(init restore parameter) will check BackupStorage for a recent backup at startup and start there") + restoreFromBackupTs = flag.String("restore_from_backup_ts", "", "(init restore parameter) if set, restore the last backup taken at or before this timestamp. 
Example: '2021-04-29.133050'") + restoreConcurrency = flag.Int("restore_concurrency", 4, "(init restore parameter) how many concurrent files to restore at once") waitForBackupInterval = flag.Duration("wait_for_backup_interval", 0, "(init restore parameter) if this is greater than 0, instead of starting up empty when no backups are found, keep checking at this interval for a backup to appear") @@ -65,7 +67,7 @@ var ( // It will either work, fail gracefully, or return // an error in case of a non-recoverable error. // It takes the action lock so no RPC interferes. -func (tm *TabletManager) RestoreData(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool) error { +func (tm *TabletManager) RestoreData(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool, restoreFromBackupTs string) error { if err := tm.lock(ctx); err != nil { return err } @@ -119,7 +121,7 @@ func (tm *TabletManager) RestoreData(ctx context.Context, logger logutil.Logger, startTime = time.Now() - err = tm.restoreDataLocked(ctx, logger, waitForBackupInterval, deleteBeforeRestore) + err = tm.restoreDataLocked(ctx, logger, waitForBackupInterval, deleteBeforeRestore, restoreFromBackupTs) if err != nil { return err } @@ -137,7 +139,7 @@ func (tm *TabletManager) RestoreData(ctx context.Context, logger logutil.Logger, return nil } -func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool) error { +func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool, restoreFromBackupTs string) error { tablet := tm.Tablet() originalType := tablet.Type @@ -152,6 +154,17 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L if err != nil { return err } + + // Check if we need to use the latest or a custom 
backup timestamp for the restore + var startTime time.Time + + if restoreFromBackupTs != "" { + startTime, err = time.Parse(mysqlctl.BackupTimestampFormat, restoreFromBackupTs) + if err != nil { + return vterrors.New(vtrpcpb.Code_INVALID_ARGUMENT, fmt.Sprintf("unable to parse the timestamp passed via -restore_from_backup_ts: %v", err)) + } + } + // For a SNAPSHOT keyspace, we have to look for backups of BaseKeyspace // so we will pass the BaseKeyspace in RestoreParams instead of tablet.Keyspace if keyspaceInfo.KeyspaceType == topodatapb.KeyspaceType_SNAPSHOT { @@ -160,6 +173,9 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L } keyspace = keyspaceInfo.BaseKeyspace log.Infof("Using base_keyspace %v to restore keyspace %v", keyspace, tablet.Keyspace) + + startTime = logutil.ProtoToTime(keyspaceInfo.SnapshotTime) + log.Infof("Using %v as backup time", startTime) } params := mysqlctl.RestoreParams{ @@ -173,7 +189,7 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L DbName: topoproto.TabletDbName(tablet), Keyspace: keyspace, Shard: tablet.Shard, - StartTime: logutil.ProtoToTime(keyspaceInfo.SnapshotTime), + StartTime: startTime, } // Check whether we're going to restore before changing to RESTORE type, diff --git a/go/vt/vttablet/tabletmanager/rpc_backup.go b/go/vt/vttablet/tabletmanager/rpc_backup.go index caaf7107790..2b590a5dbc5 100644 --- a/go/vt/vttablet/tabletmanager/rpc_backup.go +++ b/go/vt/vttablet/tabletmanager/rpc_backup.go @@ -150,7 +150,7 @@ func (tm *TabletManager) Backup(ctx context.Context, concurrency int, logger log return returnErr } -// RestoreFromBackup deletes all local data and restores anew from the latest backup. +// RestoreFromBackup deletes all local data and restores anew from the latest backup. 
func (tm *TabletManager) RestoreFromBackup(ctx context.Context, logger logutil.Logger) error { if err := tm.lock(ctx); err != nil { return err @@ -169,7 +169,7 @@ func (tm *TabletManager) RestoreFromBackup(ctx context.Context, logger logutil.L l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // now we can run restore - err = tm.restoreDataLocked(ctx, l, 0 /* waitForBackupInterval */, true /* deleteBeforeRestore */) + err = tm.restoreDataLocked(ctx, l, 0 /* waitForBackupInterval */, true /* deleteBeforeRestore */, "" /*restoreFromBackupTs */) // re-run health check to be sure to capture any replication delay tm.QueryServiceControl.BroadcastHealth() diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go index 79c78d9b235..a77bee1babc 100644 --- a/go/vt/vttablet/tabletmanager/tm_init.go +++ b/go/vt/vttablet/tabletmanager/tm_init.go @@ -644,7 +644,7 @@ func (tm *TabletManager) handleRestore(ctx context.Context) (bool, error) { // restoreFromBackup will just be a regular action // (same as if it was triggered remotely) - if err := tm.RestoreData(ctx, logutil.NewConsoleLogger(), *waitForBackupInterval, false /* deleteBeforeRestore */); err != nil { + if err := tm.RestoreData(ctx, logutil.NewConsoleLogger(), *waitForBackupInterval, false /* deleteBeforeRestore */, *restoreFromBackupTs); err != nil { log.Exitf("RestoreFromBackup failed: %v", err) } }() diff --git a/go/vt/wrangler/testlib/backup_test.go b/go/vt/wrangler/testlib/backup_test.go index 249bc1afa24..064319fb288 100644 --- a/go/vt/wrangler/testlib/backup_test.go +++ b/go/vt/wrangler/testlib/backup_test.go @@ -190,7 +190,7 @@ func TestBackupRestore(t *testing.T) { RelayLogInfoPath: path.Join(root, "relay-log.info"), } - require.NoError(t, destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */)) + require.NoError(t, destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* 
waitForBackupInterval */, false /* deleteBeforeRestore */, "" /* restoreFromBackupTs */)) // verify the full status require.NoError(t, destTablet.FakeMysqlDaemon.CheckSuperQueryList(), "destTablet.FakeMysqlDaemon.CheckSuperQueryList failed") assert.True(t, destTablet.FakeMysqlDaemon.Replicating) @@ -224,7 +224,7 @@ func TestBackupRestore(t *testing.T) { master.FakeMysqlDaemon.SetReplicationPositionPos = master.FakeMysqlDaemon.CurrentMasterPosition // restore master from backup - require.NoError(t, master.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */), "RestoreData failed") + require.NoError(t, master.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, "" /* restoreFromBackupTs */), "RestoreData failed") // tablet was created as MASTER, so it's baseTabletType is MASTER assert.Equal(t, topodatapb.TabletType_MASTER, master.Tablet.Type) assert.False(t, master.FakeMysqlDaemon.Replicating) @@ -238,7 +238,7 @@ func TestBackupRestore(t *testing.T) { "SHOW TABLES FROM `vt_test_keyspace`": {Rows: [][]sqltypes.Value{{sqltypes.NewVarBinary("a")}}}, } - require.NoError(t, master.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */), "RestoreData failed") + require.NoError(t, master.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, "" /* restoreFromBackupTs */), "RestoreData failed") // Tablet type should not change assert.Equal(t, topodatapb.TabletType_MASTER, master.Tablet.Type) assert.False(t, master.FakeMysqlDaemon.Replicating) @@ -416,7 +416,7 @@ func TestBackupRestoreLagged(t *testing.T) { errCh = make(chan error, 1) go func(ctx context.Context, tablet *FakeTablet) { - errCh <- tablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */) + errCh <- tablet.TM.RestoreData(ctx, 
logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, "" /* restoreFromBackupTs */) }(ctx, destTablet) timer = time.NewTicker(1 * time.Second) @@ -588,7 +588,7 @@ func TestRestoreUnreachableMaster(t *testing.T) { // set a short timeout so that we don't have to wait 30 seconds *topo.RemoteOperationTimeout = 2 * time.Second // Restore should still succeed - require.NoError(t, destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */)) + require.NoError(t, destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, "" /* restoreFromBackupTs */)) // verify the full status require.NoError(t, destTablet.FakeMysqlDaemon.CheckSuperQueryList(), "destTablet.FakeMysqlDaemon.CheckSuperQueryList failed") assert.True(t, destTablet.FakeMysqlDaemon.Replicating) @@ -739,7 +739,7 @@ func TestDisableActiveReparents(t *testing.T) { RelayLogInfoPath: path.Join(root, "relay-log.info"), } - require.NoError(t, destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */)) + require.NoError(t, destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, "" /* restoreFromBackupTs */)) // verify the full status require.NoError(t, destTablet.FakeMysqlDaemon.CheckSuperQueryList(), "destTablet.FakeMysqlDaemon.CheckSuperQueryList failed") assert.False(t, destTablet.FakeMysqlDaemon.Replicating)