-
Notifications
You must be signed in to change notification settings - Fork 2.3k
tm init: publish displayState #6641
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -218,6 +218,28 @@ func removeExistingFiles(cnf *Mycnf) error { | |
| return nil | ||
| } | ||
|
|
||
| // ShouldRestore checks whether a database with tables already exists | ||
| // and returns whether a restore action should be performed | ||
| func ShouldRestore(ctx context.Context, params RestoreParams) bool { | ||
| if !params.DeleteBeforeRestore { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Now that this is in a function, it might be easier to read like:
if params.DeleteBeforeRestore || RestoreWasInterrupted(params.Cnf) {
	return true
}
// check other stuff |
||
| if !RestoreWasInterrupted(params.Cnf) { | ||
| params.Logger.Infof("Restore: No %v file found, checking no existing data is present", RestoreState) | ||
| // Wait for mysqld to be ready, in case it was launched in parallel with us. | ||
| // If this doesn't succeed, assume we should attempt a restore | ||
| if err := params.Mysqld.Wait(ctx, params.Cnf); err != nil { | ||
| return true | ||
| } | ||
| ok, _ := checkNoDB(ctx, params.Mysqld, params.DbName) | ||
| if !ok { | ||
| params.Logger.Infof("Auto-restore is enabled, but mysqld already contains data. Assuming vttablet was just restarted.") | ||
| _ = PopulateMetadataTables(params.Mysqld, params.LocalMetadata, params.DbName) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems unexpected that a function called |
||
| } | ||
| return ok | ||
| } | ||
| } | ||
| return true | ||
| } | ||
|
|
||
| // Restore is the main entry point for backup restore. If there is no | ||
| // appropriate backup on the BackupStorage, Restore logs an error | ||
| // and returns ErrNoBackup. Any other error is returned. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -76,12 +76,9 @@ func (tm *TabletManager) RestoreData(ctx context.Context, logger logutil.Logger, | |
| } | ||
|
|
||
| func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool) error { | ||
|
|
||
| tablet := tm.Tablet() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @sougou said yesterday that we shouldn't be using
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We found that it is being used in too many places, so it is better to make sure that it returns the most up-to-date state. |
||
| originalType := tablet.Type | ||
| if err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_RESTORE); err != nil { | ||
| return err | ||
| } | ||
|
|
||
| // Try to restore. Depending on the reason for failure, we may be ok. | ||
| // If we're not ok, return an error and the tm will log.Fatalf, | ||
| // causing the process to be restarted and the restore retried. | ||
|
|
@@ -117,74 +114,84 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L | |
| StartTime: logutil.ProtoToTime(keyspaceInfo.SnapshotTime), | ||
| } | ||
|
|
||
| // Loop until a backup exists, unless we were told to give up immediately. | ||
| var backupManifest *mysqlctl.BackupManifest | ||
| for { | ||
| backupManifest, err = mysqlctl.Restore(ctx, params) | ||
| if waitForBackupInterval == 0 { | ||
| break | ||
| if mysqlctl.ShouldRestore(ctx, params) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you see if this will read better if the check was moved to
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better to keep it here so that calls from other call-sites also benefit from this change.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I feel like this would read better as an early return since we really want to skip the rest of the function:
if !mysqlctl.ShouldRestore(ctx, params) {
	// Just populate metadata and then return.
}
// Do actual restore. |
||
| // should not become master after restore | ||
| if originalType == topodatapb.TabletType_MASTER { | ||
| originalType = tm.baseTabletType | ||
| } | ||
| // We only retry a specific set of errors. The rest we return immediately. | ||
| if err != mysqlctl.ErrNoBackup && err != mysqlctl.ErrNoCompleteBackup { | ||
| break | ||
| if err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_RESTORE); err != nil { | ||
| return err | ||
| } | ||
| // Loop until a backup exists, unless we were told to give up immediately. | ||
| var backupManifest *mysqlctl.BackupManifest | ||
| for { | ||
| backupManifest, err = mysqlctl.Restore(ctx, params) | ||
| if waitForBackupInterval == 0 { | ||
| break | ||
| } | ||
| // We only retry a specific set of errors. The rest we return immediately. | ||
| if err != mysqlctl.ErrNoBackup && err != mysqlctl.ErrNoCompleteBackup { | ||
| break | ||
| } | ||
|
|
||
| log.Infof("No backup found. Waiting %v (from -wait_for_backup_interval flag) to check again.", waitForBackupInterval) | ||
| select { | ||
| case <-ctx.Done(): | ||
| return ctx.Err() | ||
| case <-time.After(waitForBackupInterval): | ||
| log.Infof("No backup found. Waiting %v (from -wait_for_backup_interval flag) to check again.", waitForBackupInterval) | ||
| select { | ||
| case <-ctx.Done(): | ||
| return ctx.Err() | ||
| case <-time.After(waitForBackupInterval): | ||
| } | ||
| } | ||
| } | ||
|
|
||
| var pos mysql.Position | ||
| if backupManifest != nil { | ||
| pos = backupManifest.Position | ||
| } | ||
| // If SnapshotTime is set , then apply the incremental change | ||
| if keyspaceInfo.SnapshotTime != nil { | ||
| err = tm.restoreToTimeFromBinlog(ctx, pos, keyspaceInfo.SnapshotTime) | ||
| if err != nil { | ||
| log.Errorf("unable to restore to the specified time %s, error : %v", keyspaceInfo.SnapshotTime.String(), err) | ||
| return nil | ||
| var pos mysql.Position | ||
| if backupManifest != nil { | ||
| pos = backupManifest.Position | ||
| } | ||
| } | ||
| switch err { | ||
| case nil: | ||
| // Starting from here we won't be able to recover if we get stopped by a cancelled | ||
| // context. Thus we use the background context to get through to the finish. | ||
| if keyspaceInfo.KeyspaceType == topodatapb.KeyspaceType_NORMAL { | ||
| // Reconnect to master only for "NORMAL" keyspaces | ||
| if err := tm.startReplication(context.Background(), pos, originalType); err != nil { | ||
| return err | ||
| // If SnapshotTime is set , then apply the incremental change | ||
| if keyspaceInfo.SnapshotTime != nil { | ||
| err = tm.restoreToTimeFromBinlog(ctx, pos, keyspaceInfo.SnapshotTime) | ||
| if err != nil { | ||
| log.Errorf("unable to restore to the specified time %s, error : %v", keyspaceInfo.SnapshotTime.String(), err) | ||
| return nil | ||
| } | ||
| } | ||
| case mysqlctl.ErrNoBackup: | ||
| // No-op, starting with empty database. | ||
| case mysqlctl.ErrExistingDB: | ||
| // No-op, assuming we've just restarted. Note the | ||
| // replication reporter may restart replication at the | ||
| // next health check if it thinks it should. We do not | ||
| // alter replication here. | ||
| default: | ||
| // If anything failed, we should reset the original tablet type | ||
| if err := tm.tmState.ChangeTabletType(ctx, originalType); err != nil { | ||
| log.Errorf("Could not change back to original tablet type %v: %v", originalType, err) | ||
| switch err { | ||
| case nil: | ||
| // Starting from here we won't be able to recover if we get stopped by a cancelled | ||
| // context. Thus we use the background context to get through to the finish. | ||
| if keyspaceInfo.KeyspaceType == topodatapb.KeyspaceType_NORMAL { | ||
| // Reconnect to master only for "NORMAL" keyspaces | ||
| if err := tm.startReplication(context.Background(), pos, originalType); err != nil { | ||
| return err | ||
| } | ||
| } | ||
| case mysqlctl.ErrNoBackup: | ||
| // No-op, starting with empty database. | ||
| case mysqlctl.ErrExistingDB: | ||
| // No-op, assuming we've just restarted. Note the | ||
| // replication reporter may restart replication at the | ||
| // next health check if it thinks it should. We do not | ||
| // alter replication here. | ||
| default: | ||
| // If anything failed, we should reset the original tablet type | ||
| if err := tm.tmState.ChangeTabletType(ctx, originalType); err != nil { | ||
| log.Errorf("Could not change back to original tablet type %v: %v", originalType, err) | ||
| } | ||
| return vterrors.Wrap(err, "Can't restore backup") | ||
| } | ||
| return vterrors.Wrap(err, "Can't restore backup") | ||
| } | ||
|
|
||
| // If we had type BACKUP or RESTORE it's better to set our type to the init_tablet_type to make result of the restore | ||
| // similar to completely clean start from scratch. | ||
| if (originalType == topodatapb.TabletType_BACKUP || originalType == topodatapb.TabletType_RESTORE) && *initTabletType != "" { | ||
| initType, err := topoproto.ParseTabletType(*initTabletType) | ||
| if err == nil { | ||
| originalType = initType | ||
| // If we had type BACKUP or RESTORE it's better to set our type to the init_tablet_type to make result of the restore | ||
| // similar to completely clean start from scratch. | ||
| if (originalType == topodatapb.TabletType_BACKUP || originalType == topodatapb.TabletType_RESTORE) && *initTabletType != "" { | ||
| initType, err := topoproto.ParseTabletType(*initTabletType) | ||
| if err == nil { | ||
| originalType = initType | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Change type back to original type if we're ok to serve. | ||
| return tm.tmState.ChangeTabletType(ctx, originalType) | ||
| // Change type back to original type if we're ok to serve. | ||
| return tm.tmState.ChangeTabletType(ctx, originalType) | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| // restoreToTimeFromBinlog restores to the snapshot time of the keyspace | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd feel better if we returned `(bool, error)` so the caller can distinguish between "we should restore" and "we failed to determine whether we should restore". That's how `Restore()` itself currently works. Is there a reason you thought this one should be different?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also would it make sense for Restore() to call ShouldRestore()?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I went back and forth between returning an error or not. Since you feel that is better, I'll change it.