diff --git a/go/vt/mysqlctl/backupengine.go b/go/vt/mysqlctl/backupengine.go index 2eb59cf9e11..9beeab75782 100644 --- a/go/vt/mysqlctl/backupengine.go +++ b/go/vt/mysqlctl/backupengine.go @@ -41,6 +41,7 @@ var ( type BackupEngine interface { ExecuteBackup(ctx context.Context, cnf *Mycnf, mysqld MysqlDaemon, logger logutil.Logger, bh backupstorage.BackupHandle, backupConcurrency int, hookExtraEnv map[string]string) (bool, error) ExecuteRestore(ctx context.Context, cnf *Mycnf, mysqld MysqlDaemon, logger logutil.Logger, dir string, bhs []backupstorage.BackupHandle, restoreConcurrency int, hookExtraEnv map[string]string) (mysql.Position, error) + ShouldDrainForBackup() bool } // BackupEngineMap contains the registered implementations for BackupEngine diff --git a/go/vt/mysqlctl/builtinbackupengine.go b/go/vt/mysqlctl/builtinbackupengine.go index 8d67819802c..71d352190a6 100644 --- a/go/vt/mysqlctl/builtinbackupengine.go +++ b/go/vt/mysqlctl/builtinbackupengine.go @@ -670,6 +670,12 @@ func (be *BuiltinBackupEngine) restoreFile(ctx context.Context, cnf *Mycnf, bh b return nil } +// ShouldDrainForBackup satisfies the BackupEngine interface +// backup requires query service to be stopped, hence true +func (be *BuiltinBackupEngine) ShouldDrainForBackup() bool { + return true +} + func init() { BackupEngineMap["builtin"] = &BuiltinBackupEngine{} } diff --git a/go/vt/mysqlctl/xtrabackupengine.go b/go/vt/mysqlctl/xtrabackupengine.go index 19ad51ac8a5..45520740e0a 100644 --- a/go/vt/mysqlctl/xtrabackupengine.go +++ b/go/vt/mysqlctl/xtrabackupengine.go @@ -730,6 +730,12 @@ func stripeReader(readers []io.Reader, blockSize int64) io.Reader { return reader } +// ShouldDrainForBackup satisfies the BackupEngine interface +// xtrabackup can run while tablet is serving, hence false +func (be *XtrabackupEngine) ShouldDrainForBackup() bool { + return false +} + func init() { BackupEngineMap[xtrabackupBackupMethod] = &XtrabackupEngine{} } diff --git a/go/vt/vttablet/tabletmanager/action_agent.go b/go/vt/vttablet/tabletmanager/action_agent.go index f84931db386..d6a406a3257 100644 --- a/go/vt/vttablet/tabletmanager/action_agent.go +++ b/go/vt/vttablet/tabletmanager/action_agent.go @@ -109,6 +109,10 @@ type ActionAgent struct { // only used if exportStats is true. statsTabletTypeCount *stats.CountersWithSingleLabel + // statsBackupIsRunning is set to 1 (true) if a backup is running + // only used if exportStats is true + statsBackupIsRunning *stats.GaugesWithMultiLabels + // batchCtx is given to the agent by its creator, and should be used for // any background tasks spawned by the agent. batchCtx context.Context @@ -209,8 +213,8 @@ type ActionAgent struct { // _lockTablesConnection is used to get and release the table read locks to pause replication _lockTablesConnection *dbconnpool.DBConnection _lockTablesTimer *time.Timer - // unused - //_lockTablesTimeout *time.Duration + // _isBackupRunning tells us whether there is a backup that is currently running + _isBackupRunning bool } // NewActionAgent creates a new ActionAgent and registers all the @@ -262,6 +266,7 @@ func NewActionAgent( agent.exportStats = true agent.statsTabletType = stats.NewString("TabletType") agent.statsTabletTypeCount = stats.NewCountersWithSingleLabel("TabletTypeCount", "Number of times the tablet changed to the labeled type", "type") + agent.statsBackupIsRunning = stats.NewGaugesWithMultiLabels("BackupIsRunning", "Whether a backup is running", []string{"mode"}) var mysqlHost string var mysqlPort int32 diff --git a/go/vt/vttablet/tabletmanager/rpc_backup.go b/go/vt/vttablet/tabletmanager/rpc_backup.go index c4b9b06fca4..b2ffffd3933 100644 --- a/go/vt/vttablet/tabletmanager/rpc_backup.go +++ b/go/vt/vttablet/tabletmanager/rpc_backup.go @@ -30,13 +30,13 @@ import ( topodatapb "vitess.io/vitess/go/vt/proto/topodata" ) +const ( + backupModeOnline = "online" + backupModeOffline = "offline" +) + // Backup takes a db backup and sends it to the BackupStorage func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger logutil.Logger, allowMaster bool) error { - if err := agent.lock(ctx); err != nil { - return err - } - defer agent.unlock() - if agent.Cnf == nil { return fmt.Errorf("cannot perform backup without my.cnf, please restart vttablet with a my.cnf file specified") } @@ -49,7 +49,11 @@ func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger lo if !allowMaster && currentTablet.Type == topodatapb.TabletType_MASTER { return fmt.Errorf("type MASTER cannot take backup. if you really need to do this, rerun the backup command with -allow_master") } - + engine, err := mysqlctl.GetBackupEngine() + if err != nil { + return vterrors.Wrap(err, "failed to find backup engine") + } + // get Tablet info from topo so that it is up to date tablet, err := agent.TopoServer.GetTablet(ctx, agent.TabletAlias) if err != nil { return err @@ -57,14 +61,28 @@ func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger lo if !allowMaster && tablet.Type == topodatapb.TabletType_MASTER { return fmt.Errorf("type MASTER cannot take backup. if you really need to do this, rerun the backup command with -allow_master") } - originalType := tablet.Type - engine, err := mysqlctl.GetBackupEngine() - if err != nil { - return vterrors.Wrap(err, "failed to find backup engine") + // prevent concurrent backups, and record stats + backupMode := backupModeOnline + if engine.ShouldDrainForBackup() { + backupMode = backupModeOffline } - builtin, _ := engine.(*mysqlctl.BuiltinBackupEngine) - if builtin != nil { + if err := agent.beginBackup(backupMode); err != nil { + return err + } + defer agent.endBackup(backupMode) + + var originalType topodatapb.TabletType + if engine.ShouldDrainForBackup() { + if err := agent.lock(ctx); err != nil { + return err + } + defer agent.unlock() + tablet, err := agent.TopoServer.GetTablet(ctx, agent.TabletAlias) + if err != nil { + return err + } + originalType = tablet.Type // update our type to BACKUP if _, err := topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, topodatapb.TabletType_BACKUP); err != nil { return err @@ -83,8 +101,7 @@ func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger lo name := fmt.Sprintf("%v.%v", time.Now().UTC().Format("2006-01-02.150405"), topoproto.TabletAliasString(tablet.Alias)) returnErr := mysqlctl.Backup(ctx, agent.Cnf, agent.MysqlDaemon, l, dir, name, concurrency, agent.hookExtraEnv()) - if builtin != nil { - + if engine.ShouldDrainForBackup() { bgCtx := context.Background() // Starting from here we won't be able to recover if we get stopped by a cancelled // context. It is also possible that the context already timed out during the @@ -138,3 +155,32 @@ func (agent *ActionAgent) RestoreFromBackup(ctx context.Context, logger logutil. return err } + +func (agent *ActionAgent) beginBackup(backupMode string) error { + agent.mutex.Lock() + defer agent.mutex.Unlock() + if agent._isBackupRunning { + return fmt.Errorf("a backup is already running on tablet: %v", agent.TabletAlias) + } + // when mode is online we don't take the action lock, so we continue to serve, + // but let's set _isBackupRunning to true + // so that we only allow one online backup at a time + // offline backups also run only one at a time because we take the action lock + // so this is not really needed in that case, however we are using it to record the state + agent._isBackupRunning = true + if agent.exportStats { + agent.statsBackupIsRunning.Set([]string{backupMode}, 1) + } + return nil +} + +func (agent *ActionAgent) endBackup(backupMode string) { + // now we set _isBackupRunning back to false + // have to take the mutex lock before writing to _ fields + agent.mutex.Lock() + defer agent.mutex.Unlock() + agent._isBackupRunning = false + if agent.exportStats { + agent.statsBackupIsRunning.Set([]string{backupMode}, 0) + } +}