Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go/vt/mysqlctl/backupengine.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ var (
// BackupEngine is the interface a backup/restore implementation must satisfy
// to be registered in BackupEngineMap.
type BackupEngine interface {
// ExecuteBackup takes a backup of the given MySQL instance and writes it to
// bh. (bool return presumably indicates whether the backup is usable — TODO
// confirm against implementations.)
ExecuteBackup(ctx context.Context, cnf *Mycnf, mysqld MysqlDaemon, logger logutil.Logger, bh backupstorage.BackupHandle, backupConcurrency int, hookExtraEnv map[string]string) (bool, error)
// ExecuteRestore restores from the given backup handles and returns the
// replication position the restored data corresponds to.
ExecuteRestore(ctx context.Context, cnf *Mycnf, mysqld MysqlDaemon, logger logutil.Logger, dir string, bhs []backupstorage.BackupHandle, restoreConcurrency int, hookExtraEnv map[string]string) (mysql.Position, error)
// ShouldDrainForBackup reports whether the tablet must stop serving
// queries (be drained) while this engine takes a backup.
ShouldDrainForBackup() bool
}

// BackupEngineMap contains the registered implementations for BackupEngine
Expand Down
6 changes: 6 additions & 0 deletions go/vt/mysqlctl/builtinbackupengine.go
Original file line number Diff line number Diff line change
Expand Up @@ -670,6 +670,12 @@ func (be *BuiltinBackupEngine) restoreFile(ctx context.Context, cnf *Mycnf, bh b
return nil
}

// ShouldDrainForBackup satisfies the BackupEngine interface.
// The builtin engine requires the query service to be stopped for the
// duration of the backup, so it always returns true.
func (be *BuiltinBackupEngine) ShouldDrainForBackup() bool {
return true
}

// init registers the builtin backup engine under the name "builtin".
func init() {
BackupEngineMap["builtin"] = &BuiltinBackupEngine{}
}
6 changes: 6 additions & 0 deletions go/vt/mysqlctl/xtrabackupengine.go
Original file line number Diff line number Diff line change
Expand Up @@ -730,6 +730,12 @@ func stripeReader(readers []io.Reader, blockSize int64) io.Reader {
return reader
}

// ShouldDrainForBackup satisfies the BackupEngine interface.
// xtrabackup can run while the tablet is still serving queries, so it
// always returns false.
func (be *XtrabackupEngine) ShouldDrainForBackup() bool {
return false
}

// init registers the xtrabackup engine under its configured method name.
func init() {
BackupEngineMap[xtrabackupBackupMethod] = &XtrabackupEngine{}
}
9 changes: 7 additions & 2 deletions go/vt/vttablet/tabletmanager/action_agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ type ActionAgent struct {
// only used if exportStats is true.
statsTabletTypeCount *stats.CountersWithSingleLabel

// statsBackupIsRunning is set to 1 (true) if a backup is running
// only used if exportStats is true
statsBackupIsRunning *stats.GaugesWithMultiLabels

// batchCtx is given to the agent by its creator, and should be used for
// any background tasks spawned by the agent.
batchCtx context.Context
Expand Down Expand Up @@ -209,8 +213,8 @@ type ActionAgent struct {
// _lockTablesConnection is used to get and release the table read locks to pause replication
_lockTablesConnection *dbconnpool.DBConnection
_lockTablesTimer *time.Timer
// unused
//_lockTablesTimeout *time.Duration
// _isBackupRunning tells us whether there is a backup that is currently running
_isBackupRunning bool
}

// NewActionAgent creates a new ActionAgent and registers all the
Expand Down Expand Up @@ -262,6 +266,7 @@ func NewActionAgent(
agent.exportStats = true
agent.statsTabletType = stats.NewString("TabletType")
agent.statsTabletTypeCount = stats.NewCountersWithSingleLabel("TabletTypeCount", "Number of times the tablet changed to the labeled type", "type")
agent.statsBackupIsRunning = stats.NewGaugesWithMultiLabels("BackupIsRunning", "Whether a backup is running", []string{"mode"})

var mysqlHost string
var mysqlPort int32
Expand Down
74 changes: 60 additions & 14 deletions go/vt/vttablet/tabletmanager/rpc_backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@ import (
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
)

const (
backupModeOnline = "online"
backupModeOffline = "offline"
)

// Backup takes a db backup and sends it to the BackupStorage
func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger logutil.Logger, allowMaster bool) error {
if err := agent.lock(ctx); err != nil {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just had a thought: Should we have a separate lock to allow only one backup at a time? I guess technically it might work to have multiple xtrabackups running, but it's probably not a good idea?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would not locking allow it to be promoted to master? Is that ok?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 for cluster backup lock. It is unlikely you would want a new backup if one is already running, and limiting it helps ensure service only degrades so much.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm as far as the tablet is concerned, I think that would be acceptable. It's better than blocking promotion to master because xtrabackup is running; presumably you are promoting this one because the current master is in bad shape anyway. WDYT?

I don't know for sure if XtraBackup will still produce a consistent snapshot in that case, but I would expect it to.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know a specific reason for it to not be promoted to master, just bringing it up for discussion. Would the tablet type still read BACKUP? My guess is that would prevent most workloads from choosing it to failover to, even though they probably should like you said.

Copy link
Copy Markdown
Collaborator Author

@deepthi deepthi Aug 9, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we are not changing the tablet type to BACKUP while xtrabackup is running, which means a REPLICA would be eligible for master promotion.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense. Will there be any knowledge that a backup is running? I would imagine that when selecting a tablet for master promotion, you'd want to prefer a replica not currently running a backup. I would hope to get that info in wr.ShardReplicationStatuses or something like it.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there should be, but I don't think there is now. Another thing we should fix.

return err
}
defer agent.unlock()

if agent.Cnf == nil {
return fmt.Errorf("cannot perform backup without my.cnf, please restart vttablet with a my.cnf file specified")
}
Expand All @@ -49,22 +49,40 @@ func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger lo
if !allowMaster && currentTablet.Type == topodatapb.TabletType_MASTER {
return fmt.Errorf("type MASTER cannot take backup. if you really need to do this, rerun the backup command with -allow_master")
}

engine, err := mysqlctl.GetBackupEngine()
if err != nil {
return vterrors.Wrap(err, "failed to find backup engine")
}
// get Tablet info from topo so that it is up to date
tablet, err := agent.TopoServer.GetTablet(ctx, agent.TabletAlias)
if err != nil {
return err
}
if !allowMaster && tablet.Type == topodatapb.TabletType_MASTER {
return fmt.Errorf("type MASTER cannot take backup. if you really need to do this, rerun the backup command with -allow_master")
}
originalType := tablet.Type

engine, err := mysqlctl.GetBackupEngine()
if err != nil {
return vterrors.Wrap(err, "failed to find backup engine")
// prevent concurrent backups, and record stats
backupMode := backupModeOnline
if engine.ShouldDrainForBackup() {
backupMode = backupModeOffline
}
builtin, _ := engine.(*mysqlctl.BuiltinBackupEngine)
if builtin != nil {
if err := agent.beginBackup(backupMode); err != nil {
return err
}
defer agent.endBackup(backupMode)

var originalType topodatapb.TabletType
if engine.ShouldDrainForBackup() {
if err := agent.lock(ctx); err != nil {
return err
}
defer agent.unlock()
tablet, err := agent.TopoServer.GetTablet(ctx, agent.TabletAlias)
if err != nil {
return err
}
originalType = tablet.Type
// update our type to BACKUP
if _, err := topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, topodatapb.TabletType_BACKUP); err != nil {
return err
Expand All @@ -83,8 +101,7 @@ func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger lo
name := fmt.Sprintf("%v.%v", time.Now().UTC().Format("2006-01-02.150405"), topoproto.TabletAliasString(tablet.Alias))
returnErr := mysqlctl.Backup(ctx, agent.Cnf, agent.MysqlDaemon, l, dir, name, concurrency, agent.hookExtraEnv())

if builtin != nil {

if engine.ShouldDrainForBackup() {
bgCtx := context.Background()
// Starting from here we won't be able to recover if we get stopped by a cancelled
// context. It is also possible that the context already timed out during the
Expand Down Expand Up @@ -138,3 +155,32 @@ func (agent *ActionAgent) RestoreFromBackup(ctx context.Context, logger logutil.

return err
}

// beginBackup marks this tablet as having a backup in progress, returning an
// error if one is already running. At most one backup (of either mode) may
// run on a tablet at a time.
//
// For online backups the action lock is not taken and the tablet keeps
// serving, so the _isBackupRunning flag is what enforces the one-at-a-time
// rule. Offline backups are already serialized by the action lock; the flag
// is still set so the state is recorded uniformly for both modes.
func (agent *ActionAgent) beginBackup(backupMode string) error {
	agent.mutex.Lock()
	defer agent.mutex.Unlock()

	if agent._isBackupRunning {
		return fmt.Errorf("a backup is already running on tablet: %v", agent.TabletAlias)
	}
	agent._isBackupRunning = true

	// Reflect the new state in the exported gauge, labeled by backup mode.
	if agent.exportStats {
		agent.statsBackupIsRunning.Set([]string{backupMode}, 1)
	}
	return nil
}

// endBackup clears the backup-in-progress flag set by beginBackup and resets
// the exported gauge for the given backup mode.
func (agent *ActionAgent) endBackup(backupMode string) {
	// Underscore-prefixed fields may only be written while holding the mutex.
	agent.mutex.Lock()
	defer agent.mutex.Unlock()

	agent._isBackupRunning = false
	if agent.exportStats {
		agent.statsBackupIsRunning.Set([]string{backupMode}, 0)
	}
}