Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ require (
github.com/golang/mock v1.3.1
github.com/golang/protobuf v1.3.2
github.com/golang/snappy v0.0.0-20170215233205-553a64147049
github.com/google/btree v1.0.0 // indirect
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do these new entries persist after go mod tidy? I don't quite understand what's happening, but I've noticed the go tool adding some things that we don't actually need.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hadn't intended to commit a new go.mod :(
But this one diff does persist after go mod tidy

github.com/gorilla/websocket v0.0.0-20160912153041-2d1e4548da23
github.com/grpc-ecosystem/go-grpc-middleware v1.1.0
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0
Expand Down
4 changes: 4 additions & 0 deletions go/vt/mysqlctl/builtinbackupengine.go
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,9 @@ func (be *BuiltinBackupEngine) ExecuteBackup(ctx context.Context, params BackupP
}
if !replicationPosition.Equal(masterPos) {
for {
if err := ctx.Err(); err != nil {
return usable, err
}
status, err := mysqld.SlaveStatus()
if err != nil {
return usable, err
Expand All @@ -375,6 +378,7 @@ func (be *BuiltinBackupEngine) ExecuteBackup(ctx context.Context, params BackupP
if !newPos.Equal(replicationPosition) {
break
}
time.Sleep(1 * time.Second)
}
}
}
Expand Down
11 changes: 10 additions & 1 deletion go/vt/vttablet/tabletmanager/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,12 @@ func (agent *ActionAgent) startReplication(ctx context.Context, pos mysql.Positi
defer remoteCancel()
posStr, err := tmc.MasterPosition(remoteCtx, ti.Tablet)
if err != nil {
return vterrors.Wrap(err, "can't get master replication position")
// It is possible that though MasterAlias is set, the master tablet is unreachable
// Log a warning and let tablet restore in that case
// If we had instead considered this fatal, all tablets would crash-loop
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we change one of the e2e test cases to take the master down before restoring one of the tablets? Would that have caught this?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was able to reproduce with a unit test, and verified that the fix works.

// until a master appears, which would make it impossible to elect a master.
log.Warningf("Can't get master replication position after restore: %v", err)
return nil
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't leave a line comment down there, so leaving it here.

The loop on line 248 seems like it will hot-loop indefinitely if replication never starts. Could we add a 1s delay between retries, and check if the context has been cancelled before each iteration?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. Done.

}
masterPos, err := mysql.DecodePosition(posStr)
if err != nil {
Expand All @@ -241,6 +246,9 @@ func (agent *ActionAgent) startReplication(ctx context.Context, pos mysql.Positi

if !pos.Equal(masterPos) {
for {
if err := ctx.Err(); err != nil {
return err
}
status, err := agent.MysqlDaemon.SlaveStatus()
if err != nil {
return vterrors.Wrap(err, "can't get slave status")
Expand All @@ -249,6 +257,7 @@ func (agent *ActionAgent) startReplication(ctx context.Context, pos mysql.Positi
if !newPos.Equal(pos) {
break
}
time.Sleep(1 * time.Second)
}
}

Expand Down
166 changes: 165 additions & 1 deletion go/vt/wrangler/testlib/backup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ func TestBackupRestore(t *testing.T) {
},
}

// start master so that slave can fetch master position from it
// start master so that replica can fetch master position from it
master.StartActionLoop(t, wr)
defer master.StopActionLoop(t)

Expand Down Expand Up @@ -210,3 +210,167 @@ func TestBackupRestore(t *testing.T) {
}

}

// TestRestoreUnreachableMaster verifies that a replica can still be restored
// from a backup when the master tablet recorded for the shard is unreachable.
//
// The test takes a backup from a source replica while the master's action loop
// is running, stops the master's action loop, and then restores a destination
// replica. RestoreData is expected to return no error, and the destination's
// fake mysqld is expected to have executed the full replication setup query
// list and to report Replicating and Running.
func TestRestoreUnreachableMaster(t *testing.T) {
	// Initialize our environment
	ctx := context.Background()
	db := fakesqldb.New(t)
	defer db.Close()
	ts := memorytopo.NewServer("cell1")
	wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient())
	vp := NewVtctlPipe(t, ts)
	defer vp.Close()

	// Set up mock query results.
	db.AddQuery("CREATE DATABASE IF NOT EXISTS _vt", &sqltypes.Result{})
	db.AddQuery("BEGIN", &sqltypes.Result{})
	db.AddQuery("COMMIT", &sqltypes.Result{})
	db.AddQueryPattern(`SET @@session\.sql_log_bin = .*`, &sqltypes.Result{})
	db.AddQueryPattern(`CREATE TABLE IF NOT EXISTS _vt\.shard_metadata .*`, &sqltypes.Result{})
	db.AddQueryPattern(`CREATE TABLE IF NOT EXISTS _vt\.local_metadata .*`, &sqltypes.Result{})
	db.AddQueryPattern(`ALTER TABLE _vt\.local_metadata .*`, &sqltypes.Result{})
	db.AddQueryPattern(`ALTER TABLE _vt\.shard_metadata .*`, &sqltypes.Result{})
	db.AddQueryPattern(`UPDATE _vt\.local_metadata SET db_name=.*`, &sqltypes.Result{})
	db.AddQueryPattern(`UPDATE _vt\.shard_metadata SET db_name=.*`, &sqltypes.Result{})
	db.AddQueryPattern(`INSERT INTO _vt\.local_metadata .*`, &sqltypes.Result{})

	// Initialize our temp dirs
	root, err := ioutil.TempDir("", "backuptest")
	if err != nil {
		t.Fatalf("ioutil.TempDir failed: %v", err)
	}
	defer os.RemoveAll(root)

	// Initialize BackupStorage.
	// NOTE: these are package-level settings and are not restored when the
	// test returns.
	fbsRoot := path.Join(root, "fbs")
	*filebackupstorage.FileBackupStorageRoot = fbsRoot
	*backupstorage.BackupStorageImplementation = "file"

	// Initialize the fake mysql root directories
	sourceInnodbDataDir := path.Join(root, "source_innodb_data")
	sourceInnodbLogDir := path.Join(root, "source_innodb_log")
	sourceDataDir := path.Join(root, "source_data")
	sourceDataDbDir := path.Join(sourceDataDir, "vt_db")
	for _, s := range []string{sourceInnodbDataDir, sourceInnodbLogDir, sourceDataDbDir} {
		if err := os.MkdirAll(s, os.ModePerm); err != nil {
			t.Fatalf("failed to create directory %v: %v", s, err)
		}
	}

	// Populate each directory with one small file so the backup has content.
	fixtures := []struct {
		dir      string
		name     string
		contents string
	}{
		{sourceInnodbDataDir, "innodb_data_1", "innodb data 1 contents"},
		{sourceInnodbLogDir, "innodb_log_1", "innodb log 1 contents"},
		{sourceDataDbDir, "db.opt", "db opt file"},
	}
	for _, f := range fixtures {
		if err := ioutil.WriteFile(path.Join(f.dir, f.name), []byte(f.contents), os.ModePerm); err != nil {
			t.Fatalf("failed to write file %v: %v", f.name, err)
		}
	}

	// replicationPos builds the position used by every tablet in this test.
	// A fresh value is returned on each call so that the tablets do not share
	// the underlying GTID set.
	replicationPos := func() mysql.Position {
		return mysql.Position{
			GTIDSet: mysql.MariadbGTIDSet{
				mysql.MariadbGTID{
					Domain:   2,
					Server:   123,
					Sequence: 457,
				},
			},
		}
	}

	// create a master tablet, set its master position
	master := NewFakeTablet(t, wr, "cell1", 0, topodatapb.TabletType_MASTER, db)
	master.FakeMysqlDaemon.ReadOnly = false
	master.FakeMysqlDaemon.Replicating = false
	master.FakeMysqlDaemon.CurrentMasterPosition = replicationPos()

	// start master so that replica can fetch master position from it
	master.StartActionLoop(t, wr)
	// The master is stopped explicitly further down as part of the scenario.
	// This guard only stops it when the test bails out before reaching that
	// point, so the action loop is not left running on early failure.
	masterStopped := false
	defer func() {
		if !masterStopped {
			master.StopActionLoop(t)
		}
	}()

	// create a single tablet, set it up so we can do backups
	// set its position same as that of master so that backup doesn't wait for catchup
	sourceTablet := NewFakeTablet(t, wr, "cell1", 1, topodatapb.TabletType_REPLICA, db)
	sourceTablet.FakeMysqlDaemon.ReadOnly = true
	sourceTablet.FakeMysqlDaemon.Replicating = true
	sourceTablet.FakeMysqlDaemon.CurrentMasterPosition = replicationPos()
	sourceTablet.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
		"STOP SLAVE",
		"START SLAVE",
	}
	sourceTablet.StartActionLoop(t, wr)
	defer sourceTablet.StopActionLoop(t)

	sourceTablet.Agent.Cnf = &mysqlctl.Mycnf{
		DataDir:               sourceDataDir,
		InnodbDataHomeDir:     sourceInnodbDataDir,
		InnodbLogGroupHomeDir: sourceInnodbLogDir,
	}

	// run the backup
	if err := vp.Run([]string{"Backup", topoproto.TabletAliasString(sourceTablet.Tablet.Alias)}); err != nil {
		t.Fatalf("Backup failed: %v", err)
	}

	// create a destination tablet, set it up so we can do restores
	destTablet := NewFakeTablet(t, wr, "cell1", 2, topodatapb.TabletType_REPLICA, db)
	destTablet.FakeMysqlDaemon.ReadOnly = true
	destTablet.FakeMysqlDaemon.Replicating = true
	destTablet.FakeMysqlDaemon.CurrentMasterPosition = replicationPos()
	destTablet.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
		"STOP SLAVE",
		"RESET SLAVE ALL",
		"FAKE SET SLAVE POSITION",
		"FAKE SET MASTER",
		"START SLAVE",
	}
	destTablet.FakeMysqlDaemon.FetchSuperQueryMap = map[string]*sqltypes.Result{
		"SHOW DATABASES": {},
	}
	destTablet.FakeMysqlDaemon.SetSlavePositionPos = sourceTablet.FakeMysqlDaemon.CurrentMasterPosition
	destTablet.FakeMysqlDaemon.SetMasterInput = topoproto.MysqlAddr(master.Tablet)

	destTablet.StartActionLoop(t, wr)
	defer destTablet.StopActionLoop(t)

	destTablet.Agent.Cnf = &mysqlctl.Mycnf{
		DataDir:               sourceDataDir,
		InnodbDataHomeDir:     sourceInnodbDataDir,
		InnodbLogGroupHomeDir: sourceInnodbLogDir,
		BinLogPath:            path.Join(root, "bin-logs/filename_prefix"),
		RelayLogPath:          path.Join(root, "relay-logs/filename_prefix"),
		RelayLogIndexPath:     path.Join(root, "relay-log.index"),
		RelayLogInfoPath:      path.Join(root, "relay-log.info"),
	}

	// stop master so that it is unreachable
	// Mark it stopped first so the deferred guard never stops it a second time.
	masterStopped = true
	master.StopActionLoop(t)

	// Restore should still succeed
	if err := destTablet.Agent.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */); err != nil {
		t.Fatalf("RestoreData failed: %v", err)
	}

	// verify the full status
	if err := destTablet.FakeMysqlDaemon.CheckSuperQueryList(); err != nil {
		t.Errorf("destTablet.FakeMysqlDaemon.CheckSuperQueryList failed: %v", err)
	}
	if !destTablet.FakeMysqlDaemon.Replicating {
		t.Errorf("destTablet.FakeMysqlDaemon.Replicating not set")
	}
	if !destTablet.FakeMysqlDaemon.Running {
		t.Errorf("destTablet.FakeMysqlDaemon.Running not set")
	}
}