From 353475b018758a3c759c9992111e8cdeaac5fe98 Mon Sep 17 00:00:00 2001 From: Maggie Zhou Date: Tue, 19 Jun 2018 22:58:36 -0700 Subject: [PATCH 1/6] Change approach entirely: During repairReplication, lock the shard in the topo as it's locked for reparenting events / delete events. Also during repairReplication, check to see if Orchestrator is doing any reparenting actively, and if so, do not repairReplication. Before repairingReplication, use the _orchestrator_ side lock of BeginMaintenance to indicate that orchestrator should not begin any reparenting. Signed-off-by: Maggie Zhou --- go/vt/vttablet/tabletmanager/orchestrator.go | 25 +++++++++++++++ .../tabletmanager/replication_reporter.go | 19 +++++++++++ .../vttablet/tabletmanager/rpc_replication.go | 32 +++++++++++-------- 3 files changed, 63 insertions(+), 13 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/orchestrator.go b/go/vt/vttablet/tabletmanager/orchestrator.go index 3d9a8a04910..1986d44b627 100644 --- a/go/vt/vttablet/tabletmanager/orchestrator.go +++ b/go/vt/vttablet/tabletmanager/orchestrator.go @@ -17,6 +17,7 @@ limitations under the License. package tabletmanager import ( + "encoding/json" "flag" "fmt" "io/ioutil" @@ -128,6 +129,30 @@ func (orc *orcClient) EndMaintenance(tablet *topodatapb.Tablet) error { return err } +func (orc *orcClient) InActiveShardRecovery(tablet *topodatapb.Tablet) (bool, error) { + alias := fmt.Sprintf("%v.%v", tablet.GetKeyspace(), tablet.GetShard()) + + // TODO(zmagg): Replace this with simpler call to active-cluster-recovery + // when call with alias parameter is supported. + resp, err := orc.apiGet("audit-recovery", "alias", alias) + + if err != nil { + return false, err + } + + var r []map[string]interface{} + + if err := json.Unmarshal(resp, &r); err != nil { + return false, err + } + + active, ok := r[0]["IsActive"].(bool) + if !ok { + return false, fmt.Errorf("Error parsing JSON response from Orchestrator") + } + return active, nil +} + func mysqlHostPort(tablet *topodatapb.Tablet) (host, port string, err error) { mysqlPort := int(topoproto.MysqlPort(tablet)) if mysqlPort == 0 { diff --git a/go/vt/vttablet/tabletmanager/replication_reporter.go b/go/vt/vttablet/tabletmanager/replication_reporter.go index 753507c9c89..a8445fcb8a8 100644 --- a/go/vt/vttablet/tabletmanager/replication_reporter.go +++ b/go/vt/vttablet/tabletmanager/replication_reporter.go @@ -115,6 +115,7 @@ func repairReplication(ctx context.Context, agent *ActionAgent) error { ts := agent.TopoServer tablet := agent.Tablet() + si, err := ts.GetShard(ctx, tablet.Keyspace, tablet.Shard) if err != nil { return err @@ -122,6 +123,24 @@ func repairReplication(ctx context.Context, agent *ActionAgent) error { if !si.HasMaster() { return fmt.Errorf("no master tablet for shard %v/%v", tablet.Keyspace, tablet.Shard) } + + // If Orchestrator is configured and if Orchestrator is actively reparenting, we should not repairReplication + if agent.orc != nil { + re, err := agent.orc.InActiveShardRecovery(tablet) + if err != nil { + return err + } + if re { + return fmt.Errorf("Orchestrator actively reparenting shard %v, skipping repairReplication", si) + } + + // Before repairing replication, tell Orchestrator to enter maintenance mode for this tablet and to + // lock any other actions on this tablet by Orchestrator. + if err := agent.orc.BeginMaintenance(agent.Tablet(), "vttablet has been told to StopSlave"); err != nil { + log.Warningf("Orchestrator BeginMaintenance failed: %v", err) + } + } + return agent.setMasterLocked(ctx, si.MasterAlias, 0, true) } diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index c4b109d776c..0403793cce0 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -404,26 +404,24 @@ func (agent *ActionAgent) SetMaster(ctx context.Context, parentAlias *topodatapb return agent.setMasterLocked(ctx, parentAlias, timeCreatedNS, forceStartSlave) } -func (agent *ActionAgent) setMasterLocked(ctx context.Context, parentAlias *topodatapb.TabletAlias, timeCreatedNS int64, forceStartSlave bool) error { +func (agent *ActionAgent) setMasterLocked(ctx context.Context, parentAlias *topodatapb.TabletAlias, timeCreatedNS int64, forceStartSlave bool) (err error) { parent, err := agent.TopoServer.GetTablet(ctx, parentAlias) if err != nil { return err } - // If this tablet used to be a master, end orchestrator maintenance after we are connected to the new master. + // End orchestrator maintenance at the end of fixing replication. // This is a best effort operation, so it should happen in a goroutine - if agent.Tablet().Type == topodatapb.TabletType_MASTER { - defer func() { - go func() { - if agent.orc == nil { - return - } - if err := agent.orc.EndMaintenance(agent.Tablet()); err != nil { - log.Warningf("Orchestrator EndMaintenance failed: %v", err) - } - }() + defer func() { + go func() { + if agent.orc == nil { + return + } + if err := agent.orc.EndMaintenance(agent.Tablet()); err != nil { + log.Warningf("Orchestrator EndMaintenance failed: %v", err) + } }() - } + }() // See if we were replicating at all, and should be replicating wasReplicating := false @@ -437,6 +435,14 @@ func (agent *ActionAgent) setMasterLocked(ctx context.Context, parentAlias *topo shouldbeReplicating = true } + // Lock the shard before doing any replication repair work. + ctx, unlock, lockErr := agent.TopoServer.LockShard(ctx, parent.Tablet.GetKeyspace(), parent.Tablet.GetShard(), fmt.Sprintf("repairReplication to %v as parent)", topoproto.TabletAliasString(parentAlias))) + if lockErr != nil { + return lockErr + } + + defer unlock(&err) + // If using semi-sync, we need to enable it before connecting to master. if *enableSemiSync { tt := agent.Tablet().Type From a1826fd288184a8b9620943e233b9586750cfbdd Mon Sep 17 00:00:00 2001 From: Maggie Zhou Date: Tue, 19 Jun 2018 23:03:14 -0700 Subject: [PATCH 2/6] Remove the named return. Signed-off-by: Maggie Zhou --- go/vt/vttablet/tabletmanager/rpc_replication.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 0403793cce0..9bc68ac2b47 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -404,7 +404,7 @@ func (agent *ActionAgent) SetMaster(ctx context.Context, parentAlias *topodatapb return agent.setMasterLocked(ctx, parentAlias, timeCreatedNS, forceStartSlave) } -func (agent *ActionAgent) setMasterLocked(ctx context.Context, parentAlias *topodatapb.TabletAlias, timeCreatedNS int64, forceStartSlave bool) (err error) { +func (agent *ActionAgent) setMasterLocked(ctx context.Context, parentAlias *topodatapb.TabletAlias, timeCreatedNS int64, forceStartSlave bool) error { parent, err := agent.TopoServer.GetTablet(ctx, parentAlias) if err != nil { return err From 643f1477e51987d88844985aabaeb3ee04f7d3e1 Mon Sep 17 00:00:00 2001 From: Maggie Zhou Date: Tue, 19 Jun 2018 23:40:08 -0700 Subject: [PATCH 3/6] If BeginMaintenance fails we should skip repairReplication Signed-off-by: Maggie Zhou --- go/vt/vttablet/tabletmanager/replication_reporter.go | 1 + 1 file changed, 1 insertion(+) diff --git a/go/vt/vttablet/tabletmanager/replication_reporter.go b/go/vt/vttablet/tabletmanager/replication_reporter.go index a8445fcb8a8..15c0ee67cb2 100644 --- a/go/vt/vttablet/tabletmanager/replication_reporter.go +++ b/go/vt/vttablet/tabletmanager/replication_reporter.go @@ -138,6 +138,7 @@ func repairReplication(ctx context.Context, agent *ActionAgent) error { // lock any other actions on this tablet by Orchestrator. if err := agent.orc.BeginMaintenance(agent.Tablet(), "vttablet has been told to StopSlave"); err != nil { log.Warningf("Orchestrator BeginMaintenance failed: %v", err) + return fmt.Errorf("Orchestrator BeginMaintenance failed :%v, skipping repairReplication", err) } } From b79b1dde57c1ecf686c4bfff1d361af7c8a418f8 Mon Sep 17 00:00:00 2001 From: Maggie Zhou Date: Thu, 28 Jun 2018 17:27:25 -0700 Subject: [PATCH 4/6] Fix tests: Only acquire the shard lock if it isn't already acquired in another context (planned reparent / emergency reparent). Signed-off-by: Maggie Zhou --- go/vt/vttablet/tabletmanager/rpc_replication.go | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 9bc68ac2b47..6fd1e3ed9e5 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -404,7 +404,7 @@ func (agent *ActionAgent) SetMaster(ctx context.Context, parentAlias *topodatapb return agent.setMasterLocked(ctx, parentAlias, timeCreatedNS, forceStartSlave) } -func (agent *ActionAgent) setMasterLocked(ctx context.Context, parentAlias *topodatapb.TabletAlias, timeCreatedNS int64, forceStartSlave bool) error { +func (agent *ActionAgent) setMasterLocked(ctx context.Context, parentAlias *topodatapb.TabletAlias, timeCreatedNS int64, forceStartSlave bool) (err error) { parent, err := agent.TopoServer.GetTablet(ctx, parentAlias) if err != nil { return err @@ -435,14 +435,15 @@ func (agent *ActionAgent) setMasterLocked(ctx context.Context, parentAlias *topo shouldbeReplicating = true } - // Lock the shard before doing any replication repair work. - ctx, unlock, lockErr := agent.TopoServer.LockShard(ctx, parent.Tablet.GetKeyspace(), parent.Tablet.GetShard(), fmt.Sprintf("repairReplication to %v as parent)", topoproto.TabletAliasString(parentAlias))) - if lockErr != nil { - return lockErr - } - - defer unlock(&err) + if topo.CheckShardLocked(ctx, parent.Tablet.GetKeyspace(), parent.Tablet.GetShard()); err != nil { + // Lock the shard before doing any replication repair work if the shard is not already locked. + _, unlock, lockErr := agent.TopoServer.LockShard(ctx, parent.Tablet.GetKeyspace(), parent.Tablet.GetShard(), fmt.Sprintf("repairReplication to %v as parent)", topoproto.TabletAliasString(parentAlias))) + if lockErr != nil { + return lockErr + } + defer unlock(&err) + } // If using semi-sync, we need to enable it before connecting to master. if *enableSemiSync { tt := agent.Tablet().Type From a4e74342e039d283c3792ca88c577dfacd232406 Mon Sep 17 00:00:00 2001 From: Maggie Zhou Date: Fri, 29 Jun 2018 13:44:19 -0700 Subject: [PATCH 5/6] Refactor things so that only repairReplication based setMaster paths try to acquire the shard lock, as reparenting codepaths already have the shardlock. Signed-off-by: Maggie Zhou --- .../tabletmanager/replication_reporter.go | 2 +- .../vttablet/tabletmanager/rpc_replication.go | 25 ++++++++++++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/replication_reporter.go b/go/vt/vttablet/tabletmanager/replication_reporter.go index 15c0ee67cb2..8115db0710c 100644 --- a/go/vt/vttablet/tabletmanager/replication_reporter.go +++ b/go/vt/vttablet/tabletmanager/replication_reporter.go @@ -142,7 +142,7 @@ func repairReplication(ctx context.Context, agent *ActionAgent) error { } } - return agent.setMasterLocked(ctx, si.MasterAlias, 0, true) + return agent.setMasterRepairReplication(ctx, si.MasterAlias, 0, true) } func registerReplicationReporter(agent *ActionAgent) { diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 6fd1e3ed9e5..5cb944fd64b 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -404,6 +404,22 @@ func (agent *ActionAgent) SetMaster(ctx context.Context, parentAlias *topodatapb return agent.setMasterLocked(ctx, parentAlias, timeCreatedNS, forceStartSlave) } +func (agent *ActionAgent) setMasterRepairReplication(ctx context.Context, parentAlias *topodatapb.TabletAlias, timeCreatedNS int64, forceStartSlave bool) (err error) { + parent, err := agent.TopoServer.GetTablet(ctx, parentAlias) + if err != nil { + return err + } + + ctx, unlock, lockErr := agent.TopoServer.LockShard(ctx, parent.Tablet.GetKeyspace(), parent.Tablet.GetShard(), fmt.Sprintf("repairReplication to %v as parent)", topoproto.TabletAliasString(parentAlias))) + if lockErr != nil { + return lockErr + } + + defer unlock(&err) + + return agent.setMasterLocked(ctx, parentAlias, timeCreatedNS, forceStartSlave) +} + func (agent *ActionAgent) setMasterLocked(ctx context.Context, parentAlias *topodatapb.TabletAlias, timeCreatedNS int64, forceStartSlave bool) (err error) { parent, err := agent.TopoServer.GetTablet(ctx, parentAlias) if err != nil { @@ -435,15 +451,6 @@ func (agent *ActionAgent) setMasterLocked(ctx context.Context, parentAlias *topo shouldbeReplicating = true } - if topo.CheckShardLocked(ctx, parent.Tablet.GetKeyspace(), parent.Tablet.GetShard()); err != nil { - // Lock the shard before doing any replication repair work if the shard is not already locked. - _, unlock, lockErr := agent.TopoServer.LockShard(ctx, parent.Tablet.GetKeyspace(), parent.Tablet.GetShard(), fmt.Sprintf("repairReplication to %v as parent)", topoproto.TabletAliasString(parentAlias))) - if lockErr != nil { - return lockErr - } - - defer unlock(&err) - } // If using semi-sync, we need to enable it before connecting to master. if *enableSemiSync { tt := agent.Tablet().Type From f3ebce3881c618a3cf736e0b2c46ad0aca790f1b Mon Sep 17 00:00:00 2001 From: Maggie Zhou Date: Fri, 29 Jun 2018 16:58:18 -0700 Subject: [PATCH 6/6] Handle the case where Orchestrator returns an empty message to audit-recovery Signed-off-by: Maggie Zhou --- go/vt/vttablet/tabletmanager/orchestrator.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/go/vt/vttablet/tabletmanager/orchestrator.go b/go/vt/vttablet/tabletmanager/orchestrator.go index 1986d44b627..c1266d18ddd 100644 --- a/go/vt/vttablet/tabletmanager/orchestrator.go +++ b/go/vt/vttablet/tabletmanager/orchestrator.go @@ -146,7 +146,12 @@ func (orc *orcClient) InActiveShardRecovery(tablet *topodatapb.Tablet) (bool, er return false, err } + if len(r) == 0 { + return false, fmt.Errorf("Orchestrator returned an empty audit-recovery response") + } + active, ok := r[0]["IsActive"].(bool) + if !ok { return false, fmt.Errorf("Error parsing JSON response from Orchestrator") }