From 8183eb006468ab12adf33c329c74a05078952d01 Mon Sep 17 00:00:00 2001 From: deepthi Date: Wed, 23 Sep 2020 15:42:41 -0700 Subject: [PATCH 1/2] orc: use contexts with timeout for remote operations Signed-off-by: deepthi --- .../inst/instance_topology_dao.go | 3 +- go/vt/orchestrator/inst/tablet_dao.go | 12 ++++++-- go/vt/orchestrator/logic/tablet_discovery.go | 28 +++++++++++++------ 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/go/vt/orchestrator/inst/instance_topology_dao.go b/go/vt/orchestrator/inst/instance_topology_dao.go index b5d93d900eb..98828c89774 100644 --- a/go/vt/orchestrator/inst/instance_topology_dao.go +++ b/go/vt/orchestrator/inst/instance_topology_dao.go @@ -806,7 +806,8 @@ func injectEmptyGTIDTransaction(instanceKey *InstanceKey, gtidEntry *OracleGtidS if err != nil { return err } - ctx := context.Background() + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() conn, err := db.Conn(ctx) if err != nil { return err diff --git a/go/vt/orchestrator/inst/tablet_dao.go b/go/vt/orchestrator/inst/tablet_dao.go index fc2959a003f..36973c3b642 100644 --- a/go/vt/orchestrator/inst/tablet_dao.go +++ b/go/vt/orchestrator/inst/tablet_dao.go @@ -52,7 +52,9 @@ func SwitchMaster(newMasterKey, oldMasterKey InstanceKey) error { log.Errorf("Unexpected: tablet type did not change to master: %v", newMasterTablet.Type) return nil } - _, err = TopoServ.UpdateShardFields(context.TODO(), newMasterTablet.Keyspace, newMasterTablet.Shard, func(si *topo.ShardInfo) error { + ctx, cancel := context.WithTimeout(context.Background(), *topo.RemoteOperationTimeout) + defer cancel() + _, err = TopoServ.UpdateShardFields(ctx, newMasterTablet.Keyspace, newMasterTablet.Shard, func(si *topo.ShardInfo) error { if proto.Equal(si.MasterAlias, newMasterTablet.Alias) && proto.Equal(si.MasterTermStartTime, newMasterTablet.MasterTermStartTime) { return topo.NewError(topo.NoUpdateNeeded, "") } @@ -92,10 +94,14 @@ func ChangeTabletType(instanceKey InstanceKey, tabletType topodatapb.TabletType) return nil, err } tmc := tmclient.NewTabletManagerClient() - if err := tmc.ChangeType(context.TODO(), tablet, tabletType); err != nil { + tmcCtx, tmcCancel := context.WithTimeout(context.Background(), *topo.RemoteOperationTimeout) + defer tmcCancel() + if err := tmc.ChangeType(tmcCtx, tablet, tabletType); err != nil { return nil, err } - ti, err := TopoServ.GetTablet(context.TODO(), tablet.Alias) + tsCtx, tsCancel := context.WithTimeout(context.Background(), *topo.RemoteOperationTimeout) + defer tsCancel() + ti, err := TopoServ.GetTablet(tsCtx, tablet.Alias) if err != nil { return nil, log.Errore(err) } diff --git a/go/vt/orchestrator/logic/tablet_discovery.go b/go/vt/orchestrator/logic/tablet_discovery.go index 0b1f6ddd3c5..1539d9ec3ef 100644 --- a/go/vt/orchestrator/logic/tablet_discovery.go +++ b/go/vt/orchestrator/logic/tablet_discovery.go @@ -67,26 +67,30 @@ func refreshTabletsUsing(loader func(instanceKey *inst.InstanceKey)) { if !IsLeaderOrActive() { return } - cells, err := ts.GetKnownCells(context.TODO()) + ctx, cancel := context.WithTimeout(context.Background(), *topo.RemoteOperationTimeout) + defer cancel() + cells, err := ts.GetKnownCells(ctx) if err != nil { log.Errore(err) return } + refreshCtx, refreshCancel := context.WithTimeout(context.Background(), *topo.RemoteOperationTimeout) + defer refreshCancel() var wg sync.WaitGroup for _, cell := range cells { wg.Add(1) go func(cell string) { defer wg.Done() - refreshTabletsInCell(cell, loader) + refreshTabletsInCell(refreshCtx, cell, loader) }(cell) } wg.Wait() } -func refreshTabletsInCell(cell string, loader func(instanceKey *inst.InstanceKey)) { +func refreshTabletsInCell(ctx context.Context, cell string, loader func(instanceKey *inst.InstanceKey)) { latestInstances := make(map[inst.InstanceKey]bool) - tablets, err := topotools.GetAllTablets(context.TODO(), ts, cell) + tablets, err := topotools.GetAllTablets(ctx, ts, cell) if err != nil { log.Errorf("Error fetching topo info for cell %v: %v", cell, err) return @@ -174,7 +178,7 @@ func LockShard(instanceKey inst.InstanceKey) (func(*error), error) { if err != nil { return nil, err } - ctx, cancel := context.WithTimeout(context.TODO(), 1*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) defer cancel() _, unlock, err := ts.LockShard(ctx, tablet.Keyspace, tablet.Shard, "Orc Recovery") return unlock, err @@ -186,7 +190,9 @@ func TabletRefresh(instanceKey inst.InstanceKey) (*topodatapb.Tablet, error) { if err != nil { return nil, err } - ti, err := ts.GetTablet(context.TODO(), tablet.Alias) + ctx, cancel := context.WithTimeout(context.Background(), *topo.RemoteOperationTimeout) + defer cancel() + ti, err := ts.GetTablet(ctx, tablet.Alias) if err != nil { return nil, err } @@ -217,7 +223,7 @@ func tabletDemoteMaster(instanceKey inst.InstanceKey, forward bool) error { tmc := tmclient.NewTabletManagerClient() // TODO(sougou): this should be controllable because we may want // to give a longer timeout for a graceful takeover. - ctx, cancel := context.WithTimeout(context.TODO(), 1*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) defer cancel() if forward { _, err = tmc.DemoteMaster(ctx, tablet) @@ -232,14 +238,18 @@ func ShardMaster(instanceKey *inst.InstanceKey) (masterKey *inst.InstanceKey, er if err != nil { return nil, err } - si, err := ts.GetShard(context.TODO(), tablet.Keyspace, tablet.Shard) + sCtx, sCancel := context.WithTimeout(context.Background(), *topo.RemoteOperationTimeout) + defer sCancel() + si, err := ts.GetShard(sCtx, tablet.Keyspace, tablet.Shard) if err != nil { return nil, err } if !si.HasMaster() { return nil, fmt.Errorf("no master tablet for shard %v/%v", tablet.Keyspace, tablet.Shard) } - master, err := ts.GetTablet(context.TODO(), si.MasterAlias) + tCtx, tCancel := context.WithTimeout(context.Background(), *topo.RemoteOperationTimeout) + defer tCancel() + master, err := ts.GetTablet(tCtx, si.MasterAlias) if err != nil { return nil, err } From aed9966b8ea4282d75f4c5509a518fea30783e72 Mon Sep 17 00:00:00 2001 From: deepthi Date: Thu, 24 Sep 2020 17:01:56 -0700 Subject: [PATCH 2/2] orc: add context timeouts to config Signed-off-by: deepthi --- go/vt/orchestrator/config/config.go | 4 ++++ go/vt/orchestrator/inst/instance_topology_dao.go | 2 +- go/vt/orchestrator/logic/tablet_discovery.go | 4 +++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/go/vt/orchestrator/config/config.go b/go/vt/orchestrator/config/config.go index 7d8c861ce87..360245a0ad1 100644 --- a/go/vt/orchestrator/config/config.go +++ b/go/vt/orchestrator/config/config.go @@ -253,6 +253,8 @@ type Configuration struct { KVClusterMasterPrefix string // Prefix to use for clusters' masters entries in KV stores (internal, consul, ZK), default: "mysql/master" WebMessage string // If provided, will be shown on all web pages below the title bar MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas + InstanceDBExecContextTimeoutSeconds int // Timeout on context used while calling ExecContext on instance database + LockShardTimeoutSeconds int // Timeout on context used to lock shard. Should be a small value because we should fail-fast } // ToJSONString will marshal this configuration as JSON @@ -421,6 +423,8 @@ func newConfiguration() *Configuration { KVClusterMasterPrefix: "mysql/master", WebMessage: "", MaxConcurrentReplicaOperations: 5, + InstanceDBExecContextTimeoutSeconds: 30, + LockShardTimeoutSeconds: 1, } } diff --git a/go/vt/orchestrator/inst/instance_topology_dao.go b/go/vt/orchestrator/inst/instance_topology_dao.go index 98828c89774..7b20bbfc9b1 100644 --- a/go/vt/orchestrator/inst/instance_topology_dao.go +++ b/go/vt/orchestrator/inst/instance_topology_dao.go @@ -806,7 +806,7 @@ func injectEmptyGTIDTransaction(instanceKey *InstanceKey, gtidEntry *OracleGtidS if err != nil { return err } - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), time.Duration(config.Config.InstanceDBExecContextTimeoutSeconds)*time.Second) defer cancel() conn, err := db.Conn(ctx) if err != nil { diff --git a/go/vt/orchestrator/logic/tablet_discovery.go b/go/vt/orchestrator/logic/tablet_discovery.go index 1539d9ec3ef..6a44273aab8 100644 --- a/go/vt/orchestrator/logic/tablet_discovery.go +++ b/go/vt/orchestrator/logic/tablet_discovery.go @@ -23,6 +23,8 @@ import ( "sync" "time" + "vitess.io/vitess/go/vt/orchestrator/config" + "github.com/golang/protobuf/proto" "vitess.io/vitess/go/vt/orchestrator/db" "vitess.io/vitess/go/vt/orchestrator/external/golib/log" @@ -178,7 +180,7 @@ func LockShard(instanceKey inst.InstanceKey) (func(*error), error) { if err != nil { return nil, err } - ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), time.Duration(config.Config.LockShardTimeoutSeconds)*time.Second) defer cancel() _, unlock, err := ts.LockShard(ctx, tablet.Keyspace, tablet.Shard, "Orc Recovery") return unlock, err