From c6f64d00d4621f8d1e4763f1638c1295d7069483 Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Sun, 21 Jun 2020 17:47:40 -0700 Subject: [PATCH 01/19] vttablet: fail if schema engine cannot open Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/tabletserver/tabletserver.go | 2 +- go/vt/vttablet/tabletserver/tabletserver_test.go | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/go/vt/vttablet/tabletserver/tabletserver.go b/go/vt/vttablet/tabletserver/tabletserver.go index 41dcfd8b4e4..4add08257a7 100644 --- a/go/vt/vttablet/tabletserver/tabletserver.go +++ b/go/vt/vttablet/tabletserver/tabletserver.go @@ -529,7 +529,7 @@ func (tsv *TabletServer) fullStart() (err error) { c.Close() if err := tsv.se.Open(); err != nil { - log.Errorf("Could not load historian, but starting the query service anyways: %v", err) + return err } if err := tsv.qe.Open(); err != nil { return err diff --git a/go/vt/vttablet/tabletserver/tabletserver_test.go b/go/vt/vttablet/tabletserver/tabletserver_test.go index 4ef64959352..4e5eb7eb523 100644 --- a/go/vt/vttablet/tabletserver/tabletserver_test.go +++ b/go/vt/vttablet/tabletserver/tabletserver_test.go @@ -297,10 +297,7 @@ func TestTabletServerSingleSchemaFailure(t *testing.T) { dbcfgs := newDBConfigs(db) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} err := tsv.StartService(target, dbcfgs) - defer tsv.StopService() - if err != nil { - t.Fatalf("TabletServer should successfully start even if a table's schema is unloadable, but got error: %v", err) - } + assert.Error(t, err) } func TestTabletServerCheckMysql(t *testing.T) { From 7d8eccba3bedf7b955a31577ea3a1718141a5d9c Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Sun, 21 Jun 2020 17:49:00 -0700 Subject: [PATCH 02/19] vttablet: standardize heartbeat initialization Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/heartbeat/reader.go | 7 ++--- go/vt/vttablet/heartbeat/writer.go | 33 ++++++++------------- 
go/vt/vttablet/tabletserver/tabletserver.go | 14 ++++----- 3 files changed, 20 insertions(+), 34 deletions(-) diff --git a/go/vt/vttablet/heartbeat/reader.go b/go/vt/vttablet/heartbeat/reader.go index e7a492a30a8..e6b4afcb096 100644 --- a/go/vt/vttablet/heartbeat/reader.go +++ b/go/vt/vttablet/heartbeat/reader.go @@ -89,11 +89,8 @@ func NewReader(env tabletenv.Env) *Reader { } } -// Init does last minute initialization of db settings, such as keyspaceShard. -func (r *Reader) Init(target querypb.Target) { - if !r.enabled { - return - } +// InitDBConfig initializes the target name for the Reader. +func (r *Reader) InitDBConfig(target querypb.Target) { r.keyspaceShard = fmt.Sprintf("%s:%s", target.Keyspace, target.Shard) } diff --git a/go/vt/vttablet/heartbeat/writer.go b/go/vt/vttablet/heartbeat/writer.go index 8363d099650..74f1db56a7a 100644 --- a/go/vt/vttablet/heartbeat/writer.go +++ b/go/vt/vttablet/heartbeat/writer.go @@ -90,44 +90,35 @@ func NewWriter(env tabletenv.Env, alias topodatapb.TabletAlias) *Writer { } } -// Init runs at tablet startup and last minute initialization of db settings, and -// creates the necessary tables for heartbeat. -func (w *Writer) Init(target querypb.Target) error { - if !w.enabled { - return nil - } - w.mu.Lock() - defer w.mu.Unlock() - log.Info("Initializing heartbeat table.") +// InitDBConfig initializes the target name for the Writer. +func (w *Writer) InitDBConfig(target querypb.Target) { w.keyspaceShard = fmt.Sprintf("%s:%s", target.Keyspace, target.Shard) - - if target.TabletType == topodatapb.TabletType_MASTER { - err := w.initializeTables(w.env.Config().DB.AppWithDB()) - if err != nil { - w.recordError(err) - return err - } - } - return nil } // Open sets up the Writer's db connection and launches the ticker // responsible for periodically writing to the heartbeat table. // Open may be called multiple times, as long as it was closed since // last invocation. 
-func (w *Writer) Open() { +func (w *Writer) Open() error { if !w.enabled { - return + return nil } w.mu.Lock() defer w.mu.Unlock() if w.isOpen { - return + return nil } + + if err := w.initializeTables(w.env.Config().DB.AppWithDB()); err != nil { + w.recordError(err) + return err + } + log.Info("Beginning heartbeat writes") w.pool.Open(w.env.Config().DB.AppWithDB(), w.env.Config().DB.DbaWithDB(), w.env.Config().DB.AppDebugWithDB()) w.ticks.Start(func() { w.writeHeartbeat() }) w.isOpen = true + return nil } // Close closes the Writer's db connection and stops the periodic ticker. A writer diff --git a/go/vt/vttablet/tabletserver/tabletserver.go b/go/vt/vttablet/tabletserver/tabletserver.go index 4add08257a7..d42e5457a46 100644 --- a/go/vt/vttablet/tabletserver/tabletserver.go +++ b/go/vt/vttablet/tabletserver/tabletserver.go @@ -369,6 +369,8 @@ func (tsv *TabletServer) InitDBConfig(target querypb.Target, dbcfgs *dbconfigs.D tsv.config.DB = dbcfgs tsv.se.InitDBConfig(tsv.config.DB.DbaWithDB()) + tsv.hw.InitDBConfig(target) + tsv.hr.InitDBConfig(target) return nil } @@ -413,13 +415,11 @@ func (tsv *TabletServer) InitACL(tableACLConfigFile string, enforceTableACLConfi // StartService is a convenience function for InitDBConfig->SetServingType // with serving=true. 
func (tsv *TabletServer) StartService(target querypb.Target, dbcfgs *dbconfigs.DBConfigs) (err error) { - // Save tablet type away to prevent data races - tabletType := target.TabletType err = tsv.InitDBConfig(target, dbcfgs) if err != nil { return err } - _ /* state changed */, err = tsv.SetServingType(tabletType, true, nil) + _ /* state changed */, err = tsv.SetServingType(target.TabletType, true, nil) return err } @@ -537,10 +537,6 @@ func (tsv *TabletServer) fullStart() (err error) { if err := tsv.te.Init(); err != nil { return err } - if err := tsv.hw.Init(tsv.target); err != nil { - return err - } - tsv.hr.Init(tsv.target) tsv.vstreamer.Open(tsv.target.Keyspace, tsv.alias.Cell) return tsv.serveNewType() } @@ -551,7 +547,9 @@ func (tsv *TabletServer) serveNewType() (err error) { tsv.hr.Close() tsv.te.AcceptReadWrite() - tsv.hw.Open() + if err := tsv.hw.Open(); err != nil { + return err + } tsv.tracker.Open() if err := tsv.txThrottler.Open(tsv.target.Keyspace, tsv.target.Shard); err != nil { return err From 7881fe13670247559a069d43de514b335b14d4b9 Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Sun, 21 Jun 2020 17:52:34 -0700 Subject: [PATCH 03/19] vttablet: standardize query_engine Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/tabletserver/query_engine.go | 6 ++++++ go/vt/vttablet/tabletserver/tabletserver.go | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/go/vt/vttablet/tabletserver/query_engine.go b/go/vt/vttablet/tabletserver/query_engine.go index 0c1842961ea..416782b5313 100644 --- a/go/vt/vttablet/tabletserver/query_engine.go +++ b/go/vt/vttablet/tabletserver/query_engine.go @@ -258,6 +258,12 @@ func (qe *QueryEngine) Open() error { return nil } +// StopServing kills all streaming queries. +// Other queries are handled by the tsv.requests Waitgroup. +func (qe *QueryEngine) StopServing() { + qe.streamQList.TerminateAll() +} + // Close must be called to shut down QueryEngine. 
// You must ensure that no more queries will be sent // before calling Close. diff --git a/go/vt/vttablet/tabletserver/tabletserver.go b/go/vt/vttablet/tabletserver/tabletserver.go index d42e5457a46..d1e75a1e505 100644 --- a/go/vt/vttablet/tabletserver/tabletserver.go +++ b/go/vt/vttablet/tabletserver/tabletserver.go @@ -609,7 +609,7 @@ func (tsv *TabletServer) waitForShutdown() { tsv.txThrottler.Close() tsv.watcher.Close() tsv.te.Close() - tsv.qe.streamQList.TerminateAll() + tsv.qe.StopServing() tsv.requests.Wait() } @@ -624,7 +624,7 @@ func (tsv *TabletServer) closeAll() { tsv.hr.Close() tsv.hw.Close() tsv.te.Close() - tsv.qe.streamQList.TerminateAll() + tsv.qe.StopServing() tsv.qe.Close() tsv.se.Close() tsv.transition(StateNotConnected) From eb93c7317c635ce18458774776ea8eb7d13eb3df Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Sun, 21 Jun 2020 18:07:50 -0700 Subject: [PATCH 04/19] vttablet: standardize txThrottler initialization Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/tabletserver/tabletserver.go | 35 ++++++++++--------- .../tabletserver/txthrottler/tx_throttler.go | 12 +++++-- .../txthrottler/tx_throttler_test.go | 12 +++++-- 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/go/vt/vttablet/tabletserver/tabletserver.go b/go/vt/vttablet/tabletserver/tabletserver.go index d1e75a1e505..3127b701827 100644 --- a/go/vt/vttablet/tabletserver/tabletserver.go +++ b/go/vt/vttablet/tabletserver/tabletserver.go @@ -161,14 +161,14 @@ type TabletServer struct { // These are sub-components of TabletServer. 
se *schema.Engine - qe *QueryEngine - te *TxEngine hw *heartbeat.Writer hr *heartbeat.Reader vstreamer *vstreamer.Engine tracker *schema.Tracker watcher *ReplicationWatcher + qe *QueryEngine txThrottler *txthrottler.TxThrottler + te *TxEngine messager *messager.Engine // streamHealthMutex protects all the following fields @@ -228,14 +228,14 @@ func NewTabletServer(name string, config *tabletenv.TabletConfig, topoServer *to // However, gracefulStop is slightly different because only // some services must be closed, while others should remain open. tsv.se = schema.NewEngine(tsv) - tsv.qe = NewQueryEngine(tsv, tsv.se) - tsv.te = NewTxEngine(tsv) tsv.hw = heartbeat.NewWriter(tsv, alias) tsv.hr = heartbeat.NewReader(tsv) tsv.vstreamer = vstreamer.NewEngine(tsv, srvTopoServer, tsv.se) tsv.tracker = schema.NewTracker(tsv, tsv.vstreamer, tsv.se) tsv.watcher = NewReplicationWatcher(tsv, tsv.vstreamer, tsv.config) + tsv.qe = NewQueryEngine(tsv, tsv.se) tsv.txThrottler = txthrottler.NewTxThrottler(tsv.config, topoServer) + tsv.te = NewTxEngine(tsv) tsv.messager = messager.NewEngine(tsv, tsv.se, tsv.vstreamer) tsv.exporter.NewGaugeFunc("TabletState", "Tablet server state", func() int64 { @@ -371,6 +371,7 @@ func (tsv *TabletServer) InitDBConfig(target querypb.Target, dbcfgs *dbconfigs.D tsv.se.InitDBConfig(tsv.config.DB.DbaWithDB()) tsv.hw.InitDBConfig(target) tsv.hr.InitDBConfig(target) + tsv.txThrottler.InitDBConfig(target) return nil } @@ -531,13 +532,16 @@ func (tsv *TabletServer) fullStart() (err error) { if err := tsv.se.Open(); err != nil { return err } + tsv.vstreamer.Open(tsv.target.Keyspace, tsv.alias.Cell) if err := tsv.qe.Open(); err != nil { return err } + if err := tsv.txThrottler.Open(); err != nil { + return err + } if err := tsv.te.Init(); err != nil { return err } - tsv.vstreamer.Open(tsv.target.Keyspace, tsv.alias.Cell) return tsv.serveNewType() } @@ -546,22 +550,19 @@ func (tsv *TabletServer) serveNewType() (err error) { tsv.watcher.Close() tsv.hr.Close() - 
tsv.te.AcceptReadWrite() if err := tsv.hw.Open(); err != nil { return err } tsv.tracker.Open() - if err := tsv.txThrottler.Open(tsv.target.Keyspace, tsv.target.Shard); err != nil { - return err - } + tsv.te.AcceptReadWrite() tsv.messager.Open() } else { tsv.messager.Close() + tsv.te.AcceptReadOnly() tsv.tracker.Close() tsv.hw.Close() tsv.se.MakeNonMaster() - tsv.te.AcceptReadOnly() tsv.hr.Open() tsv.watcher.Open() } @@ -594,11 +595,11 @@ func (tsv *TabletServer) StopService() { log.Info("Executing complete shutdown.") tsv.waitForShutdown() - tsv.tracker.Close() + tsv.qe.Close() + tsv.watcher.Close() tsv.vstreamer.Close() tsv.hr.Close() tsv.hw.Close() - tsv.qe.Close() tsv.se.Close() log.Info("Shutdown complete.") tsv.transition(StateNotConnected) @@ -606,9 +607,9 @@ func (tsv *TabletServer) StopService() { func (tsv *TabletServer) waitForShutdown() { tsv.messager.Close() - tsv.txThrottler.Close() - tsv.watcher.Close() tsv.te.Close() + tsv.txThrottler.Close() + tsv.tracker.Close() tsv.qe.StopServing() tsv.requests.Wait() } @@ -617,15 +618,15 @@ func (tsv *TabletServer) waitForShutdown() { // It forcibly shuts down everything. 
func (tsv *TabletServer) closeAll() { tsv.messager.Close() + tsv.te.Close() tsv.txThrottler.Close() + tsv.qe.StopServing() + tsv.qe.Close() tsv.watcher.Close() tsv.tracker.Close() tsv.vstreamer.Close() tsv.hr.Close() tsv.hw.Close() - tsv.te.Close() - tsv.qe.StopServing() - tsv.qe.Close() tsv.se.Close() tsv.transition(StateNotConnected) } diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go index 001435e92f5..3bcc88706b6 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go +++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go @@ -30,6 +30,7 @@ import ( "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" + querypb "vitess.io/vitess/go/vt/proto/query" throttlerdatapb "vitess.io/vitess/go/vt/proto/throttlerdata" topodatapb "vitess.io/vitess/go/vt/proto/topodata" ) @@ -69,6 +70,8 @@ type TxThrottler struct { // state holds an open transaction throttler state. It is nil // if the TransactionThrottler is closed. state *txThrottlerState + + target querypb.Target } // NewTxThrottler tries to construct a TxThrottler from the @@ -91,6 +94,11 @@ func NewTxThrottler(config *tabletenv.TabletConfig, topoServer *topo.Server) *Tx return txThrottler } +// InitDBConfig initializes the target parameters for the throttler. +func (t *TxThrottler) InitDBConfig(target querypb.Target) { + t.target = target +} + func tryCreateTxThrottler(config *tabletenv.TabletConfig, topoServer *topo.Server) (*TxThrottler, error) { if !config.EnableTxThrottler { return newTxThrottler(&txThrottlerConfig{enabled: false}) @@ -210,7 +218,7 @@ func newTxThrottler(config *txThrottlerConfig) (*TxThrottler, error) { } // Open opens the transaction throttler. It must be called prior to 'Throttle'. 
-func (t *TxThrottler) Open(keyspace, shard string) error { +func (t *TxThrottler) Open() error { if !t.config.enabled { return nil } @@ -218,7 +226,7 @@ func (t *TxThrottler) Open(keyspace, shard string) error { return fmt.Errorf("transaction throttler already opened") } var err error - t.state, err = newTxThrottlerState(t.config, keyspace, shard) + t.state, err = newTxThrottlerState(t.config, t.target.Keyspace, t.target.Shard) return err } diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go index 6fea2c65379..e0b3b6d85a7 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go +++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go @@ -39,7 +39,11 @@ func TestDisabledThrottler(t *testing.T) { config := tabletenv.NewDefaultConfig() config.EnableTxThrottler = false throttler := NewTxThrottler(config, nil) - if err := throttler.Open("keyspace", "shard"); err != nil { + throttler.InitDBConfig(querypb.Target{ + Keyspace: "keyspace", + Shard: "shard", + }) + if err := throttler.Open(); err != nil { t.Fatalf("want: nil, got: %v", err) } if result := throttler.Throttle(); result != false { @@ -117,7 +121,11 @@ func TestEnabledThrottler(t *testing.T) { if err != nil { t.Fatalf("want: nil, got: %v", err) } - if err := throttler.Open("keyspace", "shard"); err != nil { + throttler.InitDBConfig(querypb.Target{ + Keyspace: "keyspace", + Shard: "shard", + }) + if err := throttler.Open(); err != nil { t.Fatalf("want: nil, got: %v", err) } if result := throttler.Throttle(); result != false { From caae5ee18e18f4bf0f0a2c1c7a8a90a83ccdbee8 Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Sun, 21 Jun 2020 19:51:36 -0700 Subject: [PATCH 05/19] vttablet: standardize tx_engine initialization Signed-off-by: Sugu Sougoumarane --- .../tabletserver/query_executor_test.go | 1 - go/vt/vttablet/tabletserver/schema/engine.go | 2 + go/vt/vttablet/tabletserver/tabletserver.go | 3 
- .../tabletserver/tabletserver_test.go | 1 - go/vt/vttablet/tabletserver/twopc.go | 57 +++++++++---------- go/vt/vttablet/tabletserver/tx_engine.go | 20 +++---- .../tabletserver/vstreamer/vstreamer.go | 5 +- 7 files changed, 39 insertions(+), 50 deletions(-) diff --git a/go/vt/vttablet/tabletserver/query_executor_test.go b/go/vt/vttablet/tabletserver/query_executor_test.go index 5068eddfea2..35c3527fdbf 100644 --- a/go/vt/vttablet/tabletserver/query_executor_test.go +++ b/go/vt/vttablet/tabletserver/query_executor_test.go @@ -1158,7 +1158,6 @@ func getTestTableFields() []*querypb.Field { func getQueryExecutorSupportedQueries(testTableHasMultipleUniqueKeys bool) map[string]*sqltypes.Result { return map[string]*sqltypes.Result{ // queries for twopc - sqlTurnoffBinlog: {}, fmt.Sprintf(sqlCreateSidecarDB, "_vt"): {}, fmt.Sprintf(sqlDropLegacy1, "_vt"): {}, fmt.Sprintf(sqlDropLegacy2, "_vt"): {}, diff --git a/go/vt/vttablet/tabletserver/schema/engine.go b/go/vt/vttablet/tabletserver/schema/engine.go index 5a258dd3c63..d8de480677c 100644 --- a/go/vt/vttablet/tabletserver/schema/engine.go +++ b/go/vt/vttablet/tabletserver/schema/engine.go @@ -261,6 +261,8 @@ func (se *Engine) reload(ctx context.Context) error { tableName := row[0].ToString() curTables[tableName] = true createTime, _ := evalengine.ToInt64(row[2]) + // TODO(sougou): find a better way to detect changed tables. This method + // seems unreliable. The endtoend test flags all tables as changed. 
if _, ok := se.tables[tableName]; ok && createTime < se.lastChange { continue } diff --git a/go/vt/vttablet/tabletserver/tabletserver.go b/go/vt/vttablet/tabletserver/tabletserver.go index 3127b701827..ee0744a14d8 100644 --- a/go/vt/vttablet/tabletserver/tabletserver.go +++ b/go/vt/vttablet/tabletserver/tabletserver.go @@ -539,9 +539,6 @@ func (tsv *TabletServer) fullStart() (err error) { if err := tsv.txThrottler.Open(); err != nil { return err } - if err := tsv.te.Init(); err != nil { - return err - } return tsv.serveNewType() } diff --git a/go/vt/vttablet/tabletserver/tabletserver_test.go b/go/vt/vttablet/tabletserver/tabletserver_test.go index 4e5eb7eb523..6241e1d9b87 100644 --- a/go/vt/vttablet/tabletserver/tabletserver_test.go +++ b/go/vt/vttablet/tabletserver/tabletserver_test.go @@ -2756,7 +2756,6 @@ func getSupportedQueries() map[string]*sqltypes.Result { RowsAffected: 1, }, // queries for twopc - sqlTurnoffBinlog: {}, fmt.Sprintf(sqlCreateSidecarDB, "_vt"): {}, fmt.Sprintf(sqlDropLegacy1, "_vt"): {}, fmt.Sprintf(sqlDropLegacy2, "_vt"): {}, diff --git a/go/vt/vttablet/tabletserver/twopc.go b/go/vt/vttablet/tabletserver/twopc.go index d990a6344a5..af8cc6a70eb 100644 --- a/go/vt/vttablet/tabletserver/twopc.go +++ b/go/vt/vttablet/tabletserver/twopc.go @@ -40,7 +40,6 @@ import ( ) const ( - sqlTurnoffBinlog = "set @@session.sql_log_bin = 0" sqlCreateSidecarDB = "create database if not exists %s" sqlDropLegacy1 = "drop table if exists %s.redo_log_transaction" @@ -123,35 +122,8 @@ type TwoPC struct { // NewTwoPC creates a TwoPC variable. func NewTwoPC(readPool *connpool.Pool) *TwoPC { - return &TwoPC{readPool: readPool} -} - -// Init initializes TwoPC. If the metadata database or tables -// are not present, they are created. 
-func (tpc *TwoPC) Init(dbaparams dbconfigs.Connector) error { + tpc := &TwoPC{readPool: readPool} dbname := "_vt" - conn, err := dbconnpool.NewDBConnection(context.TODO(), dbaparams) - if err != nil { - return err - } - defer conn.Close() - statements := []string{ - sqlTurnoffBinlog, - fmt.Sprintf(sqlCreateSidecarDB, dbname), - fmt.Sprintf(sqlDropLegacy1, dbname), - fmt.Sprintf(sqlDropLegacy2, dbname), - fmt.Sprintf(sqlDropLegacy3, dbname), - fmt.Sprintf(sqlDropLegacy4, dbname), - fmt.Sprintf(sqlCreateTableRedoState, dbname), - fmt.Sprintf(sqlCreateTableRedoStatement, dbname), - fmt.Sprintf(sqlCreateTableDTState, dbname), - fmt.Sprintf(sqlCreateTableDTParticipant, dbname), - } - for _, s := range statements { - if _, err := conn.ExecuteFetch(s, 0, false); err != nil { - return err - } - } tpc.insertRedoTx = sqlparser.BuildParsedQuery( "insert into %s.redo_state(dtid, state, time_created) values (%a, %a, %a)", dbname, ":dtid", ":state", ":time_created") @@ -197,12 +169,35 @@ func (tpc *TwoPC) Init(dbaparams dbconfigs.Connector) error { "select dtid, time_created from %s.dt_state where time_created < %a", dbname, ":time_created") tpc.readAllTransactions = fmt.Sprintf(sqlReadAllTransactions, dbname, dbname) - return nil + return tpc } // Open starts the TwoPC service. 
-func (tpc *TwoPC) Open(dbconfigs *dbconfigs.DBConfigs) { +func (tpc *TwoPC) Open(dbconfigs *dbconfigs.DBConfigs) error { + dbname := "_vt" + conn, err := dbconnpool.NewDBConnection(context.TODO(), dbconfigs.DbaWithDB()) + if err != nil { + return err + } + defer conn.Close() + statements := []string{ + fmt.Sprintf(sqlCreateSidecarDB, dbname), + fmt.Sprintf(sqlDropLegacy1, dbname), + fmt.Sprintf(sqlDropLegacy2, dbname), + fmt.Sprintf(sqlDropLegacy3, dbname), + fmt.Sprintf(sqlDropLegacy4, dbname), + fmt.Sprintf(sqlCreateTableRedoState, dbname), + fmt.Sprintf(sqlCreateTableRedoStatement, dbname), + fmt.Sprintf(sqlCreateTableDTState, dbname), + fmt.Sprintf(sqlCreateTableDTParticipant, dbname), + } + for _, s := range statements { + if _, err := conn.ExecuteFetch(s, 0, false); err != nil { + return err + } + } tpc.readPool.Open(dbconfigs.AppWithDB(), dbconfigs.DbaWithDB(), dbconfigs.DbaWithDB()) + return nil } // Close closes the TwoPC service. diff --git a/go/vt/vttablet/tabletserver/tx_engine.go b/go/vt/vttablet/tabletserver/tx_engine.go index 428ef850408..ab778b438b5 100644 --- a/go/vt/vttablet/tabletserver/tx_engine.go +++ b/go/vt/vttablet/tabletserver/tx_engine.go @@ -337,15 +337,6 @@ func (te *TxEngine) transitionTo(nextState txEngineState) error { return nil } -// Init must be called once when vttablet starts for setting -// up the metadata tables. -func (te *TxEngine) Init() error { - if te.twopcEnabled { - return te.twoPC.Init(te.env.Config().DB.DbaWithDB()) - } - return nil -} - // open opens the TxEngine. If 2pc is enabled, it restores // all previously prepared transactions from the redo log. 
// this should only be called when the state is already locked @@ -353,11 +344,14 @@ func (te *TxEngine) open() { te.txPool.Open(te.env.Config().DB.AppWithDB(), te.env.Config().DB.DbaWithDB(), te.env.Config().DB.AppDebugWithDB()) if te.twopcEnabled && te.state == AcceptingReadAndWrite { - te.twoPC.Open(te.env.Config().DB) + // If there are errors, we choose to raise an alert and + // continue anyway. Serving traffic is considered more important + // than blocking everything for the sake of a few transactions. + if err := te.twoPC.Open(te.env.Config().DB); err != nil { + te.env.Stats().InternalErrors.Add("TwopcOpen", 1) + log.Errorf("Could not open TwoPC engine: %v", err) + } if err := te.prepareFromRedo(); err != nil { - // If this operation fails, we choose to raise an alert and - // continue anyway. Serving traffic is considered more important - // than blocking everything for the sake of a few transactions. te.env.Stats().InternalErrors.Add("TwopcResurrection", 1) log.Errorf("Could not prepare transactions: %v", err) } diff --git a/go/vt/vttablet/tabletserver/vstreamer/vstreamer.go b/go/vt/vttablet/tabletserver/vstreamer/vstreamer.go index d7f13005720..445eff89464 100644 --- a/go/vt/vttablet/tabletserver/vstreamer/vstreamer.go +++ b/go/vt/vttablet/tabletserver/vstreamer/vstreamer.go @@ -424,6 +424,10 @@ func (vs *vstreamer) parseEvent(ev mysql.BinlogEvent) ([]*binlogdatapb.VEvent, e Type: binlogdatapb.VEventType_DDL, Ddl: q.SQL, }) + // Reload schema only if the DDL change is relevant. + // TODO(sougou): move this back to always load after + // the schema reload bug is fixed. + vs.se.ReloadAt(context.Background(), vs.pos) } else { // If the DDL need not be sent, send a dummy OTHER event. 
vevents = append(vevents, &binlogdatapb.VEvent{ @@ -433,7 +437,6 @@ func (vs *vstreamer) parseEvent(ev mysql.BinlogEvent) ([]*binlogdatapb.VEvent, e Type: binlogdatapb.VEventType_OTHER, }) } - vs.se.ReloadAt(context.Background(), vs.pos) case sqlparser.StmtOther, sqlparser.StmtPriv: // These are either: // 1) DBA statements like REPAIR that can be ignored. From 7c0b7f65b6dd162afc9d320a6e04ddb6a218c669 Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Sun, 21 Jun 2020 20:03:23 -0700 Subject: [PATCH 06/19] vttablet: standardize vstreamer initialization Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/endtoend/vstreamer_test.go | 1 + .../vreplication/external_connector.go | 5 +- .../vreplication/framework_test.go | 5 +- .../vreplication/vplayer_flaky_test.go | 2 +- go/vt/vttablet/tabletserver/tabletserver.go | 5 +- .../vttablet/tabletserver/vstreamer/engine.go | 53 +++++++++++-------- .../tabletserver/vstreamer/main_test.go | 10 ++-- .../tabletserver/vstreamer/vstreamer_test.go | 5 +- 8 files changed, 51 insertions(+), 35 deletions(-) diff --git a/go/vt/vttablet/endtoend/vstreamer_test.go b/go/vt/vttablet/endtoend/vstreamer_test.go index 67e7aac05bf..76d98f5b398 100644 --- a/go/vt/vttablet/endtoend/vstreamer_test.go +++ b/go/vt/vttablet/endtoend/vstreamer_test.go @@ -409,6 +409,7 @@ func TestSchemaVersioningLongDDL(t *testing.T) { } func runCases(ctx context.Context, t *testing.T, tests []test, eventCh chan []*binlogdatapb.VEvent) { + t.Helper() client := framework.NewClient() for _, test := range tests { diff --git a/go/vt/vttablet/tabletmanager/vreplication/external_connector.go b/go/vt/vttablet/tabletmanager/vreplication/external_connector.go index 9f1b2121c6a..e5c49209418 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/external_connector.go +++ b/go/vt/vttablet/tabletmanager/vreplication/external_connector.go @@ -88,14 +88,15 @@ func (ec *externalConnector) Get(name string) (*mysqlConnector, error) { c := &mysqlConnector{} c.env = 
tabletenv.NewEnv(config, name) c.se = schema.NewEngine(c.env) - c.vstreamer = vstreamer.NewEngine(c.env, nil, c.se) + c.vstreamer = vstreamer.NewEngine(c.env, nil, c.se, "") + c.vstreamer.InitDBConfig("") c.se.InitDBConfig(c.env.Config().DB.DbaWithDB()) // Open if err := c.se.Open(); err != nil { return nil, vterrors.Wrapf(err, "external mysqlConnector: %v", name) } - c.vstreamer.Open("", "") + c.vstreamer.Open() // Register ec.connectors[name] = c diff --git a/go/vt/vttablet/tabletmanager/vreplication/framework_test.go b/go/vt/vttablet/tabletmanager/vreplication/framework_test.go index f30bac4223f..9f18e8ac656 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/framework_test.go +++ b/go/vt/vttablet/tabletmanager/vreplication/framework_test.go @@ -89,8 +89,9 @@ func TestMain(m *testing.M) { // engines cannot be initialized in testenv because it introduces // circular dependencies. - streamerEngine = vstreamer.NewEngine(env.TabletEnv, env.SrvTopo, env.SchemaEngine) - streamerEngine.Open(env.KeyspaceName, env.Cells[0]) + streamerEngine = vstreamer.NewEngine(env.TabletEnv, env.SrvTopo, env.SchemaEngine, env.Cells[0]) + streamerEngine.InitDBConfig(env.KeyspaceName) + streamerEngine.Open() defer streamerEngine.Close() if err := env.Mysqld.ExecuteSuperQuery(context.Background(), fmt.Sprintf("create database %s", vrepldb)); err != nil { diff --git a/go/vt/vttablet/tabletmanager/vreplication/vplayer_flaky_test.go b/go/vt/vttablet/tabletmanager/vreplication/vplayer_flaky_test.go index 0ccc4fbd011..d4977d33308 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/vplayer_flaky_test.go +++ b/go/vt/vttablet/tabletmanager/vreplication/vplayer_flaky_test.go @@ -1901,7 +1901,7 @@ func TestRestartOnVStreamEnd(t *testing.T) { expectDBClientQueries(t, []string{ "/update _vt.vreplication set message='vstream ended'", }) - if err := streamerEngine.Open(env.KeyspaceName, env.ShardName); err != nil { + if err := streamerEngine.Open(); err != nil { t.Fatal(err) } diff --git 
a/go/vt/vttablet/tabletserver/tabletserver.go b/go/vt/vttablet/tabletserver/tabletserver.go index ee0744a14d8..527a6202c72 100644 --- a/go/vt/vttablet/tabletserver/tabletserver.go +++ b/go/vt/vttablet/tabletserver/tabletserver.go @@ -230,7 +230,7 @@ func NewTabletServer(name string, config *tabletenv.TabletConfig, topoServer *to tsv.se = schema.NewEngine(tsv) tsv.hw = heartbeat.NewWriter(tsv, alias) tsv.hr = heartbeat.NewReader(tsv) - tsv.vstreamer = vstreamer.NewEngine(tsv, srvTopoServer, tsv.se) + tsv.vstreamer = vstreamer.NewEngine(tsv, srvTopoServer, tsv.se, alias.Cell) tsv.tracker = schema.NewTracker(tsv, tsv.vstreamer, tsv.se) tsv.watcher = NewReplicationWatcher(tsv, tsv.vstreamer, tsv.config) tsv.qe = NewQueryEngine(tsv, tsv.se) @@ -372,6 +372,7 @@ func (tsv *TabletServer) InitDBConfig(target querypb.Target, dbcfgs *dbconfigs.D tsv.hw.InitDBConfig(target) tsv.hr.InitDBConfig(target) tsv.txThrottler.InitDBConfig(target) + tsv.vstreamer.InitDBConfig(target.Keyspace) return nil } @@ -532,7 +533,7 @@ func (tsv *TabletServer) fullStart() (err error) { if err := tsv.se.Open(); err != nil { return err } - tsv.vstreamer.Open(tsv.target.Keyspace, tsv.alias.Cell) + tsv.vstreamer.Open() if err := tsv.qe.Open(); err != nil { return err } diff --git a/go/vt/vttablet/tabletserver/vstreamer/engine.go b/go/vt/vttablet/tabletserver/vstreamer/engine.go index c8d3455827f..2fb209d3b7a 100644 --- a/go/vt/vttablet/tabletserver/vstreamer/engine.go +++ b/go/vt/vttablet/tabletserver/vstreamer/engine.go @@ -40,19 +40,24 @@ import ( // Engine is the engine for handling vreplication streaming requests. type Engine struct { - env tabletenv.Env + env tabletenv.Env + ts srvtopo.Server + se *schema.Engine + cell string - // mu protects isOpen, streamers, streamIdx and vschema. - mu sync.Mutex + // keyspace is initialized by InitDBConfig + keyspace string - isOpen bool // wg is incremented for every Stream, and decremented on end. 
// Close waits for all current streams to end by waiting on wg. - wg sync.WaitGroup + wg sync.WaitGroup + + mu sync.Mutex + isOpen bool + streamIdx int streamers map[int]*uvstreamer rowStreamers map[int]*rowStreamer resultStreamers map[int]*resultStreamer - streamIdx int // watcherOnce is used for initializing vschema // and setting up the vschema watch. It's guaranteed that @@ -61,12 +66,7 @@ type Engine struct { watcherOnce sync.Once lvschema *localVSchema - // The following members are initialized once at the beginning. - ts srvtopo.Server - se *schema.Engine - keyspace string - cell string - + // stats variables vschemaErrors *stats.Counter vschemaUpdates *stats.Counter } @@ -74,32 +74,40 @@ type Engine struct { // NewEngine creates a new Engine. // Initialization sequence is: NewEngine->InitDBConfig->Open. // Open and Close can be called multiple times and are idempotent. -func NewEngine(env tabletenv.Env, ts srvtopo.Server, se *schema.Engine) *Engine { +func NewEngine(env tabletenv.Env, ts srvtopo.Server, se *schema.Engine, cell string) *Engine { vse := &Engine{ - env: env, + env: env, + ts: ts, + se: se, + cell: cell, + streamers: make(map[int]*uvstreamer), rowStreamers: make(map[int]*rowStreamer), resultStreamers: make(map[int]*resultStreamer), - lvschema: &localVSchema{vschema: &vindexes.VSchema{}}, - ts: ts, - se: se, - vschemaErrors: env.Exporter().NewCounter("VSchemaErrors", "Count of VSchema errors"), - vschemaUpdates: env.Exporter().NewCounter("VSchemaUpdates", "Count of VSchema updates. Does not include errors"), + + lvschema: &localVSchema{vschema: &vindexes.VSchema{}}, + + vschemaErrors: env.Exporter().NewCounter("VSchemaErrors", "Count of VSchema errors"), + vschemaUpdates: env.Exporter().NewCounter("VSchemaUpdates", "Count of VSchema updates. Does not include errors"), } env.Exporter().HandleFunc("/debug/tablet_vschema", vse.ServeHTTP) return vse } +// InitDBConfig initializes the target parameters for the Engine. 
+func (vse *Engine) InitDBConfig(keyspace string) { + vse.keyspace = keyspace +} + // Open starts the Engine service. -func (vse *Engine) Open(keyspace, cell string) error { +func (vse *Engine) Open() error { vse.mu.Lock() defer vse.mu.Unlock() if vse.isOpen { return nil } + log.Info("VStreamer is open.") vse.isOpen = true - vse.keyspace = keyspace - vse.cell = cell return nil } @@ -134,6 +142,7 @@ func (vse *Engine) Close() { // Wait only after releasing the lock because the end of every // stream will use the lock to remove the entry from streamers. vse.wg.Wait() + log.Info("VStreamer is closed.") } func (vse *Engine) vschema() *vindexes.VSchema { diff --git a/go/vt/vttablet/tabletserver/vstreamer/main_test.go b/go/vt/vttablet/tabletserver/vstreamer/main_test.go index c5b0f6ac0e0..37c178d8765 100644 --- a/go/vt/vttablet/tabletserver/vstreamer/main_test.go +++ b/go/vt/vttablet/tabletserver/vstreamer/main_test.go @@ -52,8 +52,9 @@ func TestMain(m *testing.M) { // engine cannot be initialized in testenv because it introduces // circular dependencies - engine = NewEngine(env.TabletEnv, env.SrvTopo, env.SchemaEngine) - engine.Open(env.KeyspaceName, env.Cells[0]) + engine = NewEngine(env.TabletEnv, env.SrvTopo, env.SchemaEngine, env.Cells[0]) + engine.InitDBConfig(env.KeyspaceName) + engine.Open() defer engine.Close() return m.Run() @@ -68,7 +69,8 @@ func customEngine(t *testing.T, modifier func(mysql.ConnParams) mysql.ConnParams config := env.TabletEnv.Config().Clone() config.DB = dbconfigs.NewTestDBConfigs(modified, modified, modified.DbName) - engine := NewEngine(tabletenv.NewEnv(config, "VStreamerTest"), env.SrvTopo, env.SchemaEngine) - engine.Open(env.KeyspaceName, env.Cells[0]) + engine := NewEngine(tabletenv.NewEnv(config, "VStreamerTest"), env.SrvTopo, env.SchemaEngine, env.Cells[0]) + engine.InitDBConfig(env.KeyspaceName) + engine.Open() return engine } diff --git a/go/vt/vttablet/tabletserver/vstreamer/vstreamer_test.go 
b/go/vt/vttablet/tabletserver/vstreamer/vstreamer_test.go index ab8431d2748..60f093c44f8 100644 --- a/go/vt/vttablet/tabletserver/vstreamer/vstreamer_test.go +++ b/go/vt/vttablet/tabletserver/vstreamer/vstreamer_test.go @@ -57,8 +57,9 @@ func TestVersion(t *testing.T) { require.NoError(t, err) defer env.SchemaEngine.EnableHistorian(false) - engine = NewEngine(engine.env, env.SrvTopo, env.SchemaEngine) - engine.Open(env.KeyspaceName, env.Cells[0]) + engine = NewEngine(engine.env, env.SrvTopo, env.SchemaEngine, env.Cells[0]) + engine.InitDBConfig(env.KeyspaceName) + engine.Open() defer engine.Close() execStatements(t, []string{ From 0491e9b097a3ac2acb5d46d9c0fb16b478fd569a Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Sun, 21 Jun 2020 21:33:27 -0700 Subject: [PATCH 07/19] vttablet: further standardize heartbeat Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/heartbeat/writer.go | 79 ++++++--------------- go/vt/vttablet/heartbeat/writer_test.go | 54 ++++++-------- go/vt/vttablet/tabletserver/tabletserver.go | 4 +- go/vt/withddl/withddl.go | 3 +- 4 files changed, 47 insertions(+), 93 deletions(-) diff --git a/go/vt/vttablet/heartbeat/writer.go b/go/vt/vttablet/heartbeat/writer.go index 74f1db56a7a..10b9ae975d2 100644 --- a/go/vt/vttablet/heartbeat/writer.go +++ b/go/vt/vttablet/heartbeat/writer.go @@ -21,14 +21,12 @@ import ( "sync" "time" - "vitess.io/vitess/go/vt/vterrors" + "vitess.io/vitess/go/vt/withddl" "golang.org/x/net/context" "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/timer" - "vitess.io/vitess/go/vt/dbconfigs" - "vitess.io/vitess/go/vt/dbconnpool" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/logutil" "vitess.io/vitess/go/vt/sqlparser" @@ -46,10 +44,15 @@ const ( tabletUid INT UNSIGNED NOT NULL, ts BIGINT UNSIGNED NOT NULL ) engine=InnoDB` - sqlInsertInitialRow = "INSERT INTO %s.heartbeat (ts, tabletUid, keyspaceShard) VALUES (%a, %a, %a) ON DUPLICATE KEY UPDATE ts=VALUES(ts)" - sqlUpdateHeartbeat = "UPDATE %s.heartbeat SET 
ts=%a, tabletUid=%a WHERE keyspaceShard=%a" + sqlUpsertHeartbeat = "INSERT INTO %s.heartbeat (ts, tabletUid, keyspaceShard) VALUES (%a, %a, %a) ON DUPLICATE KEY UPDATE ts=VALUES(ts), tabletUid=VALUES(tabletUid)" + sqlUpdateHeartbeat = "UPDATE %s.heartbeat SET ts=%a, tabletUid=%a WHERE keyspaceShard=%a" ) +var withDDL = withddl.New([]string{ + fmt.Sprintf(sqlCreateSidecarDB, "_vt"), + fmt.Sprintf(sqlCreateHeartbeatTable, "_vt"), +}) + // Writer runs on master tablets and writes heartbeats to the _vt.heartbeat // table at a regular interval, defined by heartbeat_interval. type Writer struct { @@ -99,26 +102,20 @@ func (w *Writer) InitDBConfig(target querypb.Target) { // responsible for periodically writing to the heartbeat table. // Open may be called multiple times, as long as it was closed since // last invocation. -func (w *Writer) Open() error { +func (w *Writer) Open() { if !w.enabled { - return nil + return } w.mu.Lock() defer w.mu.Unlock() if w.isOpen { - return nil - } - - if err := w.initializeTables(w.env.Config().DB.AppWithDB()); err != nil { - w.recordError(err) - return err + return } log.Info("Beginning heartbeat writes") w.pool.Open(w.env.Config().DB.AppWithDB(), w.env.Config().DB.DbaWithDB(), w.env.Config().DB.AppDebugWithDB()) - w.ticks.Start(func() { w.writeHeartbeat() }) + w.ticks.Start(w.writeHeartbeat) w.isOpen = true - return nil } // Close closes the Writer's db connection and stops the periodic ticker. A writer @@ -138,36 +135,6 @@ func (w *Writer) Close() { w.isOpen = false } -// initializeTables attempts to create the heartbeat tables and record an -// initial row. The row is created only on master and is replicated to all -// other servers. 
-func (w *Writer) initializeTables(cp dbconfigs.Connector) error { - conn, err := dbconnpool.NewDBConnection(context.TODO(), cp) - if err != nil { - return vterrors.Wrap(err, "Failed to create connection for heartbeat") - } - defer conn.Close() - statements := []string{ - fmt.Sprintf(sqlCreateSidecarDB, "_vt"), - fmt.Sprintf(sqlCreateHeartbeatTable, "_vt"), - } - for _, s := range statements { - if _, err := conn.ExecuteFetch(s, 0, false); err != nil { - return vterrors.Wrap(err, "Failed to execute heartbeat init query") - } - } - insert, err := w.bindHeartbeatVars(sqlInsertInitialRow) - if err != nil { - return vterrors.Wrap(err, "Failed to bindHeartbeatVars initial heartbeat insert") - } - _, err = conn.ExecuteFetch(insert, 0, false) - if err != nil { - return vterrors.Wrap(err, "Failed to execute initial heartbeat insert") - } - writes.Add(1) - return nil -} - // bindHeartbeatVars takes a heartbeat write (insert or update) and // adds the necessary fields to the query as bind vars. This is done // to protect ourselves against a badly formed keyspace or shard name. @@ -187,29 +154,27 @@ func (w *Writer) bindHeartbeatVars(query string) (string, error) { // writeHeartbeat updates the heartbeat row for this tablet with the current time in nanoseconds. 
func (w *Writer) writeHeartbeat() { - defer w.env.LogError() - ctx, cancel := context.WithDeadline(context.Background(), w.now().Add(w.interval)) - defer cancel() - update, err := w.bindHeartbeatVars(sqlUpdateHeartbeat) - if err != nil { - w.recordError(err) - return - } - err = w.exec(ctx, update) - if err != nil { + if err := w.write(); err != nil { w.recordError(err) return } writes.Add(1) } -func (w *Writer) exec(ctx context.Context, query string) error { +func (w *Writer) write() error { + defer w.env.LogError() + ctx, cancel := context.WithDeadline(context.Background(), w.now().Add(w.interval)) + defer cancel() + upsert, err := w.bindHeartbeatVars(sqlUpsertHeartbeat) + if err != nil { + return err + } conn, err := w.pool.Get(ctx) if err != nil { return err } defer conn.Recycle() - _, err = conn.Exec(ctx, query, 0, false) + _, err = withDDL.Exec(ctx, upsert, conn.Exec) if err != nil { return err } diff --git a/go/vt/vttablet/heartbeat/writer_test.go b/go/vt/vttablet/heartbeat/writer_test.go index d0d8303d467..0d80c676236 100644 --- a/go/vt/vttablet/heartbeat/writer_test.go +++ b/go/vt/vttablet/heartbeat/writer_test.go @@ -21,6 +21,9 @@ import ( "testing" "time" + "github.com/stretchr/testify/require" + "gotest.tools/assert" + "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/mysql/fakesqldb" "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/vt/dbconfigs" @@ -35,10 +38,6 @@ var ( } ) -// TestCreateSchema tests that our initial INSERT uses -// the proper arguments. It also sanity checks the other init -// queries for completeness, and verifies that we return any -// failure that is encountered. 
func TestCreateSchema(t *testing.T) { db := fakesqldb.New(t) defer db.Close() @@ -46,44 +45,39 @@ func TestCreateSchema(t *testing.T) { defer tw.Close() writes.Reset() - db.AddQuery(fmt.Sprintf(sqlCreateHeartbeatTable, "_vt"), &sqltypes.Result{}) - db.AddQuery(fmt.Sprintf("INSERT INTO %s.heartbeat (ts, tabletUid, keyspaceShard) VALUES (%d, %d, '%s') ON DUPLICATE KEY UPDATE ts=VALUES(ts)", "_vt", now.UnixNano(), tw.tabletAlias.Uid, tw.keyspaceShard), &sqltypes.Result{}) - if err := tw.initializeTables(db.ConnParams()); err == nil { - t.Fatal("initializeTables() should not have succeeded") + db.OrderMatters() + upsert := fmt.Sprintf("INSERT INTO %s.heartbeat (ts, tabletUid, keyspaceShard) VALUES (%d, %d, '%s') ON DUPLICATE KEY UPDATE ts=VALUES(ts), tabletUid=VALUES(tabletUid)", + "_vt", now.UnixNano(), tw.tabletAlias.Uid, tw.keyspaceShard) + failInsert := fakesqldb.ExpectedExecuteFetch{ + Query: upsert, + Error: mysql.NewSQLError(mysql.ERBadDb, "", "bad db error"), } + db.AddExpectedExecuteFetch(failInsert) + db.AddExpectedQuery(fmt.Sprintf(sqlCreateSidecarDB, "_vt"), nil) + db.AddExpectedQuery(fmt.Sprintf(sqlCreateHeartbeatTable, "_vt"), nil) + db.AddExpectedQuery(upsert, nil) - db.AddQuery(fmt.Sprintf(sqlCreateSidecarDB, "_vt"), &sqltypes.Result{}) - if err := tw.initializeTables(db.ConnParams()); err != nil { - t.Fatalf("Should not be in error: %v", err) - } - - if got, want := writes.Get(), int64(1); got != want { - t.Fatalf("wrong writes count: got = %v, want = %v", got, want) - } + err := tw.write() + require.NoError(t, err) } -// TestWriteHearbeat ensures the proper arguments for the UPDATE query -// and writes get recorded in counters. 
func TestWriteHeartbeat(t *testing.T) { db := fakesqldb.New(t) defer db.Close() tw := newTestWriter(db, mockNowFunc) - db.AddQuery(fmt.Sprintf("UPDATE %s.heartbeat SET ts=%d, tabletUid=%d WHERE keyspaceShard='%s'", "_vt", now.UnixNano(), tw.tabletAlias.Uid, tw.keyspaceShard), &sqltypes.Result{}) + upsert := fmt.Sprintf("INSERT INTO %s.heartbeat (ts, tabletUid, keyspaceShard) VALUES (%d, %d, '%s') ON DUPLICATE KEY UPDATE ts=VALUES(ts), tabletUid=VALUES(tabletUid)", + "_vt", now.UnixNano(), tw.tabletAlias.Uid, tw.keyspaceShard) + db.AddQuery(upsert, &sqltypes.Result{}) writes.Reset() writeErrors.Reset() tw.writeHeartbeat() - if got, want := writes.Get(), int64(1); got != want { - t.Fatalf("wrong writes count: got = %v; want = %v", got, want) - } - if got, want := writeErrors.Get(), int64(0); got != want { - t.Fatalf("wrong write errors count: got = %v; want = %v", got, want) - } + assert.Equal(t, int64(1), writes.Get()) + assert.Equal(t, int64(0), writeErrors.Get()) } -// TestWriteHeartbeatError ensures that we properly account for write errors. 
func TestWriteHeartbeatError(t *testing.T) { db := fakesqldb.New(t) defer db.Close() @@ -94,12 +88,8 @@ func TestWriteHeartbeatError(t *testing.T) { writeErrors.Reset() tw.writeHeartbeat() - if got, want := writes.Get(), int64(0); got != want { - t.Fatalf("wrong writes count: got = %v; want = %v", got, want) - } - if got, want := writeErrors.Get(), int64(1); got != want { - t.Fatalf("wrong write errors count: got = %v; want = %v", got, want) - } + assert.Equal(t, int64(0), writes.Get()) + assert.Equal(t, int64(1), writeErrors.Get()) } func newTestWriter(db *fakesqldb.DB, nowFunc func() time.Time) *Writer { diff --git a/go/vt/vttablet/tabletserver/tabletserver.go b/go/vt/vttablet/tabletserver/tabletserver.go index 527a6202c72..741e838cf63 100644 --- a/go/vt/vttablet/tabletserver/tabletserver.go +++ b/go/vt/vttablet/tabletserver/tabletserver.go @@ -548,9 +548,7 @@ func (tsv *TabletServer) serveNewType() (err error) { tsv.watcher.Close() tsv.hr.Close() - if err := tsv.hw.Open(); err != nil { - return err - } + tsv.hw.Open() tsv.tracker.Open() tsv.te.AcceptReadWrite() tsv.messager.Open() diff --git a/go/vt/withddl/withddl.go b/go/vt/withddl/withddl.go index 95fb3fd3aaf..c00afc35e68 100644 --- a/go/vt/withddl/withddl.go +++ b/go/vt/withddl/withddl.go @@ -26,6 +26,7 @@ import ( "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/vt/log" + "vitess.io/vitess/go/vt/sqlparser" ) // WithDDL allows you to execute statements against @@ -64,7 +65,7 @@ func (wd *WithDDL) Exec(ctx context.Context, query string, f interface{}) (*sqlt return nil, err } - log.Infof("Updating schema for %v and retrying: %v", query, err) + log.Infof("Updating schema for %v and retrying: %v", sqlparser.TruncateForUI(err.Error()), err) for _, query := range wd.ddls { _, merr := exec(query) if merr == nil { From be69987b90a6568565a684e084eabd934bddf1ec Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Sat, 27 Jun 2020 12:28:21 -0700 Subject: [PATCH 08/19] vttablet: move 
functions to state_manager.go Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/tabletserver/query_engine.go | 16 +- go/vt/vttablet/tabletserver/state_manager.go | 466 ++++++++++++++++++ go/vt/vttablet/tabletserver/tabletserver.go | 492 ++----------------- 3 files changed, 500 insertions(+), 474 deletions(-) create mode 100644 go/vt/vttablet/tabletserver/state_manager.go diff --git a/go/vt/vttablet/tabletserver/query_engine.go b/go/vt/vttablet/tabletserver/query_engine.go index 416782b5313..7bbec2e9ede 100644 --- a/go/vt/vttablet/tabletserver/query_engine.go +++ b/go/vt/vttablet/tabletserver/query_engine.go @@ -28,7 +28,6 @@ import ( "vitess.io/vitess/go/acl" "vitess.io/vitess/go/cache" - "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/stats" "vitess.io/vitess/go/streamlog" "vitess.io/vitess/go/sync2" @@ -364,18 +363,15 @@ func (qe *QueryEngine) ClearQueryPlanCache() { qe.plans.Clear() } -// IsMySQLReachable returns true if we can connect to MySQL. -func (qe *QueryEngine) IsMySQLReachable() bool { - conn, err := dbconnpool.NewDBConnection(context.TODO(), qe.env.Config().DB.DbaWithDB()) +// IsMySQLReachable returns an error if it cannot connect to MySQL. +// This can be called before opening the QueryEngine. +func (qe *QueryEngine) IsMySQLReachable() error { + conn, err := dbconnpool.NewDBConnection(context.TODO(), qe.env.Config().DB.AppWithDB()) if err != nil { - if mysql.IsConnErr(err) { - return false - } - log.Warningf("checking MySQL, unexpected error: %v", err) - return true + return err } conn.Close() - return true + return nil } func (qe *QueryEngine) schemaChanged(tables map[string]*schema.Table, created, altered, dropped []string) { diff --git a/go/vt/vttablet/tabletserver/state_manager.go b/go/vt/vttablet/tabletserver/state_manager.go new file mode 100644 index 00000000000..19c4f04bd8b --- /dev/null +++ b/go/vt/vttablet/tabletserver/state_manager.go @@ -0,0 +1,466 @@ +/* +Copyright 2020 The Vitess Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package tabletserver + +import ( + "fmt" + "time" + + "golang.org/x/net/context" + "vitess.io/vitess/go/vt/dbconfigs" + "vitess.io/vitess/go/vt/log" + querypb "vitess.io/vitess/go/vt/proto/query" + topodatapb "vitess.io/vitess/go/vt/proto/topodata" + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" + "vitess.io/vitess/go/vt/vterrors" + "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" +) + +const ( + // StateNotConnected is the state where tabletserver is not + // connected to an underlying mysql instance. + StateNotConnected = iota + // StateNotServing is the state where tabletserver is connected + // to an underlying mysql instance, but is not serving queries. + StateNotServing + // StateServing is where queries are allowed. + StateServing + // StateTransitioning is a transient state indicating that + // the tabletserver is tranisitioning to a new state. + // In order to achieve clean transitions, no requests are + // allowed during this state. + StateTransitioning + // StateShuttingDown indicates that the tabletserver + // is shutting down. In this state, we wait for outstanding + // requests and transactions to conclude. + StateShuttingDown +) + +// stateName names every state. The number of elements must +// match the number of states. Names can overlap. 
+var stateName = []string{ + "NOT_SERVING", + "NOT_SERVING", + "SERVING", + "NOT_SERVING", + "SHUTTING_DOWN", +} + +// stateDetail matches every state and optionally more information about the reason +// why the state is serving / not serving. +var stateDetail = []string{ + "Not Connected", + "Not Serving", + "", + "Transitioning", + "Shutting Down", +} + +// stateInfo returns a string representation of the state and optional detail +// about the reason for the state transition +func stateInfo(state int64) string { + if state == StateServing { + return "SERVING" + } + return fmt.Sprintf("%s (%s)", stateName[state], stateDetail[state]) +} + +// EnterLameduck causes tabletserver to enter the lameduck state. This +// state causes health checks to fail, but the behavior of tabletserver +// otherwise remains the same. Any subsequent calls to SetServingType will +// cause the tabletserver to exit this mode. +func (tsv *TabletServer) EnterLameduck() { + tsv.lameduck.Set(1) +} + +// ExitLameduck causes the tabletserver to exit the lameduck mode. +func (tsv *TabletServer) ExitLameduck() { + tsv.lameduck.Set(0) +} + +// GetState returns the name of the current TabletServer state. +func (tsv *TabletServer) GetState() string { + if tsv.lameduck.Get() != 0 { + return "NOT_SERVING" + } + tsv.mu.Lock() + name := stateName[tsv.state] + tsv.mu.Unlock() + return name +} + +// setState changes the state and logs the event. +// It requires the caller to hold a lock on mu. +func (tsv *TabletServer) setState(state int64) { + log.Infof("TabletServer state: %s -> %s", stateInfo(tsv.state), stateInfo(state)) + tsv.state = state + tsv.history.Add(&historyRecord{ + Time: time.Now(), + ServingState: stateInfo(state), + TabletType: tsv.target.TabletType.String(), + }) +} + +// transition obtains a lock and changes the state. 
+func (tsv *TabletServer) transition(newState int64) { + tsv.mu.Lock() + tsv.setState(newState) + tsv.mu.Unlock() +} + +// IsServing returns true if TabletServer is in SERVING state. +func (tsv *TabletServer) IsServing() bool { + return tsv.GetState() == "SERVING" +} + +// StartService is a convenience function for InitDBConfig->SetServingType +// with serving=true. +func (tsv *TabletServer) StartService(target querypb.Target, dbcfgs *dbconfigs.DBConfigs) (err error) { + err = tsv.InitDBConfig(target, dbcfgs) + if err != nil { + return err + } + _ /* state changed */, err = tsv.SetServingType(target.TabletType, true, nil) + return err +} + +const ( + actionNone = iota + actionFullStart + actionServeNewType + actionGracefulStop +) + +// SetServingType changes the serving type of the tabletserver. It starts or +// stops internal services as deemed necessary. The tabletType determines the +// primary serving type, while alsoAllow specifies other tablet types that +// should also be honored for serving. +// Returns true if the state of QueryService or the tablet type changed. 
+func (tsv *TabletServer) SetServingType(tabletType topodatapb.TabletType, serving bool, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { + defer tsv.ExitLameduck() + + action, err := tsv.decideAction(tabletType, serving, alsoAllow) + if err != nil { + return false, err + } + switch action { + case actionNone: + return false, nil + case actionFullStart: + if err := tsv.fullStart(); err != nil { + tsv.closeAll() + return true, err + } + return true, nil + case actionServeNewType: + if err := tsv.serveNewType(); err != nil { + tsv.closeAll() + return true, err + } + return true, nil + case actionGracefulStop: + tsv.gracefulStop() + return true, nil + } + panic("unreachable") +} + +func (tsv *TabletServer) decideAction(tabletType topodatapb.TabletType, serving bool, alsoAllow []topodatapb.TabletType) (action int, err error) { + tsv.mu.Lock() + defer tsv.mu.Unlock() + + tsv.alsoAllow = alsoAllow + + // Handle the case where the requested TabletType and serving state + // match our current state. This avoids an unnecessary transition. + // There's no similar shortcut if serving is false, because there + // are different 'not serving' states that require different actions. + if tsv.target.TabletType == tabletType { + if serving && tsv.state == StateServing { + // We're already in the desired state. 
+ return actionNone, nil + } + } + tsv.target.TabletType = tabletType + switch tsv.state { + case StateNotConnected: + if serving { + tsv.setState(StateTransitioning) + return actionFullStart, nil + } + case StateNotServing: + if serving { + tsv.setState(StateTransitioning) + return actionServeNewType, nil + } + case StateServing: + if !serving { + tsv.setState(StateShuttingDown) + return actionGracefulStop, nil + } + tsv.setState(StateTransitioning) + return actionServeNewType, nil + case StateTransitioning, StateShuttingDown: + return actionNone, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "cannot SetServingType, current state: %s", stateName[tsv.state]) + default: + panic("unreachable") + } + return actionNone, nil +} + +func (tsv *TabletServer) fullStart() error { + if err := tsv.qe.IsMySQLReachable(); err != nil { + return err + } + if err := tsv.se.Open(); err != nil { + return err + } + tsv.vstreamer.Open() + if err := tsv.qe.Open(); err != nil { + return err + } + if err := tsv.txThrottler.Open(); err != nil { + return err + } + return tsv.serveNewType() +} + +func (tsv *TabletServer) serveNewType() (err error) { + if tsv.target.TabletType == topodatapb.TabletType_MASTER { + tsv.watcher.Close() + tsv.hr.Close() + + tsv.hw.Open() + tsv.tracker.Open() + tsv.te.AcceptReadWrite() + tsv.messager.Open() + } else { + tsv.messager.Close() + tsv.te.AcceptReadOnly() + tsv.tracker.Close() + tsv.hw.Close() + tsv.se.MakeNonMaster() + + tsv.hr.Open() + tsv.watcher.Open() + } + tsv.transition(StateServing) + return nil +} + +func (tsv *TabletServer) gracefulStop() { + defer close(tsv.setTimeBomb()) + tsv.waitForShutdown() + tsv.transition(StateNotServing) +} + +// StopService shuts down the tabletserver to the uninitialized state. +// It first transitions to StateShuttingDown, then waits for active +// services to shut down. Then it shuts down the rest. This function +// should be called before process termination, or if MySQL is unreachable. 
+// Under normal circumstances, SetServingType should be called. +func (tsv *TabletServer) StopService() { + defer close(tsv.setTimeBomb()) + defer tsv.LogError() + + tsv.mu.Lock() + if tsv.state != StateServing && tsv.state != StateNotServing { + tsv.mu.Unlock() + return + } + tsv.setState(StateShuttingDown) + tsv.mu.Unlock() + + log.Info("Executing complete shutdown.") + tsv.waitForShutdown() + tsv.qe.Close() + tsv.watcher.Close() + tsv.vstreamer.Close() + tsv.hr.Close() + tsv.hw.Close() + tsv.se.Close() + log.Info("Shutdown complete.") + tsv.transition(StateNotConnected) +} + +func (tsv *TabletServer) waitForShutdown() { + tsv.messager.Close() + tsv.te.Close() + tsv.txThrottler.Close() + tsv.tracker.Close() + tsv.qe.StopServing() + tsv.requests.Wait() +} + +// closeAll is called if TabletServer fails to start. +// It forcibly shuts down everything. +func (tsv *TabletServer) closeAll() { + tsv.messager.Close() + tsv.te.Close() + tsv.txThrottler.Close() + tsv.qe.StopServing() + tsv.qe.Close() + tsv.watcher.Close() + tsv.tracker.Close() + tsv.vstreamer.Close() + tsv.hr.Close() + tsv.hw.Close() + tsv.se.Close() + tsv.transition(StateNotConnected) +} + +func (tsv *TabletServer) setTimeBomb() chan struct{} { + done := make(chan struct{}) + go func() { + qt := tsv.QueryTimeout.Get() + if qt == 0 { + return + } + tmr := time.NewTimer(10 * qt) + defer tmr.Stop() + select { + case <-tmr.C: + log.Fatal("Shutdown took too long. Crashing") + case <-done: + } + }() + return done +} + +// CheckMySQL initiates a check to see if MySQL is reachable. +// If not, it shuts down the query service. The check is rate-limited +// to no more than once per second. +// The function satisfies tabletenv.Env. 
+func (tsv *TabletServer) CheckMySQL() { + if !tsv.checkMySQLThrottler.TryAcquire() { + return + } + go func() { + defer func() { + tsv.LogError() + time.Sleep(1 * time.Second) + tsv.checkMySQLThrottler.Release() + }() + if tsv.isMySQLReachable() { + return + } + log.Info("Check MySQL failed. Shutting down query service") + tsv.StopService() + }() +} + +// isMySQLReachable returns true if we can connect to MySQL. +// The function returns false only if the query service is +// in StateServing or StateNotServing. +func (tsv *TabletServer) isMySQLReachable() bool { + tsv.mu.Lock() + switch tsv.state { + case StateServing: + // Prevent transition out of this state by + // reserving a request. + tsv.requests.Add(1) + defer tsv.requests.Done() + case StateNotServing: + // Prevent transition out of this state by + // temporarily switching to StateTransitioning. + tsv.setState(StateTransitioning) + defer func() { + tsv.transition(StateNotServing) + }() + default: + tsv.mu.Unlock() + return true + } + tsv.mu.Unlock() + if err := tsv.qe.IsMySQLReachable(); err != nil { + log.Errorf("Cannot connect to MySQL: %v", err) + return false + } + return true +} + +// startRequest validates the current state and target and registers +// the request (a waitgroup) as started. Every startRequest requires +// one and only one corresponding endRequest. When the service shuts +// down, StopService will wait on this waitgroup to ensure that there +// are no requests in flight. 
+func (tsv *TabletServer) startRequest(ctx context.Context, target *querypb.Target, allowOnShutdown bool) (err error) { + tsv.mu.Lock() + defer tsv.mu.Unlock() + if tsv.state == StateServing { + goto verifyTarget + } + if allowOnShutdown && tsv.state == StateShuttingDown { + goto verifyTarget + } + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "operation not allowed in state %s", stateName[tsv.state]) + +verifyTarget: + if target != nil { + // a valid target needs to be used + switch { + case target.Keyspace != tsv.target.Keyspace: + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid keyspace %v", target.Keyspace) + case target.Shard != tsv.target.Shard: + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid shard %v", target.Shard) + case target.TabletType != tsv.target.TabletType: + for _, otherType := range tsv.alsoAllow { + if target.TabletType == otherType { + goto ok + } + } + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet type: %v, want: %v or %v", target.TabletType, tsv.target.TabletType, tsv.alsoAllow) + } + } else if !tabletenv.IsLocalContext(ctx) { + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "No target") + } + +ok: + tsv.requests.Add(1) + return nil +} + +// endRequest unregisters the current request (a waitgroup) as done. +func (tsv *TabletServer) endRequest() { + tsv.requests.Done() +} + +// verifyTarget allows requests to be executed even in non-serving state. 
+func (tsv *TabletServer) verifyTarget(ctx context.Context, target *querypb.Target) error { + tsv.mu.Lock() + defer tsv.mu.Unlock() + + if target != nil { + // a valid target needs to be used + switch { + case target.Keyspace != tsv.target.Keyspace: + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid keyspace %v", target.Keyspace) + case target.Shard != tsv.target.Shard: + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid shard %v", target.Shard) + case target.TabletType != tsv.target.TabletType: + for _, otherType := range tsv.alsoAllow { + if target.TabletType == otherType { + return nil + } + } + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet type: %v, want: %v or %v", target.TabletType, tsv.target.TabletType, tsv.alsoAllow) + } + } else if !tabletenv.IsLocalContext(ctx) { + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "No target") + } + return nil +} diff --git a/go/vt/vttablet/tabletserver/tabletserver.go b/go/vt/vttablet/tabletserver/tabletserver.go index 741e838cf63..71b871ebbc0 100644 --- a/go/vt/vttablet/tabletserver/tabletserver.go +++ b/go/vt/vttablet/tabletserver/tabletserver.go @@ -40,7 +40,6 @@ import ( "vitess.io/vitess/go/trace" "vitess.io/vitess/go/vt/callerid" "vitess.io/vitess/go/vt/dbconfigs" - "vitess.io/vitess/go/vt/dbconnpool" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/logutil" binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" @@ -65,60 +64,11 @@ import ( "vitess.io/vitess/go/vt/vttablet/tabletserver/vstreamer" ) -const ( - // StateNotConnected is the state where tabletserver is not - // connected to an underlying mysql instance. - StateNotConnected = iota - // StateNotServing is the state where tabletserver is connected - // to an underlying mysql instance, but is not serving queries. - StateNotServing - // StateServing is where queries are allowed. 
- StateServing - // StateTransitioning is a transient state indicating that - // the tabletserver is tranisitioning to a new state. - // In order to achieve clean transitions, no requests are - // allowed during this state. - StateTransitioning - // StateShuttingDown indicates that the tabletserver - // is shutting down. In this state, we wait for outstanding - // requests and transactions to conclude. - StateShuttingDown -) - // logPoolFull is for throttling transaction / query pool full messages in the log. var logPoolFull = logutil.NewThrottledLogger("PoolFull", 1*time.Minute) var logComputeRowSerializerKey = logutil.NewThrottledLogger("ComputeRowSerializerKey", 1*time.Minute) -// stateName names every state. The number of elements must -// match the number of states. Names can overlap. -var stateName = []string{ - "NOT_SERVING", - "NOT_SERVING", - "SERVING", - "NOT_SERVING", - "SHUTTING_DOWN", -} - -// stateDetail matches every state and optionally more information about the reason -// why the state is serving / not serving. -var stateDetail = []string{ - "Not Connected", - "Not Serving", - "", - "Transitioning", - "Shutting Down", -} - -// stateInfo returns a string representation of the state and optional detail -// about the reason for the state transition -func stateInfo(state int64) string { - if state == StateServing { - return "SERVING" - } - return fmt.Sprintf("%s (%s)", stateName[state], stateDetail[state]) -} - // TabletServer implements the RPC interface for the query service. // TabletServer is initialized in the following sequence: // NewTabletServer->InitDBConfig->SetServingType. @@ -259,16 +209,23 @@ func NewTabletServer(name string, config *tabletenv.TabletConfig, topoServer *to return tsv } -// SetTracking forces tracking to be on or off. -// Only to be used for testing. -func (tsv *TabletServer) SetTracking(enabled bool) { - tsv.tracker.Enable(enabled) -} +// InitDBConfig initializes the db config variables for TabletServer. 
You must call this function before +// calling SetServingType. +func (tsv *TabletServer) InitDBConfig(target querypb.Target, dbcfgs *dbconfigs.DBConfigs) error { + tsv.mu.Lock() + defer tsv.mu.Unlock() + if tsv.state != StateNotConnected { + return vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "InitDBConfig failed, current state: %s", stateName[tsv.state]) + } + tsv.target = target + tsv.config.DB = dbcfgs -// EnableHistorian forces historian to be on or off. -// Only to be used for testing. -func (tsv *TabletServer) EnableHistorian(enabled bool) { - _ = tsv.se.EnableHistorian(enabled) + tsv.se.InitDBConfig(tsv.config.DB.DbaWithDB()) + tsv.hw.InitDBConfig(target) + tsv.hr.InitDBConfig(target) + tsv.txThrottler.InitDBConfig(target) + tsv.vstreamer.InitDBConfig(target.Keyspace) + return nil } // Register prepares TabletServer for serving by calling @@ -322,60 +279,6 @@ func (tsv *TabletServer) SetQueryRules(ruleSource string, qrs *rules.Rules) erro return nil } -// GetState returns the name of the current TabletServer state. -func (tsv *TabletServer) GetState() string { - if tsv.lameduck.Get() != 0 { - return "NOT_SERVING" - } - tsv.mu.Lock() - name := stateName[tsv.state] - tsv.mu.Unlock() - return name -} - -// setState changes the state and logs the event. -// It requires the caller to hold a lock on mu. -func (tsv *TabletServer) setState(state int64) { - log.Infof("TabletServer state: %s -> %s", stateInfo(tsv.state), stateInfo(state)) - tsv.state = state - tsv.history.Add(&historyRecord{ - Time: time.Now(), - ServingState: stateInfo(state), - TabletType: tsv.target.TabletType.String(), - }) -} - -// transition obtains a lock and changes the state. -func (tsv *TabletServer) transition(newState int64) { - tsv.mu.Lock() - tsv.setState(newState) - tsv.mu.Unlock() -} - -// IsServing returns true if TabletServer is in SERVING state. 
-func (tsv *TabletServer) IsServing() bool { - return tsv.GetState() == "SERVING" -} - -// InitDBConfig initializes the db config variables for TabletServer. You must call this function before -// calling SetServingType. -func (tsv *TabletServer) InitDBConfig(target querypb.Target, dbcfgs *dbconfigs.DBConfigs) error { - tsv.mu.Lock() - defer tsv.mu.Unlock() - if tsv.state != StateNotConnected { - return vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "InitDBConfig failed, current state: %s", stateName[tsv.state]) - } - tsv.target = target - tsv.config.DB = dbcfgs - - tsv.se.InitDBConfig(tsv.config.DB.DbaWithDB()) - tsv.hw.InitDBConfig(target) - tsv.hr.InitDBConfig(target) - tsv.txThrottler.InitDBConfig(target) - tsv.vstreamer.InitDBConfig(target.Keyspace) - return nil -} - func (tsv *TabletServer) initACL(tableACLConfigFile string, enforceTableACLConfig bool) { // tabletacl.Init loads ACL from file if *tableACLConfig is not empty err := tableacl.Init( @@ -414,237 +317,6 @@ func (tsv *TabletServer) InitACL(tableACLConfigFile string, enforceTableACLConfi } } -// StartService is a convenience function for InitDBConfig->SetServingType -// with serving=true. -func (tsv *TabletServer) StartService(target querypb.Target, dbcfgs *dbconfigs.DBConfigs) (err error) { - err = tsv.InitDBConfig(target, dbcfgs) - if err != nil { - return err - } - _ /* state changed */, err = tsv.SetServingType(target.TabletType, true, nil) - return err -} - -// EnterLameduck causes tabletserver to enter the lameduck state. This -// state causes health checks to fail, but the behavior of tabletserver -// otherwise remains the same. Any subsequent calls to SetServingType will -// cause the tabletserver to exit this mode. -func (tsv *TabletServer) EnterLameduck() { - tsv.lameduck.Set(1) -} - -// ExitLameduck causes the tabletserver to exit the lameduck mode. 
-func (tsv *TabletServer) ExitLameduck() { - tsv.lameduck.Set(0) -} - -const ( - actionNone = iota - actionFullStart - actionServeNewType - actionGracefulStop -) - -// SetServingType changes the serving type of the tabletserver. It starts or -// stops internal services as deemed necessary. The tabletType determines the -// primary serving type, while alsoAllow specifies other tablet types that -// should also be honored for serving. -// Returns true if the state of QueryService or the tablet type changed. -func (tsv *TabletServer) SetServingType(tabletType topodatapb.TabletType, serving bool, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { - defer tsv.ExitLameduck() - - action, err := tsv.decideAction(tabletType, serving, alsoAllow) - if err != nil { - return false, err - } - switch action { - case actionNone: - return false, nil - case actionFullStart: - if err := tsv.fullStart(); err != nil { - tsv.closeAll() - return true, err - } - return true, nil - case actionServeNewType: - if err := tsv.serveNewType(); err != nil { - tsv.closeAll() - return true, err - } - return true, nil - case actionGracefulStop: - tsv.gracefulStop() - return true, nil - } - panic("unreachable") -} - -func (tsv *TabletServer) decideAction(tabletType topodatapb.TabletType, serving bool, alsoAllow []topodatapb.TabletType) (action int, err error) { - tsv.mu.Lock() - defer tsv.mu.Unlock() - - tsv.alsoAllow = alsoAllow - - // Handle the case where the requested TabletType and serving state - // match our current state. This avoids an unnecessary transition. - // There's no similar shortcut if serving is false, because there - // are different 'not serving' states that require different actions. - if tsv.target.TabletType == tabletType { - if serving && tsv.state == StateServing { - // We're already in the desired state. 
- return actionNone, nil - } - } - tsv.target.TabletType = tabletType - switch tsv.state { - case StateNotConnected: - if serving { - tsv.setState(StateTransitioning) - return actionFullStart, nil - } - case StateNotServing: - if serving { - tsv.setState(StateTransitioning) - return actionServeNewType, nil - } - case StateServing: - if !serving { - tsv.setState(StateShuttingDown) - return actionGracefulStop, nil - } - tsv.setState(StateTransitioning) - return actionServeNewType, nil - case StateTransitioning, StateShuttingDown: - return actionNone, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "cannot SetServingType, current state: %s", stateName[tsv.state]) - default: - panic("unreachable") - } - return actionNone, nil -} - -func (tsv *TabletServer) fullStart() (err error) { - c, err := dbconnpool.NewDBConnection(context.TODO(), tsv.config.DB.AppWithDB()) - if err != nil { - log.Errorf("error creating db app connection: %v", err) - return err - } - c.Close() - - if err := tsv.se.Open(); err != nil { - return err - } - tsv.vstreamer.Open() - if err := tsv.qe.Open(); err != nil { - return err - } - if err := tsv.txThrottler.Open(); err != nil { - return err - } - return tsv.serveNewType() -} - -func (tsv *TabletServer) serveNewType() (err error) { - if tsv.target.TabletType == topodatapb.TabletType_MASTER { - tsv.watcher.Close() - tsv.hr.Close() - - tsv.hw.Open() - tsv.tracker.Open() - tsv.te.AcceptReadWrite() - tsv.messager.Open() - } else { - tsv.messager.Close() - tsv.te.AcceptReadOnly() - tsv.tracker.Close() - tsv.hw.Close() - tsv.se.MakeNonMaster() - - tsv.hr.Open() - tsv.watcher.Open() - } - tsv.transition(StateServing) - return nil -} - -func (tsv *TabletServer) gracefulStop() { - defer close(tsv.setTimeBomb()) - tsv.waitForShutdown() - tsv.transition(StateNotServing) -} - -// StopService shuts down the tabletserver to the uninitialized state. -// It first transitions to StateShuttingDown, then waits for active -// services to shut down. 
Then it shuts down the rest. This function -// should be called before process termination, or if MySQL is unreachable. -// Under normal circumstances, SetServingType should be called. -func (tsv *TabletServer) StopService() { - defer close(tsv.setTimeBomb()) - defer tsv.LogError() - - tsv.mu.Lock() - if tsv.state != StateServing && tsv.state != StateNotServing { - tsv.mu.Unlock() - return - } - tsv.setState(StateShuttingDown) - tsv.mu.Unlock() - - log.Info("Executing complete shutdown.") - tsv.waitForShutdown() - tsv.qe.Close() - tsv.watcher.Close() - tsv.vstreamer.Close() - tsv.hr.Close() - tsv.hw.Close() - tsv.se.Close() - log.Info("Shutdown complete.") - tsv.transition(StateNotConnected) -} - -func (tsv *TabletServer) waitForShutdown() { - tsv.messager.Close() - tsv.te.Close() - tsv.txThrottler.Close() - tsv.tracker.Close() - tsv.qe.StopServing() - tsv.requests.Wait() -} - -// closeAll is called if TabletServer fails to start. -// It forcibly shuts down everything. -func (tsv *TabletServer) closeAll() { - tsv.messager.Close() - tsv.te.Close() - tsv.txThrottler.Close() - tsv.qe.StopServing() - tsv.qe.Close() - tsv.watcher.Close() - tsv.tracker.Close() - tsv.vstreamer.Close() - tsv.hr.Close() - tsv.hw.Close() - tsv.se.Close() - tsv.transition(StateNotConnected) -} - -func (tsv *TabletServer) setTimeBomb() chan struct{} { - done := make(chan struct{}) - go func() { - qt := tsv.QueryTimeout.Get() - if qt == 0 { - return - } - tmr := time.NewTimer(10 * qt) - defer tmr.Stop() - select { - case <-tmr.C: - log.Fatal("Shutdown took too long. Crashing") - case <-done: - } - }() - return done -} - // IsHealthy returns nil for non-serving types or if the query service is healthy (able to // connect to the database and serving traffic), or an error explaining // the unhealthiness otherwise. @@ -669,54 +341,6 @@ func (tsv *TabletServer) IsHealthy() error { } } -// CheckMySQL initiates a check to see if MySQL is reachable. -// If not, it shuts down the query service. 
The check is rate-limited -// to no more than once per second. -// The function satisfies tabletenv.Env. -func (tsv *TabletServer) CheckMySQL() { - if !tsv.checkMySQLThrottler.TryAcquire() { - return - } - go func() { - defer func() { - tsv.LogError() - time.Sleep(1 * time.Second) - tsv.checkMySQLThrottler.Release() - }() - if tsv.isMySQLReachable() { - return - } - log.Info("Check MySQL failed. Shutting down query service") - tsv.StopService() - }() -} - -// isMySQLReachable returns true if we can connect to MySQL. -// The function returns false only if the query service is -// in StateServing or StateNotServing. -func (tsv *TabletServer) isMySQLReachable() bool { - tsv.mu.Lock() - switch tsv.state { - case StateServing: - // Prevent transition out of this state by - // reserving a request. - tsv.requests.Add(1) - defer tsv.requests.Done() - case StateNotServing: - // Prevent transition out of this state by - // temporarily switching to StateTransitioning. - tsv.setState(StateTransitioning) - defer func() { - tsv.transition(StateNotServing) - }() - default: - tsv.mu.Unlock() - return true - } - tsv.mu.Unlock() - return tsv.qe.IsMySQLReachable() -} - // ReloadSchema reloads the schema. func (tsv *TabletServer) ReloadSchema(ctx context.Context) error { return tsv.se.Reload(ctx) @@ -1508,32 +1132,6 @@ func (tsv *TabletServer) execRequest( return nil } -// verifyTarget allows requests to be executed even in non-serving state. 
-func (tsv *TabletServer) verifyTarget(ctx context.Context, target *querypb.Target) error { - tsv.mu.Lock() - defer tsv.mu.Unlock() - - if target != nil { - // a valid target needs to be used - switch { - case target.Keyspace != tsv.target.Keyspace: - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid keyspace %v", target.Keyspace) - case target.Shard != tsv.target.Shard: - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid shard %v", target.Shard) - case target.TabletType != tsv.target.TabletType: - for _, otherType := range tsv.alsoAllow { - if target.TabletType == otherType { - return nil - } - } - return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet type: %v, want: %v or %v", target.TabletType, tsv.target.TabletType, tsv.alsoAllow) - } - } else if !tabletenv.IsLocalContext(ctx) { - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "No target") - } - return nil -} - func (tsv *TabletServer) handlePanicAndSendLogStats( sql string, bindVariables map[string]*querypb.BindVariable, @@ -1852,52 +1450,6 @@ func (tsv *TabletServer) Close(ctx context.Context) error { return nil } -// startRequest validates the current state and target and registers -// the request (a waitgroup) as started. Every startRequest requires -// one and only one corresponding endRequest. When the service shuts -// down, StopService will wait on this waitgroup to ensure that there -// are no requests in flight. 
-func (tsv *TabletServer) startRequest(ctx context.Context, target *querypb.Target, allowOnShutdown bool) (err error) { - tsv.mu.Lock() - defer tsv.mu.Unlock() - if tsv.state == StateServing { - goto verifyTarget - } - if allowOnShutdown && tsv.state == StateShuttingDown { - goto verifyTarget - } - return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "operation not allowed in state %s", stateName[tsv.state]) - -verifyTarget: - if target != nil { - // a valid target needs to be used - switch { - case target.Keyspace != tsv.target.Keyspace: - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid keyspace %v", target.Keyspace) - case target.Shard != tsv.target.Shard: - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid shard %v", target.Shard) - case target.TabletType != tsv.target.TabletType: - for _, otherType := range tsv.alsoAllow { - if target.TabletType == otherType { - goto ok - } - } - return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet type: %v, want: %v or %v", target.TabletType, tsv.target.TabletType, tsv.alsoAllow) - } - } else if !tabletenv.IsLocalContext(ctx) { - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "No target") - } - -ok: - tsv.requests.Add(1) - return nil -} - -// endRequest unregisters the current request (a waitgroup) as done. -func (tsv *TabletServer) endRequest() { - tsv.requests.Done() -} - func (tsv *TabletServer) registerDebugHealthHandler() { tsv.exporter.HandleFunc("/debug/health", func(w http.ResponseWriter, r *http.Request) { if err := acl.CheckAccessHTTP(r, acl.MONITORING); err != nil { @@ -1940,6 +1492,18 @@ func (tsv *TabletServer) registerTwopczHandler() { }) } +// SetTracking forces tracking to be on or off. +// Only to be used for testing. +func (tsv *TabletServer) SetTracking(enabled bool) { + tsv.tracker.Enable(enabled) +} + +// EnableHistorian forces historian to be on or off. +// Only to be used for testing. 
+func (tsv *TabletServer) EnableHistorian(enabled bool) { + _ = tsv.se.EnableHistorian(enabled) +} + // SetPoolSize changes the pool size to the specified value. // This function should only be used for testing. func (tsv *TabletServer) SetPoolSize(val int) { From 92905e365f9a6cd935ea81a448939404ab06a4d3 Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Sat, 27 Jun 2020 15:34:02 -0700 Subject: [PATCH 09/19] vttablet: componentize states into stateManager Signed-off-by: Sugu Sougoumarane --- .../vreplication/vplayer_flaky_test.go | 4 +- .../vttablet/tabletserver/messager/engine.go | 11 +- .../tabletserver/query_executor_test.go | 18 +- go/vt/vttablet/tabletserver/state_manager.go | 410 ++++++++++-------- go/vt/vttablet/tabletserver/status.go | 8 +- go/vt/vttablet/tabletserver/tabletserver.go | 147 ++++--- .../tabletserver/tabletserver_test.go | 96 ++-- .../vttablet/tabletserver/vstreamer/engine.go | 5 +- 8 files changed, 393 insertions(+), 306 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/vreplication/vplayer_flaky_test.go b/go/vt/vttablet/tabletmanager/vreplication/vplayer_flaky_test.go index d4977d33308..f7f79f5c8d9 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/vplayer_flaky_test.go +++ b/go/vt/vttablet/tabletmanager/vreplication/vplayer_flaky_test.go @@ -1901,9 +1901,7 @@ func TestRestartOnVStreamEnd(t *testing.T) { expectDBClientQueries(t, []string{ "/update _vt.vreplication set message='vstream ended'", }) - if err := streamerEngine.Open(); err != nil { - t.Fatal(err) - } + streamerEngine.Open() execStatements(t, []string{ "insert into t1 values(2, 'aaa')", diff --git a/go/vt/vttablet/tabletserver/messager/engine.go b/go/vt/vttablet/tabletserver/messager/engine.go index 3a84acea713..470622ed07b 100644 --- a/go/vt/vttablet/tabletserver/messager/engine.go +++ b/go/vt/vttablet/tabletserver/messager/engine.go @@ -72,14 +72,17 @@ func NewEngine(tsv TabletService, se *schema.Engine, vs VStreamer) *Engine { } // Open starts the Engine service. 
-func (me *Engine) Open() error { +func (me *Engine) Open() { + me.mu.Lock() if me.isOpen { - return nil + me.mu.Unlock() + return } - + me.mu.Unlock() + // Unlock before invoking RegisterNotifier because it + // obtains the same lock. me.se.RegisterNotifier("messages", me.schemaChanged) me.isOpen = true - return nil } // Close closes the Engine service. diff --git a/go/vt/vttablet/tabletserver/query_executor_test.go b/go/vt/vttablet/tabletserver/query_executor_test.go index 35c3527fdbf..2f297078310 100644 --- a/go/vt/vttablet/tabletserver/query_executor_test.go +++ b/go/vt/vttablet/tabletserver/query_executor_test.go @@ -262,11 +262,12 @@ func TestQueryExecutorPlans(t *testing.T) { assert.Equal(t, tcase.logWant, qre.logStats.RewrittenSQL(), tcase.input) // Test inside a transaction. - txid, alias, err := tsv.Begin(ctx, &tsv.target, nil) + target := tsv.sm.Target() + txid, alias, err := tsv.Begin(ctx, &target, nil) require.NoError(t, err) require.NotNil(t, alias, "alias should not be nil") assert.Equal(t, tsv.alias, *alias, "Wrong alias returned by Begin") - defer tsv.Commit(ctx, &tsv.target, txid) + defer tsv.Commit(ctx, &target, txid) qre = newTestQueryExecutor(ctx, tsv, tcase.input, txid) got, err = qre.Execute() @@ -328,11 +329,12 @@ func TestQueryExecutorSelectImpossible(t *testing.T) { assert.Equal(t, tcase.resultWant, got, tcase.input) assert.Equal(t, tcase.planWant, qre.logStats.PlanType, tcase.input) assert.Equal(t, tcase.logWant, qre.logStats.RewrittenSQL(), tcase.input) - txid, alias, err := tsv.Begin(ctx, &tsv.target, nil) + target := tsv.sm.Target() + txid, alias, err := tsv.Begin(ctx, &target, nil) require.NoError(t, err) require.NotNil(t, alias, "alias should not be nil") assert.Equal(t, tsv.alias, *alias, "Wrong tablet alias from Begin") - defer tsv.Commit(ctx, &tsv.target, txid) + defer tsv.Commit(ctx, &target, txid) qre = newTestQueryExecutor(ctx, tsv, tcase.input, txid) got, err = qre.Execute() @@ -435,11 +437,12 @@ func 
TestQueryExecutorLimitFailure(t *testing.T) { assert.Equal(t, tcase.logWant, qre.logStats.RewrittenSQL(), tcase.input) // Test inside a transaction. - txid, alias, err := tsv.Begin(ctx, &tsv.target, nil) + target := tsv.sm.Target() + txid, alias, err := tsv.Begin(ctx, &target, nil) require.NoError(t, err) require.NotNil(t, alias, "alias should not be nil") assert.Equal(t, tsv.alias, *alias, "Wrong tablet alias from Begin") - defer tsv.Commit(ctx, &tsv.target, txid) + defer tsv.Commit(ctx, &target, txid) qre = newTestQueryExecutor(ctx, tsv, tcase.input, txid) _, err = qre.Execute() @@ -1111,7 +1114,8 @@ func newTestTabletServer(ctx context.Context, flags executorFlags, db *fakesqldb } func newTransaction(tsv *TabletServer, options *querypb.ExecuteOptions) int64 { - transactionID, _, err := tsv.Begin(context.Background(), &tsv.target, options) + target := tsv.sm.Target() + transactionID, _, err := tsv.Begin(context.Background(), &target, options) if err != nil { panic(vterrors.Wrap(err, "failed to start a transaction")) } diff --git a/go/vt/vttablet/tabletserver/state_manager.go b/go/vt/vttablet/tabletserver/state_manager.go index 19c4f04bd8b..2c7eaaec05d 100644 --- a/go/vt/vttablet/tabletserver/state_manager.go +++ b/go/vt/vttablet/tabletserver/state_manager.go @@ -18,10 +18,12 @@ package tabletserver import ( "fmt" + "sync" "time" "golang.org/x/net/context" - "vitess.io/vitess/go/vt/dbconfigs" + "vitess.io/vitess/go/history" + "vitess.io/vitess/go/sync2" "vitess.io/vitess/go/vt/log" querypb "vitess.io/vitess/go/vt/proto/query" topodatapb "vitess.io/vitess/go/vt/proto/topodata" @@ -70,72 +72,120 @@ var stateDetail = []string{ "Shutting Down", } -// stateInfo returns a string representation of the state and optional detail -// about the reason for the state transition -func stateInfo(state int64) string { - if state == StateServing { - return "SERVING" - } - return fmt.Sprintf("%s (%s)", stateName[state], stateDetail[state]) +// stateManager manages state transition 
for all the TabletServer +// subcomponents. +type stateManager struct { + mu sync.Mutex + state int64 + lameduck sync2.AtomicInt32 + target querypb.Target + alsoAllow []topodatapb.TabletType + requests sync.WaitGroup + + se schemaEngine + hw subComponent + hr subComponent + vstreamer subComponent + tracker subComponent + watcher subComponent + qe queryEngine + txThrottler txThrottler + te txEngine + messager subComponent + + checkMySQLThrottler *sync2.Semaphore + history *history.History + timebombDuration time.Duration +} + +type schemaEngine interface { + Open() error + MakeNonMaster() + Close() +} + +type queryEngine interface { + Open() error + IsMySQLReachable() error + StopServing() + Close() +} + +type txEngine interface { + AcceptReadWrite() error + AcceptReadOnly() error + Close() +} + +type subComponent interface { + Open() + Close() +} + +type txThrottler interface { + Open() error + Close() } // EnterLameduck causes tabletserver to enter the lameduck state. This // state causes health checks to fail, but the behavior of tabletserver // otherwise remains the same. Any subsequent calls to SetServingType will // cause the tabletserver to exit this mode. -func (tsv *TabletServer) EnterLameduck() { - tsv.lameduck.Set(1) +func (sm *stateManager) EnterLameduck() { + sm.lameduck.Set(1) } // ExitLameduck causes the tabletserver to exit the lameduck mode. -func (tsv *TabletServer) ExitLameduck() { - tsv.lameduck.Set(0) +func (sm *stateManager) ExitLameduck() { + sm.lameduck.Set(0) } -// GetState returns the name of the current TabletServer state. -func (tsv *TabletServer) GetState() string { - if tsv.lameduck.Get() != 0 { +func (sm *stateManager) State() int64 { + sm.mu.Lock() + defer sm.mu.Unlock() + return sm.state +} + +func (sm *stateManager) Target() querypb.Target { + sm.mu.Lock() + defer sm.mu.Unlock() + target := sm.target + return target +} + +// StateByName returns the name of the current TabletServer state. 
+func (sm *stateManager) StateByName() string { + if sm.lameduck.Get() != 0 { return "NOT_SERVING" } - tsv.mu.Lock() - name := stateName[tsv.state] - tsv.mu.Unlock() + sm.mu.Lock() + name := stateName[sm.state] + sm.mu.Unlock() return name } // setState changes the state and logs the event. // It requires the caller to hold a lock on mu. -func (tsv *TabletServer) setState(state int64) { - log.Infof("TabletServer state: %s -> %s", stateInfo(tsv.state), stateInfo(state)) - tsv.state = state - tsv.history.Add(&historyRecord{ +func (sm *stateManager) setState(state int64) { + log.Infof("TabletServer state: %s -> %s", stateInfo(sm.state), stateInfo(state)) + sm.state = state + sm.history.Add(&historyRecord{ Time: time.Now(), ServingState: stateInfo(state), - TabletType: tsv.target.TabletType.String(), + TabletType: sm.target.TabletType.String(), }) } // transition obtains a lock and changes the state. -func (tsv *TabletServer) transition(newState int64) { - tsv.mu.Lock() - tsv.setState(newState) - tsv.mu.Unlock() +func (sm *stateManager) transition(newState int64) { + sm.mu.Lock() + sm.setState(newState) + sm.mu.Unlock() } // IsServing returns true if TabletServer is in SERVING state. -func (tsv *TabletServer) IsServing() bool { - return tsv.GetState() == "SERVING" -} - -// StartService is a convenience function for InitDBConfig->SetServingType -// with serving=true. -func (tsv *TabletServer) StartService(target querypb.Target, dbcfgs *dbconfigs.DBConfigs) (err error) { - err = tsv.InitDBConfig(target, dbcfgs) - if err != nil { - return err - } - _ /* state changed */, err = tsv.SetServingType(target.TabletType, true, nil) - return err +func (sm *stateManager) IsServing() bool { + return sm.StateByName() == "SERVING" } const ( @@ -145,15 +195,10 @@ const ( actionGracefulStop ) -// SetServingType changes the serving type of the tabletserver. It starts or -// stops internal services as deemed necessary. 
The tabletType determines the -// primary serving type, while alsoAllow specifies other tablet types that -// should also be honored for serving. -// Returns true if the state of QueryService or the tablet type changed. -func (tsv *TabletServer) SetServingType(tabletType topodatapb.TabletType, serving bool, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { - defer tsv.ExitLameduck() +func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, serving bool, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { + defer sm.ExitLameduck() - action, err := tsv.decideAction(tabletType, serving, alsoAllow) + action, err := sm.decideAction(tabletType, serving, alsoAllow) if err != nil { return false, err } @@ -161,176 +206,173 @@ func (tsv *TabletServer) SetServingType(tabletType topodatapb.TabletType, servin case actionNone: return false, nil case actionFullStart: - if err := tsv.fullStart(); err != nil { - tsv.closeAll() + if err := sm.fullStart(); err != nil { + sm.closeAll() return true, err } return true, nil case actionServeNewType: - if err := tsv.serveNewType(); err != nil { - tsv.closeAll() + if err := sm.serveNewType(); err != nil { + sm.closeAll() return true, err } return true, nil case actionGracefulStop: - tsv.gracefulStop() + sm.gracefulStop() return true, nil } panic("unreachable") } -func (tsv *TabletServer) decideAction(tabletType topodatapb.TabletType, serving bool, alsoAllow []topodatapb.TabletType) (action int, err error) { - tsv.mu.Lock() - defer tsv.mu.Unlock() +func (sm *stateManager) decideAction(tabletType topodatapb.TabletType, serving bool, alsoAllow []topodatapb.TabletType) (action int, err error) { + sm.mu.Lock() + defer sm.mu.Unlock() - tsv.alsoAllow = alsoAllow + sm.alsoAllow = alsoAllow // Handle the case where the requested TabletType and serving state // match our current state. This avoids an unnecessary transition. 
// There's no similar shortcut if serving is false, because there // are different 'not serving' states that require different actions. - if tsv.target.TabletType == tabletType { - if serving && tsv.state == StateServing { + if sm.target.TabletType == tabletType { + if serving && sm.state == StateServing { // We're already in the desired state. return actionNone, nil } } - tsv.target.TabletType = tabletType - switch tsv.state { + sm.target.TabletType = tabletType + switch sm.state { case StateNotConnected: if serving { - tsv.setState(StateTransitioning) + sm.setState(StateTransitioning) return actionFullStart, nil } case StateNotServing: if serving { - tsv.setState(StateTransitioning) + sm.setState(StateTransitioning) return actionServeNewType, nil } case StateServing: if !serving { - tsv.setState(StateShuttingDown) + sm.setState(StateShuttingDown) return actionGracefulStop, nil } - tsv.setState(StateTransitioning) + sm.setState(StateTransitioning) return actionServeNewType, nil case StateTransitioning, StateShuttingDown: - return actionNone, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "cannot SetServingType, current state: %s", stateName[tsv.state]) + return actionNone, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "cannot SetServingType, current state: %s", stateName[sm.state]) default: panic("unreachable") } return actionNone, nil } -func (tsv *TabletServer) fullStart() error { - if err := tsv.qe.IsMySQLReachable(); err != nil { +func (sm *stateManager) fullStart() error { + if err := sm.qe.IsMySQLReachable(); err != nil { return err } - if err := tsv.se.Open(); err != nil { + if err := sm.se.Open(); err != nil { return err } - tsv.vstreamer.Open() - if err := tsv.qe.Open(); err != nil { + sm.vstreamer.Open() + if err := sm.qe.Open(); err != nil { return err } - if err := tsv.txThrottler.Open(); err != nil { + if err := sm.txThrottler.Open(); err != nil { return err } - return tsv.serveNewType() + return sm.serveNewType() } -func (tsv *TabletServer) 
serveNewType() (err error) { - if tsv.target.TabletType == topodatapb.TabletType_MASTER { - tsv.watcher.Close() - tsv.hr.Close() +func (sm *stateManager) serveNewType() (err error) { + if sm.target.TabletType == topodatapb.TabletType_MASTER { + sm.watcher.Close() + sm.hr.Close() - tsv.hw.Open() - tsv.tracker.Open() - tsv.te.AcceptReadWrite() - tsv.messager.Open() + sm.hw.Open() + sm.tracker.Open() + if err := sm.te.AcceptReadWrite(); err != nil { + return err + } + sm.messager.Open() } else { - tsv.messager.Close() - tsv.te.AcceptReadOnly() - tsv.tracker.Close() - tsv.hw.Close() - tsv.se.MakeNonMaster() - - tsv.hr.Open() - tsv.watcher.Open() + sm.messager.Close() + if err := sm.te.AcceptReadOnly(); err != nil { + return err + } + sm.tracker.Close() + sm.hw.Close() + sm.se.MakeNonMaster() + + sm.hr.Open() + sm.watcher.Open() } - tsv.transition(StateServing) + sm.transition(StateServing) return nil } -func (tsv *TabletServer) gracefulStop() { - defer close(tsv.setTimeBomb()) - tsv.waitForShutdown() - tsv.transition(StateNotServing) +func (sm *stateManager) gracefulStop() { + defer close(sm.setTimeBomb()) + sm.waitForShutdown() + sm.transition(StateNotServing) } -// StopService shuts down the tabletserver to the uninitialized state. -// It first transitions to StateShuttingDown, then waits for active -// services to shut down. Then it shuts down the rest. This function -// should be called before process termination, or if MySQL is unreachable. -// Under normal circumstances, SetServingType should be called. 
-func (tsv *TabletServer) StopService() { - defer close(tsv.setTimeBomb()) - defer tsv.LogError() - - tsv.mu.Lock() - if tsv.state != StateServing && tsv.state != StateNotServing { - tsv.mu.Unlock() +func (sm *stateManager) StopService() { + defer close(sm.setTimeBomb()) + + sm.mu.Lock() + if sm.state != StateServing && sm.state != StateNotServing { + sm.mu.Unlock() return } - tsv.setState(StateShuttingDown) - tsv.mu.Unlock() + sm.setState(StateShuttingDown) + sm.mu.Unlock() log.Info("Executing complete shutdown.") - tsv.waitForShutdown() - tsv.qe.Close() - tsv.watcher.Close() - tsv.vstreamer.Close() - tsv.hr.Close() - tsv.hw.Close() - tsv.se.Close() + sm.waitForShutdown() + sm.qe.Close() + sm.watcher.Close() + sm.vstreamer.Close() + sm.hr.Close() + sm.hw.Close() + sm.se.Close() log.Info("Shutdown complete.") - tsv.transition(StateNotConnected) + sm.transition(StateNotConnected) } -func (tsv *TabletServer) waitForShutdown() { - tsv.messager.Close() - tsv.te.Close() - tsv.txThrottler.Close() - tsv.tracker.Close() - tsv.qe.StopServing() - tsv.requests.Wait() +func (sm *stateManager) waitForShutdown() { + sm.messager.Close() + sm.te.Close() + sm.txThrottler.Close() + sm.tracker.Close() + sm.qe.StopServing() + sm.requests.Wait() } // closeAll is called if TabletServer fails to start. // It forcibly shuts down everything. 
-func (tsv *TabletServer) closeAll() { - tsv.messager.Close() - tsv.te.Close() - tsv.txThrottler.Close() - tsv.qe.StopServing() - tsv.qe.Close() - tsv.watcher.Close() - tsv.tracker.Close() - tsv.vstreamer.Close() - tsv.hr.Close() - tsv.hw.Close() - tsv.se.Close() - tsv.transition(StateNotConnected) +func (sm *stateManager) closeAll() { + sm.messager.Close() + sm.te.Close() + sm.txThrottler.Close() + sm.qe.StopServing() + sm.qe.Close() + sm.watcher.Close() + sm.tracker.Close() + sm.vstreamer.Close() + sm.hr.Close() + sm.hw.Close() + sm.se.Close() + sm.transition(StateNotConnected) } -func (tsv *TabletServer) setTimeBomb() chan struct{} { +func (sm *stateManager) setTimeBomb() chan struct{} { done := make(chan struct{}) go func() { - qt := tsv.QueryTimeout.Get() - if qt == 0 { + if sm.timebombDuration == 0 { return } - tmr := time.NewTimer(10 * qt) + tmr := time.NewTimer(sm.timebombDuration) defer tmr.Stop() select { case <-tmr.C: @@ -341,52 +383,47 @@ func (tsv *TabletServer) setTimeBomb() chan struct{} { return done } -// CheckMySQL initiates a check to see if MySQL is reachable. -// If not, it shuts down the query service. The check is rate-limited -// to no more than once per second. -// The function satisfies tabletenv.Env. -func (tsv *TabletServer) CheckMySQL() { - if !tsv.checkMySQLThrottler.TryAcquire() { +func (sm *stateManager) CheckMySQL() { + if !sm.checkMySQLThrottler.TryAcquire() { return } go func() { defer func() { - tsv.LogError() time.Sleep(1 * time.Second) - tsv.checkMySQLThrottler.Release() + sm.checkMySQLThrottler.Release() }() - if tsv.isMySQLReachable() { + if sm.isMySQLReachable() { return } log.Info("Check MySQL failed. Shutting down query service") - tsv.StopService() + sm.StopService() }() } // isMySQLReachable returns true if we can connect to MySQL. // The function returns false only if the query service is // in StateServing or StateNotServing. 
-func (tsv *TabletServer) isMySQLReachable() bool { - tsv.mu.Lock() - switch tsv.state { +func (sm *stateManager) isMySQLReachable() bool { + sm.mu.Lock() + switch sm.state { case StateServing: // Prevent transition out of this state by // reserving a request. - tsv.requests.Add(1) - defer tsv.requests.Done() + sm.requests.Add(1) + defer sm.requests.Done() case StateNotServing: // Prevent transition out of this state by // temporarily switching to StateTransitioning. - tsv.setState(StateTransitioning) + sm.setState(StateTransitioning) defer func() { - tsv.transition(StateNotServing) + sm.transition(StateNotServing) }() default: - tsv.mu.Unlock() + sm.mu.Unlock() return true } - tsv.mu.Unlock() - if err := tsv.qe.IsMySQLReachable(); err != nil { + sm.mu.Unlock() + if err := sm.qe.IsMySQLReachable(); err != nil { log.Errorf("Cannot connect to MySQL: %v", err) return false } @@ -398,69 +435,78 @@ func (tsv *TabletServer) isMySQLReachable() bool { // one and only one corresponding endRequest. When the service shuts // down, StopService will wait on this waitgroup to ensure that there // are no requests in flight. 
-func (tsv *TabletServer) startRequest(ctx context.Context, target *querypb.Target, allowOnShutdown bool) (err error) { - tsv.mu.Lock() - defer tsv.mu.Unlock() - if tsv.state == StateServing { +func (sm *stateManager) startRequest(ctx context.Context, target *querypb.Target, allowOnShutdown bool) (err error) { + sm.mu.Lock() + defer sm.mu.Unlock() + if sm.state == StateServing { goto verifyTarget } - if allowOnShutdown && tsv.state == StateShuttingDown { + if allowOnShutdown && sm.state == StateShuttingDown { goto verifyTarget } - return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "operation not allowed in state %s", stateName[tsv.state]) + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "operation not allowed in state %s", stateName[sm.state]) verifyTarget: if target != nil { // a valid target needs to be used switch { - case target.Keyspace != tsv.target.Keyspace: + case target.Keyspace != sm.target.Keyspace: return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid keyspace %v", target.Keyspace) - case target.Shard != tsv.target.Shard: + case target.Shard != sm.target.Shard: return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid shard %v", target.Shard) - case target.TabletType != tsv.target.TabletType: - for _, otherType := range tsv.alsoAllow { + case target.TabletType != sm.target.TabletType: + for _, otherType := range sm.alsoAllow { if target.TabletType == otherType { goto ok } } - return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet type: %v, want: %v or %v", target.TabletType, tsv.target.TabletType, tsv.alsoAllow) + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet type: %v, want: %v or %v", target.TabletType, sm.target.TabletType, sm.alsoAllow) } } else if !tabletenv.IsLocalContext(ctx) { return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "No target") } ok: - tsv.requests.Add(1) + sm.requests.Add(1) return nil } // endRequest unregisters the current request (a waitgroup) as done. 
-func (tsv *TabletServer) endRequest() { - tsv.requests.Done() +func (sm *stateManager) endRequest() { + sm.requests.Done() } // verifyTarget allows requests to be executed even in non-serving state. -func (tsv *TabletServer) verifyTarget(ctx context.Context, target *querypb.Target) error { - tsv.mu.Lock() - defer tsv.mu.Unlock() +func (sm *stateManager) verifyTarget(ctx context.Context, target *querypb.Target) error { + sm.mu.Lock() + defer sm.mu.Unlock() if target != nil { // a valid target needs to be used switch { - case target.Keyspace != tsv.target.Keyspace: + case target.Keyspace != sm.target.Keyspace: return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid keyspace %v", target.Keyspace) - case target.Shard != tsv.target.Shard: + case target.Shard != sm.target.Shard: return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid shard %v", target.Shard) - case target.TabletType != tsv.target.TabletType: - for _, otherType := range tsv.alsoAllow { + case target.TabletType != sm.target.TabletType: + for _, otherType := range sm.alsoAllow { if target.TabletType == otherType { return nil } } - return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet type: %v, want: %v or %v", target.TabletType, tsv.target.TabletType, tsv.alsoAllow) + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet type: %v, want: %v or %v", target.TabletType, sm.target.TabletType, sm.alsoAllow) } } else if !tabletenv.IsLocalContext(ctx) { return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "No target") } return nil } + +// stateInfo returns a string representation of the state and optional detail +// about the reason for the state transition +func stateInfo(state int64) string { + if state == StateServing { + return "SERVING" + } + return fmt.Sprintf("%s (%s)", stateName[state], stateDetail[state]) +} diff --git a/go/vt/vttablet/tabletserver/status.go b/go/vt/vttablet/tabletserver/status.go index aed2c7de858..2e9cfec53e6 100644 --- 
a/go/vt/vttablet/tabletserver/status.go +++ b/go/vt/vttablet/tabletserver/status.go @@ -182,12 +182,10 @@ type queryserviceStatus struct { // AddStatusHeader registers a standlone header for the status page. func (tsv *TabletServer) AddStatusHeader() { tsv.exporter.AddStatusPart("Tablet Server", headerTemplate, func() interface{} { - tsv.mu.Lock() - defer tsv.mu.Unlock() return map[string]interface{}{ "Alias": tsv.exporter.Name(), "Prefix": tsv.exporter.URLPrefix(), - "Target": tsv.target, + "Target": tsv.sm.Target(), } }) } @@ -196,8 +194,8 @@ func (tsv *TabletServer) AddStatusHeader() { func (tsv *TabletServer) AddStatusPart() { tsv.exporter.AddStatusPart("Queryservice", queryserviceStatusTemplate, func() interface{} { status := queryserviceStatus{ - State: tsv.GetState(), - History: tsv.history.Records(), + State: tsv.sm.StateByName(), + History: tsv.sm.history.Records(), } rates := tsv.stats.QPSRates.Get() if qps, ok := rates["All"]; ok && len(qps) > 0 { diff --git a/go/vt/vttablet/tabletserver/tabletserver.go b/go/vt/vttablet/tabletserver/tabletserver.go index 71b871ebbc0..2d74d084ea3 100644 --- a/go/vt/vttablet/tabletserver/tabletserver.go +++ b/go/vt/vttablet/tabletserver/tabletserver.go @@ -91,23 +91,6 @@ type TabletServer struct { TerseErrors bool enableHotRowProtection bool topoServer *topo.Server - checkMySQLThrottler *sync2.Semaphore - - // mu is used to access state. The lock should only be held - // for short periods. For longer periods, you have to transition - // the state to a transient value and release the lock. - // Once the operation is complete, you can then transition - // the state back to a stable value. - // The lameduck mode causes tablet server to respond as unhealthy - // for health checks. This does not affect how queries are served. - // target specifies the primary target type, and also allow specifies - // secondary types that should be additionally allowed. 
- mu sync.Mutex - state int64 - lameduck sync2.AtomicInt32 - target querypb.Target - alsoAllow []topodatapb.TabletType - requests sync.WaitGroup // These are sub-components of TabletServer. se *schema.Engine @@ -121,6 +104,9 @@ type TabletServer struct { te *TxEngine messager *messager.Engine + // sm manages state transitions. + sm *stateManager + // streamHealthMutex protects all the following fields streamHealthMutex sync.Mutex streamHealthIndex int @@ -128,10 +114,6 @@ type TabletServer struct { lastStreamHealthResponse *querypb.StreamHealthResponse lastStreamHealthExpiration time.Time - // history records changes in state for display on the status page. - // It has its own internal mutex. - history *history.History - // alias is used for identifying this tabletserver in healthcheck responses. alias topodatapb.TabletAlias } @@ -165,9 +147,7 @@ func NewTabletServer(name string, config *tabletenv.TabletConfig, topoServer *to TerseErrors: config.TerseErrors, enableHotRowProtection: config.HotRowProtection.Mode != tabletenv.Disable, topoServer: topoServer, - checkMySQLThrottler: sync2.NewSemaphore(1, 0), streamHealthMap: make(map[int]chan<- *querypb.StreamHealthResponse), - history: history.New(10), alias: alias, } @@ -188,17 +168,30 @@ func NewTabletServer(name string, config *tabletenv.TabletConfig, topoServer *to tsv.te = NewTxEngine(tsv) tsv.messager = messager.NewEngine(tsv, tsv.se, tsv.vstreamer) - tsv.exporter.NewGaugeFunc("TabletState", "Tablet server state", func() int64 { - tsv.mu.Lock() - defer tsv.mu.Unlock() - return tsv.state - }) - tsv.exporter.Publish("TabletStateName", stats.StringFunc(tsv.GetState)) + tsv.sm = &stateManager{ + se: tsv.se, + hw: tsv.hw, + hr: tsv.hr, + vstreamer: tsv.vstreamer, + tracker: tsv.tracker, + watcher: tsv.watcher, + qe: tsv.qe, + txThrottler: tsv.txThrottler, + te: tsv.te, + messager: tsv.messager, + + checkMySQLThrottler: sync2.NewSemaphore(1, 0), + history: history.New(10), + timebombDuration: 
time.Duration(config.OltpReadPool.TimeoutSeconds * 10), + } + + tsv.exporter.NewGaugeFunc("TabletState", "Tablet server state", tsv.sm.State) + tsv.exporter.Publish("TabletStateName", stats.StringFunc(tsv.sm.StateByName)) // TabletServerState exports the same information as the above two stats (TabletState / TabletStateName), // but exported with TabletStateName as a label for Prometheus, which doesn't support exporting strings as stat values. tsv.exporter.NewGaugesFuncWithMultiLabels("TabletServerState", "Tablet server state labeled by state name", []string{"name"}, func() map[string]int64 { - return map[string]int64{tsv.GetState(): 1} + return map[string]int64{tsv.sm.StateByName(): 1} }) tsv.exporter.NewGaugeDurationFunc("QueryTimeout", "Tablet server query timeout", tsv.QueryTimeout.Get) @@ -209,15 +202,13 @@ func NewTabletServer(name string, config *tabletenv.TabletConfig, topoServer *to return tsv } -// InitDBConfig initializes the db config variables for TabletServer. You must call this function before -// calling SetServingType. +// InitDBConfig initializes the db config variables for TabletServer. You must call this function +// to complete the creation of TabletServer. func (tsv *TabletServer) InitDBConfig(target querypb.Target, dbcfgs *dbconfigs.DBConfigs) error { - tsv.mu.Lock() - defer tsv.mu.Unlock() - if tsv.state != StateNotConnected { - return vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "InitDBConfig failed, current state: %s", stateName[tsv.state]) + if tsv.sm.State() != StateNotConnected { + return vterrors.Errorf(vtrpcpb.Code_UNKNOWN, "InitDBConfig failed, current state: %s", tsv.sm.StateByName()) } - tsv.target = target + tsv.sm.target = target tsv.config.DB = dbcfgs tsv.se.InitDBConfig(tsv.config.DB.DbaWithDB()) @@ -317,14 +308,40 @@ func (tsv *TabletServer) InitACL(tableACLConfigFile string, enforceTableACLConfi } } +// SetServingType changes the serving type of the tabletserver. It starts or +// stops internal services as deemed necessary. 
The tabletType determines the +// primary serving type, while alsoAllow specifies other tablet types that +// should also be honored for serving. +// Returns true if the state of QueryService or the tablet type changed. +func (tsv *TabletServer) SetServingType(tabletType topodatapb.TabletType, serving bool, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { + return tsv.sm.SetServingType(tabletType, serving, alsoAllow) +} + +// StartService is a convenience function for InitDBConfig->SetServingType +// with serving=true. +func (tsv *TabletServer) StartService(target querypb.Target, dbcfgs *dbconfigs.DBConfigs) (err error) { + err = tsv.InitDBConfig(target, dbcfgs) + if err != nil { + return err + } + _ /* state changed */, err = tsv.sm.SetServingType(target.TabletType, true, nil) + return err +} + +// StopService shuts down the tabletserver to the uninitialized state. +// It first transitions to StateShuttingDown, then waits for active +// services to shut down. Then it shuts down the rest. This function +// should be called before process termination, or if MySQL is unreachable. +// Under normal circumstances, SetServingType should be called. +func (tsv *TabletServer) StopService() { + tsv.sm.StopService() +} + // IsHealthy returns nil for non-serving types or if the query service is healthy (able to // connect to the database and serving traffic), or an error explaining // the unhealthiness otherwise. func (tsv *TabletServer) IsHealthy() error { - tsv.mu.Lock() - tabletType := tsv.target.TabletType - tsv.mu.Unlock() - switch tabletType { + switch tsv.sm.Target().TabletType { case topodatapb.TabletType_MASTER, topodatapb.TabletType_REPLICA, topodatapb.TabletType_BATCH, topodatapb.TabletType_EXPERIMENTAL: _, err := tsv.Execute( tabletenv.LocalContext(), @@ -717,10 +734,10 @@ func (tsv *TabletServer) ExecuteBatch(ctx context.Context, target *querypb.Targe // tsv.convertAndLogError. That's because the methods which returned "err", // e.g. 
tsv.Execute(), already called that function and therefore already // converted and logged the error. - if err = tsv.startRequest(ctx, target, allowOnShutdown); err != nil { + if err = tsv.sm.startRequest(ctx, target, allowOnShutdown); err != nil { return nil, err } - defer tsv.endRequest() + defer tsv.sm.endRequest() defer tsv.handlePanicAndSendLogStats("batch", nil, nil) if options == nil { @@ -941,10 +958,10 @@ func (tsv *TabletServer) PurgeMessages(ctx context.Context, target *querypb.Targ } func (tsv *TabletServer) execDML(ctx context.Context, target *querypb.Target, queryGenerator func() (string, map[string]*querypb.BindVariable, error)) (count int64, err error) { - if err = tsv.startRequest(ctx, target, false /* allowOnShutdown */); err != nil { + if err = tsv.sm.startRequest(ctx, target, false /* allowOnShutdown */); err != nil { return 0, err } - defer tsv.endRequest() + defer tsv.sm.endRequest() defer tsv.handlePanicAndSendLogStats("ack", nil, nil) query, bv, err := queryGenerator() @@ -977,7 +994,7 @@ func (tsv *TabletServer) execDML(ctx context.Context, target *querypb.Target, qu // VStream streams VReplication events. func (tsv *TabletServer) VStream(ctx context.Context, target *querypb.Target, startPos string, tablePKs []*binlogdatapb.TableLastPK, filter *binlogdatapb.Filter, send func([]*binlogdatapb.VEvent) error) error { - if err := tsv.verifyTarget(ctx, target); err != nil { + if err := tsv.sm.verifyTarget(ctx, target); err != nil { return err } return tsv.vstreamer.Stream(ctx, startPos, tablePKs, filter, send) @@ -985,7 +1002,7 @@ func (tsv *TabletServer) VStream(ctx context.Context, target *querypb.Target, st // VStreamRows streams rows from the specified starting point. 
func (tsv *TabletServer) VStreamRows(ctx context.Context, target *querypb.Target, query string, lastpk *querypb.QueryResult, send func(*binlogdatapb.VStreamRowsResponse) error) error { - if err := tsv.verifyTarget(ctx, target); err != nil { + if err := tsv.sm.verifyTarget(ctx, target); err != nil { return err } var row []sqltypes.Value @@ -1001,7 +1018,7 @@ func (tsv *TabletServer) VStreamRows(ctx context.Context, target *querypb.Target // VStreamResults streams rows from the specified starting point. func (tsv *TabletServer) VStreamResults(ctx context.Context, target *querypb.Target, query string, send func(*binlogdatapb.VStreamResultsResponse) error) error { - if err := tsv.verifyTarget(ctx, target); err != nil { + if err := tsv.sm.verifyTarget(ctx, target); err != nil { return err } return tsv.vstreamer.StreamResults(ctx, query, send) @@ -1115,14 +1132,14 @@ func (tsv *TabletServer) execRequest( logStats.OriginalSQL = sql logStats.BindVariables = bindVariables defer tsv.handlePanicAndSendLogStats(sql, bindVariables, logStats) - if err = tsv.startRequest(ctx, target, allowOnShutdown); err != nil { + if err = tsv.sm.startRequest(ctx, target, allowOnShutdown); err != nil { return err } ctx, cancel := withTimeout(ctx, timeout, options) defer func() { cancel() - tsv.endRequest() + tsv.sm.endRequest() }() err = exec(ctx, logStats) @@ -1394,9 +1411,7 @@ func (tsv *TabletServer) streamHealthUnregister(id int) { // BroadcastHealth will broadcast the current health to all listeners func (tsv *TabletServer) BroadcastHealth(terTimestamp int64, stats *querypb.RealtimeStats, maxCache time.Duration) { - tsv.mu.Lock() - target := tsv.target - tsv.mu.Unlock() + target := tsv.sm.Target() shr := &querypb.StreamHealthResponse{ Target: &target, TabletAlias: &tsv.alias, @@ -1433,6 +1448,32 @@ func (tsv *TabletServer) HeartbeatLag() (time.Duration, error) { return tsv.hr.GetLatest() } +// EnterLameduck causes tabletserver to enter the lameduck state. 
This +// state causes health checks to fail, but the behavior of tabletserver +// otherwise remains the same. Any subsequent calls to SetServingType will +// cause the tabletserver to exit this mode. +func (tsv *TabletServer) EnterLameduck() { + tsv.sm.EnterLameduck() +} + +// ExitLameduck causes the tabletserver to exit the lameduck mode. +func (tsv *TabletServer) ExitLameduck() { + tsv.sm.ExitLameduck() +} + +// IsServing returns true if TabletServer is in SERVING state. +func (tsv *TabletServer) IsServing() bool { + return tsv.sm.IsServing() +} + +// CheckMySQL initiates a check to see if MySQL is reachable. +// If not, it shuts down the query service. The check is rate-limited +// to no more than once per second. +// The function satisfies tabletenv.Env. +func (tsv *TabletServer) CheckMySQL() { + tsv.sm.CheckMySQL() +} + // TopoServer returns the topo server. func (tsv *TabletServer) TopoServer() *topo.Server { return tsv.topoServer diff --git a/go/vt/vttablet/tabletserver/tabletserver_test.go b/go/vt/vttablet/tabletserver/tabletserver_test.go index 6241e1d9b87..17c0ffaa6cf 100644 --- a/go/vt/vttablet/tabletserver/tabletserver_test.go +++ b/go/vt/vttablet/tabletserver/tabletserver_test.go @@ -75,11 +75,11 @@ func TestTabletServerGetState(t *testing.T) { config := tabletenv.NewDefaultConfig() tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) for i, state := range states { - tsv.setState(state) - require.Equal(t, names[i], tsv.GetState(), "GetState") + tsv.sm.setState(state) + require.Equal(t, names[i], tsv.sm.StateByName(), "StateByName") } tsv.EnterLameduck() - require.Equal(t, "NOT_SERVING", tsv.GetState(), "GetState") + require.Equal(t, "NOT_SERVING", tsv.sm.StateByName(), "StateByName") } func TestTabletServerAllowQueriesFailBadConn(t *testing.T) { @@ -103,14 +103,14 @@ func TestTabletServerAllowQueries(t *testing.T) { tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), 
topodatapb.TabletAlias{}) checkTabletServerState(t, tsv, StateNotConnected) dbcfgs := newDBConfigs(db) - tsv.setState(StateServing) + tsv.sm.setState(StateServing) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} err := tsv.StartService(target, dbcfgs) tsv.StopService() want := "InitDBConfig failed" require.Error(t, err) assert.Contains(t, err.Error(), want) - tsv.setState(StateShuttingDown) + tsv.sm.setState(StateShuttingDown) err = tsv.StartService(target, dbcfgs) require.Error(t, err, "TabletServer.StartService should fail") tsv.StopService() @@ -121,14 +121,14 @@ func TestTabletServerInitDBConfig(t *testing.T) { defer db.Close() config := tabletenv.NewDefaultConfig() tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - tsv.setState(StateServing) + tsv.sm.setState(StateServing) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} dbcfgs := newDBConfigs(db) err := tsv.InitDBConfig(target, dbcfgs) want := "InitDBConfig failed" require.Error(t, err) assert.Contains(t, err.Error(), want) - tsv.setState(StateNotConnected) + tsv.sm.setState(StateNotConnected) err = tsv.InitDBConfig(target, dbcfgs) require.NoError(t, err) } @@ -143,79 +143,79 @@ func TestDecideAction(t *testing.T) { err := tsv.InitDBConfig(target, dbcfgs) require.NoError(t, err) - tsv.setState(StateNotConnected) - action, err := tsv.decideAction(topodatapb.TabletType_MASTER, false, nil) + tsv.sm.setState(StateNotConnected) + action, err := tsv.sm.decideAction(topodatapb.TabletType_MASTER, false, nil) require.NoError(t, err) if action != actionNone { t.Errorf("decideAction: %v, want %v", action, actionNone) } - tsv.setState(StateNotConnected) - action, err = tsv.decideAction(topodatapb.TabletType_MASTER, true, nil) + tsv.sm.setState(StateNotConnected) + action, err = tsv.sm.decideAction(topodatapb.TabletType_MASTER, true, nil) require.NoError(t, err) if action != actionFullStart { t.Errorf("decideAction: %v, want %v", 
action, actionFullStart) } - if tsv.state != StateTransitioning { - t.Errorf("tsv.state: %v, want %v", tsv.state, StateTransitioning) + if tsv.sm.State() != StateTransitioning { + t.Errorf("tsv.sm.state: %v, want %v", tsv.sm.State(), StateTransitioning) } - tsv.setState(StateNotServing) - action, err = tsv.decideAction(topodatapb.TabletType_MASTER, false, nil) + tsv.sm.setState(StateNotServing) + action, err = tsv.sm.decideAction(topodatapb.TabletType_MASTER, false, nil) require.NoError(t, err) if action != actionNone { t.Errorf("decideAction: %v, want %v", action, actionNone) } - tsv.setState(StateNotServing) - action, err = tsv.decideAction(topodatapb.TabletType_MASTER, true, nil) + tsv.sm.setState(StateNotServing) + action, err = tsv.sm.decideAction(topodatapb.TabletType_MASTER, true, nil) require.NoError(t, err) if action != actionServeNewType { t.Errorf("decideAction: %v, want %v", action, actionServeNewType) } - if tsv.state != StateTransitioning { - t.Errorf("tsv.state: %v, want %v", tsv.state, StateTransitioning) + if tsv.sm.State() != StateTransitioning { + t.Errorf("tsv.sm.state: %v, want %v", tsv.sm.State(), StateTransitioning) } - tsv.setState(StateServing) - action, err = tsv.decideAction(topodatapb.TabletType_MASTER, false, nil) + tsv.sm.setState(StateServing) + action, err = tsv.sm.decideAction(topodatapb.TabletType_MASTER, false, nil) require.NoError(t, err) if action != actionGracefulStop { t.Errorf("decideAction: %v, want %v", action, actionGracefulStop) } - if tsv.state != StateShuttingDown { - t.Errorf("tsv.state: %v, want %v", tsv.state, StateShuttingDown) + if tsv.sm.State() != StateShuttingDown { + t.Errorf("tsv.sm.state: %v, want %v", tsv.sm.State(), StateShuttingDown) } - tsv.setState(StateServing) - action, err = tsv.decideAction(topodatapb.TabletType_REPLICA, true, nil) + tsv.sm.setState(StateServing) + action, err = tsv.sm.decideAction(topodatapb.TabletType_REPLICA, true, nil) require.NoError(t, err) if action != actionServeNewType { 
t.Errorf("decideAction: %v, want %v", action, actionServeNewType) } - if tsv.state != StateTransitioning { - t.Errorf("tsv.state: %v, want %v", tsv.state, StateTransitioning) + if tsv.sm.State() != StateTransitioning { + t.Errorf("tsv.sm.state: %v, want %v", tsv.sm.State(), StateTransitioning) } - tsv.target.TabletType = topodatapb.TabletType_MASTER + tsv.sm.target.TabletType = topodatapb.TabletType_MASTER - tsv.setState(StateServing) - action, err = tsv.decideAction(topodatapb.TabletType_MASTER, true, nil) + tsv.sm.setState(StateServing) + action, err = tsv.sm.decideAction(topodatapb.TabletType_MASTER, true, nil) require.NoError(t, err) if action != actionNone { t.Errorf("decideAction: %v, want %v", action, actionNone) } - if tsv.state != StateServing { - t.Errorf("tsv.state: %v, want %v", tsv.state, StateServing) + if tsv.sm.State() != StateServing { + t.Errorf("tsv.sm.state: %v, want %v", tsv.sm.State(), StateServing) } - tsv.setState(StateTransitioning) - _, err = tsv.decideAction(topodatapb.TabletType_MASTER, false, nil) + tsv.sm.setState(StateTransitioning) + _, err = tsv.sm.decideAction(topodatapb.TabletType_MASTER, false, nil) want := "cannot SetServingType" require.Error(t, err) assert.Contains(t, err.Error(), want) - tsv.setState(StateShuttingDown) - _, err = tsv.decideAction(topodatapb.TabletType_MASTER, false, nil) + tsv.sm.setState(StateShuttingDown) + _, err = tsv.sm.decideAction(topodatapb.TabletType_MASTER, false, nil) want = "cannot SetServingType" require.Error(t, err) assert.Contains(t, err.Error(), want) @@ -261,8 +261,8 @@ func TestSetServingType(t *testing.T) { // Verify that we exit lameduck when SetServingType is called. 
tsv.EnterLameduck() - if stateName := tsv.GetState(); stateName != "NOT_SERVING" { - t.Errorf("GetState: %s, want NOT_SERVING", stateName) + if stateName := tsv.sm.StateByName(); stateName != "NOT_SERVING" { + t.Errorf("StateByName: %s, want NOT_SERVING", stateName) } stateChanged, err = tsv.SetServingType(topodatapb.TabletType_REPLICA, true, nil) if stateChanged != true { @@ -270,8 +270,8 @@ func TestSetServingType(t *testing.T) { } require.NoError(t, err) checkTabletServerState(t, tsv, StateServing) - if stateName := tsv.GetState(); stateName != "SERVING" { - t.Errorf("GetState: %s, want SERVING", stateName) + if stateName := tsv.sm.StateByName(); stateName != "SERVING" { + t.Errorf("StateByName: %s, want SERVING", stateName) } tsv.StopService() @@ -310,7 +310,7 @@ func TestTabletServerCheckMysql(t *testing.T) { err := tsv.StartService(target, dbcfgs) defer tsv.StopService() require.NoError(t, err) - if !tsv.isMySQLReachable() { + if !tsv.sm.isMySQLReachable() { t.Error("isMySQLReachable should return true") } stateChanged, err := tsv.SetServingType(topodatapb.TabletType_SPARE, false, nil) @@ -318,7 +318,7 @@ func TestTabletServerCheckMysql(t *testing.T) { if stateChanged != true { t.Errorf("SetServingType() should have changed the QueryService state, but did not") } - if !tsv.isMySQLReachable() { + if !tsv.sm.isMySQLReachable() { t.Error("isMySQLReachable should return true") } checkTabletServerState(t, tsv, StateNotServing) @@ -338,8 +338,8 @@ func TestTabletServerReconnect(t *testing.T) { err := tsv.StartService(target, dbcfgs) defer tsv.StopService() - if tsv.GetState() != "SERVING" { - t.Errorf("GetState: %s, must be SERVING", tsv.GetState()) + if tsv.sm.StateByName() != "SERVING" { + t.Errorf("StateByName: %s, must be SERVING", tsv.sm.StateByName()) } if err != nil { t.Fatalf("TabletServer.StartService should success but get error: %v", err) @@ -354,8 +354,8 @@ func TestTabletServerReconnect(t *testing.T) { t.Error("Execute: want error, got nil") } 
time.Sleep(50 * time.Millisecond) - if tsv.GetState() == "SERVING" { - t.Error("GetState is still SERVING, must be NOT_SERVING") + if tsv.sm.StateByName() == "SERVING" { + t.Error("StateByName is still SERVING, must be NOT_SERVING") } // make mysql conn work @@ -2731,9 +2731,7 @@ func setUpTabletServerTest(t *testing.T) *fakesqldb.DB { } func checkTabletServerState(t *testing.T, tsv *TabletServer, expectState int64) { - tsv.mu.Lock() - state := tsv.state - tsv.mu.Unlock() + state := tsv.sm.State() if state != expectState { t.Fatalf("TabletServer should in state: %d, but get state: %d", expectState, state) } diff --git a/go/vt/vttablet/tabletserver/vstreamer/engine.go b/go/vt/vttablet/tabletserver/vstreamer/engine.go index 2fb209d3b7a..6cf1eeb7391 100644 --- a/go/vt/vttablet/tabletserver/vstreamer/engine.go +++ b/go/vt/vttablet/tabletserver/vstreamer/engine.go @@ -100,15 +100,14 @@ func (vse *Engine) InitDBConfig(keyspace string) { } // Open starts the Engine service. -func (vse *Engine) Open() error { +func (vse *Engine) Open() { vse.mu.Lock() defer vse.mu.Unlock() if vse.isOpen { - return nil + return } log.Info("VStreamer is open.") vse.isOpen = true - return nil } // IsOpen checks if the engine is opened From 35f65b0a5326882b5d1bd62dcabe11729ba369da Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Sun, 28 Jun 2020 12:31:35 -0700 Subject: [PATCH 10/19] vttablet: implement new state transitions Tests still need fixing Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/tabletserver/query_engine.go | 13 +- go/vt/vttablet/tabletserver/state_manager.go | 599 +++++++++--------- go/vt/vttablet/tabletserver/tabletserver.go | 75 +-- .../tabletserver/tabletserver_test.go | 4 - .../tabletserver/txthrottler/tx_throttler.go | 2 +- 5 files changed, 359 insertions(+), 334 deletions(-) diff --git a/go/vt/vttablet/tabletserver/query_engine.go b/go/vt/vttablet/tabletserver/query_engine.go index 7bbec2e9ede..1ce97b70b87 100644 --- 
a/go/vt/vttablet/tabletserver/query_engine.go +++ b/go/vt/vttablet/tabletserver/query_engine.go @@ -112,8 +112,9 @@ func (ep *TabletPlan) buildAuthorized() { // Close: There should be no more pending queries when this // function is called. type QueryEngine struct { - env tabletenv.Env - se *schema.Engine + isOpen bool + env tabletenv.Env + se *schema.Engine // mu protects the following fields. mu sync.RWMutex @@ -235,6 +236,9 @@ func NewQueryEngine(env tabletenv.Env, se *schema.Engine) *QueryEngine { // Open must be called before sending requests to QueryEngine. func (qe *QueryEngine) Open() error { + if qe.isOpen { + return nil + } qe.conns.Open(qe.env.Config().DB.AppWithDB(), qe.env.Config().DB.DbaWithDB(), qe.env.Config().DB.AppDebugWithDB()) conn, err := qe.conns.Get(tabletenv.LocalContext()) @@ -254,6 +258,7 @@ func (qe *QueryEngine) Open() error { qe.streamConns.Open(qe.env.Config().DB.AppWithDB(), qe.env.Config().DB.DbaWithDB(), qe.env.Config().DB.AppDebugWithDB()) qe.se.RegisterNotifier("qe", qe.schemaChanged) + qe.isOpen = true return nil } @@ -267,12 +272,16 @@ func (qe *QueryEngine) StopServing() { // You must ensure that no more queries will be sent // before calling Close. func (qe *QueryEngine) Close() { + if !qe.isOpen { + return + } // Close in reverse order of Open. qe.se.UnregisterNotifier("qe") qe.plans.Clear() qe.tables = make(map[string]*schema.Table) qe.streamConns.Close() qe.conns.Close() + qe.isOpen = false } // GetPlan returns the TabletPlan that for the query. Plans are cached in a cache.LRUCache. diff --git a/go/vt/vttablet/tabletserver/state_manager.go b/go/vt/vttablet/tabletserver/state_manager.go index 2c7eaaec05d..0ca20a19e71 100644 --- a/go/vt/vttablet/tabletserver/state_manager.go +++ b/go/vt/vttablet/tabletserver/state_manager.go @@ -41,15 +41,6 @@ const ( StateNotServing // StateServing is where queries are allowed. 
StateServing - // StateTransitioning is a transient state indicating that - // the tabletserver is tranisitioning to a new state. - // In order to achieve clean transitions, no requests are - // allowed during this state. - StateTransitioning - // StateShuttingDown indicates that the tabletserver - // is shutting down. In this state, we wait for outstanding - // requests and transactions to conclude. - StateShuttingDown ) // stateName names every state. The number of elements must @@ -58,8 +49,6 @@ var stateName = []string{ "NOT_SERVING", "NOT_SERVING", "SERVING", - "NOT_SERVING", - "SHUTTING_DOWN", } // stateDetail matches every state and optionally more information about the reason @@ -68,19 +57,23 @@ var stateDetail = []string{ "Not Connected", "Not Serving", "", - "Transitioning", - "Shutting Down", } // stateManager manages state transition for all the TabletServer // subcomponents. type stateManager struct { - mu sync.Mutex - state int64 - lameduck sync2.AtomicInt32 - target querypb.Target + mu sync.Mutex + wantState int64 + wantTabletType topodatapb.TabletType + state int64 + target querypb.Target + transitioning bool + connecting bool + // TODO(sougou): deprecate alsoAllow alsoAllow []topodatapb.TabletType - requests sync.WaitGroup + + requests sync.WaitGroup + lameduck sync2.AtomicInt32 se schemaEngine hw subComponent @@ -127,235 +120,319 @@ type txThrottler interface { Close() } -// EnterLameduck causes tabletserver to enter the lameduck state. This -// state causes health checks to fail, but the behavior of tabletserver -// otherwise remains the same. Any subsequent calls to SetServingType will -// cause the tabletserver to exit this mode. 
-func (sm *stateManager) EnterLameduck() { - sm.lameduck.Set(1) +const ( + actionNone = iota + actionFullStart + actionServeNewType + actionGracefulStop +) + +func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, state int64, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { + // TODO(sougou): deprecate the waits after tabletmanager has been refactored. + startTime := time.Now() + stateChanged = sm.setDesiredState(tabletType, state, alsoAllow) + for { + curState, curTabletType := sm.State(), sm.Target().TabletType + if curState == StateNotConnected { + return stateChanged, vterrors.Errorf(vtrpcpb.Code_UNAVAILABLE, "MySQL is unavailable") + } + if curState == state && curTabletType == tabletType { + return stateChanged, nil + } + time.Sleep(10 * time.Millisecond) + if time.Since(startTime) > 1*time.Second { + return stateChanged, vterrors.Errorf(vtrpcpb.Code_DEADLINE_EXCEEDED, "State transition deadline exceeded") + } + } } -// ExitLameduck causes the tabletserver to exit the lameduck mode. -func (sm *stateManager) ExitLameduck() { - sm.lameduck.Set(0) +func (sm *stateManager) CheckMySQL() { + if !sm.checkMySQLThrottler.TryAcquire() { + return + } + go func() { + defer func() { + time.Sleep(1 * time.Second) + sm.checkMySQLThrottler.Release() + }() + + err := sm.qe.IsMySQLReachable() + if err == nil { + return + } + + log.Errorf("Cannot connect to MySQL, shutting down query service: %v", err) + sm.mu.Lock() + // If we're already transitioning, don't interfere. + if sm.transitioning { + sm.mu.Unlock() + return + } + // Setting this flag will ensure that no one else will + // invoke sm.executeTransition while we sleep. + sm.transitioning = true + sm.mu.Unlock() + + // This code path emulates the error case at the end of the loop + // of executeTransition where it waits for 1s and retries. 
+ sm.closeAll() + time.Sleep(1 * time.Second) + go sm.executeTransition() + }() } -func (sm *stateManager) State() int64 { - sm.mu.Lock() - defer sm.mu.Unlock() - return sm.state +func (sm *stateManager) StopService() { + defer close(sm.setTimeBomb()) + + sm.SetServingType(sm.Target().TabletType, StateNotConnected, nil) + for { + if sm.State() == StateNotConnected { + return + } + time.Sleep(10 * time.Millisecond) + } } -func (sm *stateManager) Target() querypb.Target { +// StartRequest validates the current state and target and registers +// the request (a waitgroup) as started. Every StartRequest must be +// ended with an EndRequest. +func (sm *stateManager) StartRequest(ctx context.Context, target *querypb.Target, allowOnTransition bool) (err error) { sm.mu.Lock() defer sm.mu.Unlock() - target := sm.target - return target -} -// StateByName returns the name of the current TabletServer state. -func (sm *stateManager) StateByName() string { - if sm.lameduck.Get() != 0 { - return "NOT_SERVING" + // All the checks below must pass. 
+ switch { + case sm.state != StateServing: + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "operation not allowed in state %s", stateName[sm.state]) + case sm.transitioning && !allowOnTransition: + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "operation not allowed in state %s", stateName[sm.state]) + case target == nil && !tabletenv.IsLocalContext(ctx): + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "No target") + case target.Keyspace != sm.target.Keyspace: + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid keyspace %v", target.Keyspace) + case target.Shard != sm.target.Shard: + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid shard %v", target.Shard) + case target.TabletType != sm.target.TabletType: + for _, otherType := range sm.alsoAllow { + if target.TabletType == otherType { + goto ok + } + } + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet type: %v, want: %v or %v", target.TabletType, sm.target.TabletType, sm.alsoAllow) } - sm.mu.Lock() - name := stateName[sm.state] - sm.mu.Unlock() - return name -} - -// setState changes the state and logs the event. -// It requires the caller to hold a lock on mu. -func (sm *stateManager) setState(state int64) { - log.Infof("TabletServer state: %s -> %s", stateInfo(sm.state), stateInfo(state)) - sm.state = state - sm.history.Add(&historyRecord{ - Time: time.Now(), - ServingState: stateInfo(state), - TabletType: sm.target.TabletType.String(), - }) -} -// transition obtains a lock and changes the state. -func (sm *stateManager) transition(newState int64) { - sm.mu.Lock() - sm.setState(newState) - sm.mu.Unlock() +ok: + sm.requests.Add(1) + return nil } -// IsServing returns true if TabletServer is in SERVING state. -func (sm *stateManager) IsServing() bool { - return sm.StateByName() == "SERVING" +// EndRequest unregisters the current request (a waitgroup) as done. 
+func (sm *stateManager) EndRequest() { + sm.requests.Done() } -const ( - actionNone = iota - actionFullStart - actionServeNewType - actionGracefulStop -) - -func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, serving bool, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { - defer sm.ExitLameduck() +// VerifyTarget allows requests to be executed even in non-serving state. +// Such requests will get terminated without wait on shutdown. +func (sm *stateManager) VerifyTarget(ctx context.Context, target *querypb.Target) error { + sm.mu.Lock() + defer sm.mu.Unlock() - action, err := sm.decideAction(tabletType, serving, alsoAllow) - if err != nil { - return false, err - } - switch action { - case actionNone: - return false, nil - case actionFullStart: - if err := sm.fullStart(); err != nil { - sm.closeAll() - return true, err - } - return true, nil - case actionServeNewType: - if err := sm.serveNewType(); err != nil { - sm.closeAll() - return true, err + switch { + case target == nil && !tabletenv.IsLocalContext(ctx): + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "No target") + case target.Keyspace != sm.target.Keyspace: + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid keyspace %v", target.Keyspace) + case target.Shard != sm.target.Shard: + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid shard %v", target.Shard) + case target.TabletType != sm.target.TabletType: + for _, otherType := range sm.alsoAllow { + if target.TabletType == otherType { + return nil + } } - return true, nil - case actionGracefulStop: - sm.gracefulStop() - return true, nil + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet type: %v, want: %v or %v", target.TabletType, sm.target.TabletType, sm.alsoAllow) } - panic("unreachable") + return nil } -func (sm *stateManager) decideAction(tabletType topodatapb.TabletType, serving bool, alsoAllow []topodatapb.TabletType) (action int, err error) { +func (sm 
*stateManager) setDesiredState(tabletType topodatapb.TabletType, state int64, alsoAllow []topodatapb.TabletType) bool { sm.mu.Lock() defer sm.mu.Unlock() + stateChanged := false + if sm.wantTabletType != tabletType { + stateChanged = true + sm.wantTabletType = tabletType + } + if sm.wantState != state { + stateChanged = true + sm.wantState = state + } sm.alsoAllow = alsoAllow - - // Handle the case where the requested TabletType and serving state - // match our current state. This avoids an unnecessary transition. - // There's no similar shortcut if serving is false, because there - // are different 'not serving' states that require different actions. - if sm.target.TabletType == tabletType { - if serving && sm.state == StateServing { - // We're already in the desired state. - return actionNone, nil - } + if sm.transitioning { + return stateChanged } - sm.target.TabletType = tabletType - switch sm.state { - case StateNotConnected: - if serving { - sm.setState(StateTransitioning) - return actionFullStart, nil + if sm.wantState == sm.state && sm.wantTabletType == sm.target.TabletType { + return stateChanged + } + sm.transitioning = true + go sm.executeTransition() + return stateChanged +} + +// executeTransition must be invoked after setting sm.transitioning to true. +// If the flag is already set, it must not be called. The function will +// reset the flag to false when it returns. +func (sm *stateManager) executeTransition() { + // Repeat until desired state is reached. 
+ errorReported := false + for { + ok, wantTabletType, wantState := sm.transitionDone() + if ok { + return } - case StateNotServing: - if serving { - sm.setState(StateTransitioning) - return actionServeNewType, nil + + var err error + switch wantTabletType { + case topodatapb.TabletType_MASTER: + if wantState == StateServing { + err = sm.serveMaster() + } else { + err = sm.unserveMaster() + } + default: + if wantState == StateServing { + err = sm.serveNonMaster(wantTabletType) + } else { + err = sm.unserveNonMaster(wantTabletType) + } } - case StateServing: - if !serving { - sm.setState(StateShuttingDown) - return actionGracefulStop, nil + // If there was an error, shut everything down + // and retry after a delay. + // If there was no error, we restart the loop + // which verifies that the desired state was + // not changed before returning. If it was changed, + // it executes a new transition. + if err != nil { + if !errorReported { + errorReported = true + log.Errorf("Error transitioning to the desired state: %v, %v, will keep retrying: %v", wantTabletType, stateName[wantState], err) + } + sm.closeAll() + time.Sleep(1 * time.Second) } - sm.setState(StateTransitioning) - return actionServeNewType, nil - case StateTransitioning, StateShuttingDown: - return actionNone, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "cannot SetServingType, current state: %s", stateName[sm.state]) - default: - panic("unreachable") } - return actionNone, nil } -func (sm *stateManager) fullStart() error { - if err := sm.qe.IsMySQLReachable(); err != nil { - return err +// transitionDone returns true if the desired state matches the current state. +// Otherwise, it returns false, the desired tablet type and state. 
+func (sm *stateManager) transitionDone() (bool, topodatapb.TabletType, int64) { + sm.mu.Lock() + defer sm.mu.Unlock() + + wantTabletType := sm.wantTabletType + wantState := sm.wantState + if wantState == sm.state && wantTabletType == sm.target.TabletType { + sm.transitioning = false + return true, wantTabletType, wantState } - if err := sm.se.Open(); err != nil { + return false, wantTabletType, wantState +} + +func (sm *stateManager) serveMaster() error { + sm.watcher.Close() + sm.hr.Close() + + if err := sm.connect(); err != nil { return err } - sm.vstreamer.Open() - if err := sm.qe.Open(); err != nil { + + sm.hw.Open() + sm.tracker.Open() + if err := sm.te.AcceptReadWrite(); err != nil { return err } - if err := sm.txThrottler.Open(); err != nil { + sm.messager.Open() + sm.setState(topodatapb.TabletType_MASTER, StateServing) + return nil +} + +func (sm *stateManager) unserveMaster() error { + sm.unserveCommon() + + sm.watcher.Close() + sm.hr.Close() + + if err := sm.connect(); err != nil { return err } - return sm.serveNewType() + + sm.hw.Open() + sm.tracker.Open() + sm.setState(topodatapb.TabletType_MASTER, StateNotServing) + return nil } -func (sm *stateManager) serveNewType() (err error) { - if sm.target.TabletType == topodatapb.TabletType_MASTER { - sm.watcher.Close() - sm.hr.Close() +func (sm *stateManager) serveNonMaster(wantTabletType topodatapb.TabletType) error { + sm.messager.Close() + sm.tracker.Close() + sm.hw.Close() + sm.se.MakeNonMaster() - sm.hw.Open() - sm.tracker.Open() - if err := sm.te.AcceptReadWrite(); err != nil { - return err - } - sm.messager.Open() - } else { - sm.messager.Close() - if err := sm.te.AcceptReadOnly(); err != nil { - return err - } - sm.tracker.Close() - sm.hw.Close() - sm.se.MakeNonMaster() + if err := sm.connect(); err != nil { + return err + } - sm.hr.Open() - sm.watcher.Open() + if err := sm.te.AcceptReadOnly(); err != nil { + return err } - sm.transition(StateServing) + sm.hr.Open() + sm.watcher.Open() + 
sm.setState(wantTabletType, StateServing) return nil } -func (sm *stateManager) gracefulStop() { - defer close(sm.setTimeBomb()) - sm.waitForShutdown() - sm.transition(StateNotServing) -} +func (sm *stateManager) unserveNonMaster(wantTabletType topodatapb.TabletType) error { + sm.unserveCommon() -func (sm *stateManager) StopService() { - defer close(sm.setTimeBomb()) + sm.tracker.Close() + sm.hw.Close() + sm.se.MakeNonMaster() - sm.mu.Lock() - if sm.state != StateServing && sm.state != StateNotServing { - sm.mu.Unlock() - return + if err := sm.connect(); err != nil { + return err } - sm.setState(StateShuttingDown) - sm.mu.Unlock() - log.Info("Executing complete shutdown.") - sm.waitForShutdown() - sm.qe.Close() - sm.watcher.Close() - sm.vstreamer.Close() - sm.hr.Close() - sm.hw.Close() - sm.se.Close() - log.Info("Shutdown complete.") - sm.transition(StateNotConnected) + sm.hr.Open() + sm.watcher.Open() + sm.setState(wantTabletType, StateServing) + return nil +} + +func (sm *stateManager) connect() error { + if err := sm.qe.IsMySQLReachable(); err != nil { + return err + } + if err := sm.se.Open(); err != nil { + return err + } + sm.vstreamer.Open() + if err := sm.qe.Open(); err != nil { + return err + } + return sm.txThrottler.Open() } -func (sm *stateManager) waitForShutdown() { +func (sm *stateManager) unserveCommon() { sm.messager.Close() sm.te.Close() - sm.txThrottler.Close() - sm.tracker.Close() sm.qe.StopServing() sm.requests.Wait() } -// closeAll is called if TabletServer fails to start. -// It forcibly shuts down everything. 
func (sm *stateManager) closeAll() { - sm.messager.Close() - sm.te.Close() + sm.unserveCommon() sm.txThrottler.Close() - sm.qe.StopServing() sm.qe.Close() sm.watcher.Close() sm.tracker.Close() @@ -363,7 +440,7 @@ func (sm *stateManager) closeAll() { sm.hr.Close() sm.hw.Close() sm.se.Close() - sm.transition(StateNotConnected) + sm.setState(topodatapb.TabletType_UNKNOWN, StateNotConnected) } func (sm *stateManager) setTimeBomb() chan struct{} { @@ -383,123 +460,63 @@ func (sm *stateManager) setTimeBomb() chan struct{} { return done } -func (sm *stateManager) CheckMySQL() { - if !sm.checkMySQLThrottler.TryAcquire() { - return +// setState changes the state and logs the event. +func (sm *stateManager) setState(tabletType topodatapb.TabletType, state int64) { + sm.mu.Lock() + defer sm.mu.Unlock() + if tabletType == topodatapb.TabletType_UNKNOWN { + tabletType = sm.wantTabletType } - go func() { - defer func() { - time.Sleep(1 * time.Second) - sm.checkMySQLThrottler.Release() - }() - if sm.isMySQLReachable() { - return - } - log.Info("Check MySQL failed. Shutting down query service") - sm.StopService() - }() + log.Infof("TabletServer transition: %v -> %v, %s -> %s", sm.target.TabletType, tabletType, stateInfo(sm.state), stateInfo(state)) + sm.target.TabletType = tabletType + sm.state = state + sm.history.Add(&historyRecord{ + Time: time.Now(), + ServingState: stateInfo(state), + TabletType: sm.target.TabletType.String(), + }) } -// isMySQLReachable returns true if we can connect to MySQL. -// The function returns false only if the query service is -// in StateServing or StateNotServing. -func (sm *stateManager) isMySQLReachable() bool { - sm.mu.Lock() - switch sm.state { - case StateServing: - // Prevent transition out of this state by - // reserving a request. - sm.requests.Add(1) - defer sm.requests.Done() - case StateNotServing: - // Prevent transition out of this state by - // temporarily switching to StateTransitioning. 
- sm.setState(StateTransitioning) - defer func() { - sm.transition(StateNotServing) - }() - default: - sm.mu.Unlock() - return true - } - sm.mu.Unlock() - if err := sm.qe.IsMySQLReachable(); err != nil { - log.Errorf("Cannot connect to MySQL: %v", err) - return false - } - return true +// EnterLameduck causes tabletserver to enter the lameduck state. This +// state causes health checks to fail, but the behavior of tabletserver +// otherwise remains the same. Any subsequent calls to SetServingType will +// cause the tabletserver to exit this mode. +func (sm *stateManager) EnterLameduck() { + sm.lameduck.Set(1) } -// startRequest validates the current state and target and registers -// the request (a waitgroup) as started. Every startRequest requires -// one and only one corresponding endRequest. When the service shuts -// down, StopService will wait on this waitgroup to ensure that there -// are no requests in flight. -func (sm *stateManager) startRequest(ctx context.Context, target *querypb.Target, allowOnShutdown bool) (err error) { - sm.mu.Lock() - defer sm.mu.Unlock() - if sm.state == StateServing { - goto verifyTarget - } - if allowOnShutdown && sm.state == StateShuttingDown { - goto verifyTarget - } - return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "operation not allowed in state %s", stateName[sm.state]) - -verifyTarget: - if target != nil { - // a valid target needs to be used - switch { - case target.Keyspace != sm.target.Keyspace: - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid keyspace %v", target.Keyspace) - case target.Shard != sm.target.Shard: - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid shard %v", target.Shard) - case target.TabletType != sm.target.TabletType: - for _, otherType := range sm.alsoAllow { - if target.TabletType == otherType { - goto ok - } - } - return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet type: %v, want: %v or %v", target.TabletType, sm.target.TabletType, 
sm.alsoAllow) - } - } else if !tabletenv.IsLocalContext(ctx) { - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "No target") - } +// ExitLameduck causes the tabletserver to exit the lameduck mode. +func (sm *stateManager) ExitLameduck() { + sm.lameduck.Set(0) +} -ok: - sm.requests.Add(1) - return nil +// IsServing returns true if TabletServer is in SERVING state. +func (sm *stateManager) IsServing() bool { + return sm.StateByName() == "SERVING" } -// endRequest unregisters the current request (a waitgroup) as done. -func (sm *stateManager) endRequest() { - sm.requests.Done() +func (sm *stateManager) State() int64 { + sm.mu.Lock() + defer sm.mu.Unlock() + return sm.state } -// verifyTarget allows requests to be executed even in non-serving state. -func (sm *stateManager) verifyTarget(ctx context.Context, target *querypb.Target) error { +func (sm *stateManager) Target() querypb.Target { sm.mu.Lock() defer sm.mu.Unlock() + target := sm.target + return target +} - if target != nil { - // a valid target needs to be used - switch { - case target.Keyspace != sm.target.Keyspace: - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid keyspace %v", target.Keyspace) - case target.Shard != sm.target.Shard: - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid shard %v", target.Shard) - case target.TabletType != sm.target.TabletType: - for _, otherType := range sm.alsoAllow { - if target.TabletType == otherType { - return nil - } - } - return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet type: %v, want: %v or %v", target.TabletType, sm.target.TabletType, sm.alsoAllow) - } - } else if !tabletenv.IsLocalContext(ctx) { - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "No target") +// StateByName returns the name of the current TabletServer state. 
+func (sm *stateManager) StateByName() string { + if sm.lameduck.Get() != 0 { + return "NOT_SERVING" } - return nil + sm.mu.Lock() + defer sm.mu.Unlock() + name := stateName[sm.state] + return name } // stateInfo returns a string representation of the state and optional detail diff --git a/go/vt/vttablet/tabletserver/tabletserver.go b/go/vt/vttablet/tabletserver/tabletserver.go index 2d74d084ea3..96932287cef 100644 --- a/go/vt/vttablet/tabletserver/tabletserver.go +++ b/go/vt/vttablet/tabletserver/tabletserver.go @@ -314,17 +314,20 @@ func (tsv *TabletServer) InitACL(tableACLConfigFile string, enforceTableACLConfi // should also be honored for serving. // Returns true if the state of QueryService or the tablet type changed. func (tsv *TabletServer) SetServingType(tabletType topodatapb.TabletType, serving bool, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { - return tsv.sm.SetServingType(tabletType, serving, alsoAllow) + state := int64(StateNotServing) + if serving { + state = StateServing + } + return tsv.sm.SetServingType(tabletType, state, alsoAllow) } // StartService is a convenience function for InitDBConfig->SetServingType // with serving=true. 
-func (tsv *TabletServer) StartService(target querypb.Target, dbcfgs *dbconfigs.DBConfigs) (err error) { - err = tsv.InitDBConfig(target, dbcfgs) - if err != nil { +func (tsv *TabletServer) StartService(target querypb.Target, dbcfgs *dbconfigs.DBConfigs) error { + if err := tsv.InitDBConfig(target, dbcfgs); err != nil { return err } - _ /* state changed */, err = tsv.sm.SetServingType(target.TabletType, true, nil) + _, err := tsv.sm.SetServingType(target.TabletType, StateServing, nil) return err } @@ -390,7 +393,7 @@ func (tsv *TabletServer) begin(ctx context.Context, target *querypb.Target, preQ err = tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "Begin", "begin", nil, - target, options, false, /* allowOnShutdown */ + target, options, false, /* allowOnTransition */ func(ctx context.Context, logStats *tabletenv.LogStats) error { startTime := time.Now() if tsv.txThrottler.Throttle() { @@ -422,7 +425,7 @@ func (tsv *TabletServer) Commit(ctx context.Context, target *querypb.Target, tra err = tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "Commit", "commit", nil, - target, nil, true, /* allowOnShutdown */ + target, nil, true, /* allowOnTransition */ func(ctx context.Context, logStats *tabletenv.LogStats) error { startTime := time.Now() logStats.TransactionID = transactionID @@ -453,7 +456,7 @@ func (tsv *TabletServer) Rollback(ctx context.Context, target *querypb.Target, t err = tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "Rollback", "rollback", nil, - target, nil, true, /* allowOnShutdown */ + target, nil, true, /* allowOnTransition */ func(ctx context.Context, logStats *tabletenv.LogStats) error { defer tsv.stats.QueryTimings.Record("ROLLBACK", time.Now()) logStats.TransactionID = transactionID @@ -473,7 +476,7 @@ func (tsv *TabletServer) Prepare(ctx context.Context, target *querypb.Target, tr return tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "Prepare", "prepare", nil, - target, nil, true, /* allowOnShutdown */ + target, nil, true, /* allowOnTransition */ 
func(ctx context.Context, logStats *tabletenv.LogStats) error { txe := &TxExecutor{ ctx: ctx, @@ -490,7 +493,7 @@ func (tsv *TabletServer) CommitPrepared(ctx context.Context, target *querypb.Tar return tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "CommitPrepared", "commit_prepared", nil, - target, nil, true, /* allowOnShutdown */ + target, nil, true, /* allowOnTransition */ func(ctx context.Context, logStats *tabletenv.LogStats) error { txe := &TxExecutor{ ctx: ctx, @@ -507,7 +510,7 @@ func (tsv *TabletServer) RollbackPrepared(ctx context.Context, target *querypb.T return tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "RollbackPrepared", "rollback_prepared", nil, - target, nil, true, /* allowOnShutdown */ + target, nil, true, /* allowOnTransition */ func(ctx context.Context, logStats *tabletenv.LogStats) error { txe := &TxExecutor{ ctx: ctx, @@ -524,7 +527,7 @@ func (tsv *TabletServer) CreateTransaction(ctx context.Context, target *querypb. return tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "CreateTransaction", "create_transaction", nil, - target, nil, true, /* allowOnShutdown */ + target, nil, true, /* allowOnTransition */ func(ctx context.Context, logStats *tabletenv.LogStats) error { txe := &TxExecutor{ ctx: ctx, @@ -542,7 +545,7 @@ func (tsv *TabletServer) StartCommit(ctx context.Context, target *querypb.Target return tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "StartCommit", "start_commit", nil, - target, nil, true, /* allowOnShutdown */ + target, nil, true, /* allowOnTransition */ func(ctx context.Context, logStats *tabletenv.LogStats) error { txe := &TxExecutor{ ctx: ctx, @@ -560,7 +563,7 @@ func (tsv *TabletServer) SetRollback(ctx context.Context, target *querypb.Target return tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "SetRollback", "set_rollback", nil, - target, nil, true, /* allowOnShutdown */ + target, nil, true, /* allowOnTransition */ func(ctx context.Context, logStats *tabletenv.LogStats) error { txe := &TxExecutor{ ctx: ctx, @@ -578,7 +581,7 
@@ func (tsv *TabletServer) ConcludeTransaction(ctx context.Context, target *queryp return tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "ConcludeTransaction", "conclude_transaction", nil, - target, nil, true, /* allowOnShutdown */ + target, nil, true, /* allowOnTransition */ func(ctx context.Context, logStats *tabletenv.LogStats) error { txe := &TxExecutor{ ctx: ctx, @@ -595,7 +598,7 @@ func (tsv *TabletServer) ReadTransaction(ctx context.Context, target *querypb.Ta err = tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "ReadTransaction", "read_transaction", nil, - target, nil, true, /* allowOnShutdown */ + target, nil, true, /* allowOnTransition */ func(ctx context.Context, logStats *tabletenv.LogStats) error { txe := &TxExecutor{ ctx: ctx, @@ -619,11 +622,11 @@ func (tsv *TabletServer) Execute(ctx context.Context, target *querypb.Target, sq return nil, vterrors.New(vtrpcpb.Code_INTERNAL, "transactionID and reserveID must match if both are non-zero") } - allowOnShutdown := transactionID != 0 + allowOnTransition := transactionID != 0 err = tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "Execute", sql, bindVariables, - target, options, allowOnShutdown, + target, options, allowOnTransition, func(ctx context.Context, logStats *tabletenv.LogStats) error { if bindVariables == nil { bindVariables = make(map[string]*querypb.BindVariable) @@ -670,7 +673,7 @@ func (tsv *TabletServer) StreamExecute(ctx context.Context, target *querypb.Targ return tsv.execRequest( ctx, 0, "StreamExecute", sql, bindVariables, - target, options, false, /* allowOnShutdown */ + target, options, false, /* allowOnTransition */ func(ctx context.Context, logStats *tabletenv.LogStats) error { if bindVariables == nil { bindVariables = make(map[string]*querypb.BindVariable) @@ -714,9 +717,9 @@ func (tsv *TabletServer) ExecuteBatch(ctx context.Context, target *querypb.Targe if tsv.enableHotRowProtection && asTransaction { // Serialize transactions which target the same hot row range. 
- // NOTE: We put this intentionally at this place *before* tsv.startRequest() - // gets called below. Otherwise, the startRequest()/endRequest() section from - // below would overlap with the startRequest()/endRequest() section executed + // NOTE: We put this intentionally at this place *before* StartRequest() + // gets called below. Otherwise, the StartRequest()/EndRequest() section from + // below would overlap with the StartRequest()/EndRequest() section executed // by tsv.beginWaitForSameRangeTransactions(). txDone, err := tsv.beginWaitForSameRangeTransactions(ctx, target, options, queries[0].Sql, queries[0].BindVariables) if err != nil { @@ -727,17 +730,17 @@ func (tsv *TabletServer) ExecuteBatch(ctx context.Context, target *querypb.Targe } } - allowOnShutdown := transactionID != 0 - // TODO(sougou): Convert startRequest/endRequest pattern to use wrapper + allowOnTransition := transactionID != 0 + // TODO(sougou): Convert StartRequest/EndRequest pattern to use wrapper // function tsv.execRequest() instead. // Note that below we always return "err" right away and do not call // tsv.convertAndLogError. That's because the methods which returned "err", // e.g. tsv.Execute(), already called that function and therefore already // converted and logged the error. - if err = tsv.sm.startRequest(ctx, target, allowOnShutdown); err != nil { + if err = tsv.sm.StartRequest(ctx, target, allowOnTransition); err != nil { return nil, err } - defer tsv.sm.endRequest() + defer tsv.sm.EndRequest() defer tsv.handlePanicAndSendLogStats("batch", nil, nil) if options == nil { @@ -829,7 +832,7 @@ func (tsv *TabletServer) beginWaitForSameRangeTransactions(ctx context.Context, // -queryserver-config-txpool-timeout (defaults to 1s) to limit the waiting. 
ctx, tsv.QueryTimeout.Get(), "", "waitForSameRangeTransactions", nil, - target, options, false, /* allowOnShutdown */ + target, options, false, /* allowOnTransition */ func(ctx context.Context, logStats *tabletenv.LogStats) error { k, table := tsv.computeTxSerializerKey(ctx, logStats, sql, bindVariables) if k == "" { @@ -906,7 +909,7 @@ func (tsv *TabletServer) MessageStream(ctx context.Context, target *querypb.Targ return tsv.execRequest( ctx, 0, "MessageStream", "stream", nil, - target, nil, false, /* allowOnShutdown */ + target, nil, false, /* allowOnTransition */ func(ctx context.Context, logStats *tabletenv.LogStats) error { plan, err := tsv.qe.GetMessageStreamPlan(name) if err != nil { @@ -958,10 +961,10 @@ func (tsv *TabletServer) PurgeMessages(ctx context.Context, target *querypb.Targ } func (tsv *TabletServer) execDML(ctx context.Context, target *querypb.Target, queryGenerator func() (string, map[string]*querypb.BindVariable, error)) (count int64, err error) { - if err = tsv.sm.startRequest(ctx, target, false /* allowOnShutdown */); err != nil { + if err = tsv.sm.StartRequest(ctx, target, false /* allowOnTransition */); err != nil { return 0, err } - defer tsv.sm.endRequest() + defer tsv.sm.EndRequest() defer tsv.handlePanicAndSendLogStats("ack", nil, nil) query, bv, err := queryGenerator() @@ -994,7 +997,7 @@ func (tsv *TabletServer) execDML(ctx context.Context, target *querypb.Target, qu // VStream streams VReplication events. 
func (tsv *TabletServer) VStream(ctx context.Context, target *querypb.Target, startPos string, tablePKs []*binlogdatapb.TableLastPK, filter *binlogdatapb.Filter, send func([]*binlogdatapb.VEvent) error) error { - if err := tsv.sm.verifyTarget(ctx, target); err != nil { + if err := tsv.sm.VerifyTarget(ctx, target); err != nil { return err } return tsv.vstreamer.Stream(ctx, startPos, tablePKs, filter, send) @@ -1002,7 +1005,7 @@ func (tsv *TabletServer) VStream(ctx context.Context, target *querypb.Target, st // VStreamRows streams rows from the specified starting point. func (tsv *TabletServer) VStreamRows(ctx context.Context, target *querypb.Target, query string, lastpk *querypb.QueryResult, send func(*binlogdatapb.VStreamRowsResponse) error) error { - if err := tsv.sm.verifyTarget(ctx, target); err != nil { + if err := tsv.sm.VerifyTarget(ctx, target); err != nil { return err } var row []sqltypes.Value @@ -1018,7 +1021,7 @@ func (tsv *TabletServer) VStreamRows(ctx context.Context, target *querypb.Target // VStreamResults streams rows from the specified starting point. 
func (tsv *TabletServer) VStreamResults(ctx context.Context, target *querypb.Target, query string, send func(*binlogdatapb.VStreamResultsResponse) error) error { - if err := tsv.sm.verifyTarget(ctx, target); err != nil { + if err := tsv.sm.VerifyTarget(ctx, target); err != nil { return err } return tsv.vstreamer.StreamResults(ctx, query, send) @@ -1112,7 +1115,7 @@ func (tsv *TabletServer) Release(ctx context.Context, target *querypb.Target, tr func (tsv *TabletServer) execRequest( ctx context.Context, timeout time.Duration, requestName, sql string, bindVariables map[string]*querypb.BindVariable, - target *querypb.Target, options *querypb.ExecuteOptions, allowOnShutdown bool, + target *querypb.Target, options *querypb.ExecuteOptions, allowOnTransition bool, exec func(ctx context.Context, logStats *tabletenv.LogStats) error, ) (err error) { span, ctx := trace.NewSpan(ctx, "TabletServer."+requestName) @@ -1132,14 +1135,14 @@ func (tsv *TabletServer) execRequest( logStats.OriginalSQL = sql logStats.BindVariables = bindVariables defer tsv.handlePanicAndSendLogStats(sql, bindVariables, logStats) - if err = tsv.sm.startRequest(ctx, target, allowOnShutdown); err != nil { + if err = tsv.sm.StartRequest(ctx, target, allowOnTransition); err != nil { return err } ctx, cancel := withTimeout(ctx, timeout, options) defer func() { cancel() - tsv.sm.endRequest() + tsv.sm.EndRequest() }() err = exec(ctx, logStats) diff --git a/go/vt/vttablet/tabletserver/tabletserver_test.go b/go/vt/vttablet/tabletserver/tabletserver_test.go index 17c0ffaa6cf..7ad74eb75a3 100644 --- a/go/vt/vttablet/tabletserver/tabletserver_test.go +++ b/go/vt/vttablet/tabletserver/tabletserver_test.go @@ -61,16 +61,12 @@ func TestTabletServerGetState(t *testing.T) { StateNotConnected, StateNotServing, StateServing, - StateTransitioning, - StateShuttingDown, } // Don't reuse stateName. 
names := []string{ "NOT_SERVING", "NOT_SERVING", "SERVING", - "NOT_SERVING", - "SHUTTING_DOWN", } config := tabletenv.NewDefaultConfig() tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go index 3bcc88706b6..868dc8c6c48 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go +++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go @@ -223,7 +223,7 @@ func (t *TxThrottler) Open() error { return nil } if t.state != nil { - return fmt.Errorf("transaction throttler already opened") + return nil } var err error t.state, err = newTxThrottlerState(t.config, t.target.Keyspace, t.target.Shard) From 3efba4be085d91a7c3e9037ed2a4bae844ac93ff Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Sun, 28 Jun 2020 13:56:45 -0700 Subject: [PATCH 11/19] vttablet: stateManager tests WIP Signed-off-by: Sugu Sougoumarane --- go/mysql/conn.go | 8 +- go/vt/dbconfigs/dbconfigs.go | 5 + go/vt/vttablet/tabletserver/bench_test.go | 24 +- .../tabletserver/query_executor_test.go | 5 +- go/vt/vttablet/tabletserver/state_manager.go | 72 +- .../tabletserver/state_manager_test.go | 42 + .../tabletserver/tabletserver_test.go | 865 ++++-------------- 7 files changed, 241 insertions(+), 780 deletions(-) create mode 100644 go/vt/vttablet/tabletserver/state_manager_test.go diff --git a/go/mysql/conn.go b/go/mysql/conn.go index 9f0c93a151a..9ee244b6dd4 100644 --- a/go/mysql/conn.go +++ b/go/mysql/conn.go @@ -761,13 +761,7 @@ func (c *Conn) handleNextCommand(handler Handler) error { data, err := c.readEphemeralPacket() if err != nil { // Don't log EOF errors. They cause too much spam. - // Note the EOF detection is not 100% - // guaranteed, in the case where the client - // connection is already closed before we call - // 'readEphemeralPacket'. 
This is a corner - // case though, and very unlikely to happen, - // and the only downside is we log a bit more then. - if err != io.EOF { + if err != io.EOF && !strings.Contains(err.Error(), "use of closed network connection") { log.Errorf("Error reading packet from %s: %v", c, err) } return err diff --git a/go/vt/dbconfigs/dbconfigs.go b/go/vt/dbconfigs/dbconfigs.go index fe13a8ee782..8caef96abd2 100644 --- a/go/vt/dbconfigs/dbconfigs.go +++ b/go/vt/dbconfigs/dbconfigs.go @@ -24,6 +24,7 @@ import ( "context" "encoding/json" "flag" + "fmt" "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/vt/log" @@ -190,6 +191,10 @@ func (c Connector) Connect(ctx context.Context) (*mysql.Conn, error) { // MysqlParams returns the connections params func (c Connector) MysqlParams() (*mysql.ConnParams, error) { + if c.connParams == nil { + // This is only possible during tests. + return nil, fmt.Errorf("parameters are empty") + } params, err := withCredentials(c.connParams) if err != nil { return nil, err diff --git a/go/vt/vttablet/tabletserver/bench_test.go b/go/vt/vttablet/tabletserver/bench_test.go index c757454db45..43d3faea4df 100644 --- a/go/vt/vttablet/tabletserver/bench_test.go +++ b/go/vt/vttablet/tabletserver/bench_test.go @@ -27,8 +27,6 @@ import ( querypb "vitess.io/vitess/go/vt/proto/query" topodatapb "vitess.io/vitess/go/vt/proto/topodata" - "vitess.io/vitess/go/vt/topo/memorytopo" - "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" ) // Benchmark run on 6/27/17, with optimized byte-level operations @@ -57,8 +55,10 @@ func init() { } func BenchmarkExecuteVarBinary(b *testing.B) { - db := setUpTabletServerTest(nil) + db, tsv := setupTabletServerTest(nil) defer db.Close() + defer tsv.StopService() + // sql that will be executed in this test bv := map[string]*querypb.BindVariable{ "vtg1": sqltypes.Int64BindVariable(1), @@ -67,14 +67,7 @@ func BenchmarkExecuteVarBinary(b *testing.B) { bv[fmt.Sprintf("vtg%d", i)] = sqltypes.BytesBindVariable(benchVarValue) } - config 
:= tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - if err := tsv.StartService(target, newDBConfigs(db)); err != nil { - panic(err) - } - defer tsv.StopService() - db.AllowAll = true for i := 0; i < b.N; i++ { if _, err := tsv.Execute(context.Background(), &target, benchQuery, bv, 0, 0, nil); err != nil { @@ -84,8 +77,10 @@ func BenchmarkExecuteVarBinary(b *testing.B) { } func BenchmarkExecuteExpression(b *testing.B) { - db := setUpTabletServerTest(nil) + db, tsv := setupTabletServerTest(nil) defer db.Close() + defer tsv.StopService() + // sql that will be executed in this test bv := map[string]*querypb.BindVariable{ "vtg1": sqltypes.Int64BindVariable(1), @@ -97,14 +92,7 @@ func BenchmarkExecuteExpression(b *testing.B) { } } - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - if err := tsv.StartService(target, newDBConfigs(db)); err != nil { - panic(err) - } - defer tsv.StopService() - db.AllowAll = true for i := 0; i < b.N; i++ { if _, err := tsv.Execute(context.Background(), &target, benchQuery, bv, 0, 0, nil); err != nil { diff --git a/go/vt/vttablet/tabletserver/query_executor_test.go b/go/vt/vttablet/tabletserver/query_executor_test.go index 2f297078310..922cf7d2762 100644 --- a/go/vt/vttablet/tabletserver/query_executor_test.go +++ b/go/vt/vttablet/tabletserver/query_executor_test.go @@ -1109,7 +1109,10 @@ func newTestTabletServer(ctx context.Context, flags executorFlags, db *fakesqldb tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) dbconfigs := newDBConfigs(db) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - tsv.StartService(target, dbconfigs) + err := 
tsv.StartService(target, dbconfigs) + if err != nil { + panic(err) + } return tsv } diff --git a/go/vt/vttablet/tabletserver/state_manager.go b/go/vt/vttablet/tabletserver/state_manager.go index 0ca20a19e71..9fabe5cc922 100644 --- a/go/vt/vttablet/tabletserver/state_manager.go +++ b/go/vt/vttablet/tabletserver/state_manager.go @@ -68,7 +68,6 @@ type stateManager struct { state int64 target querypb.Target transitioning bool - connecting bool // TODO(sougou): deprecate alsoAllow alsoAllow []topodatapb.TabletType @@ -120,30 +119,10 @@ type txThrottler interface { Close() } -const ( - actionNone = iota - actionFullStart - actionServeNewType - actionGracefulStop -) - func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, state int64, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { - // TODO(sougou): deprecate the waits after tabletmanager has been refactored. - startTime := time.Now() - stateChanged = sm.setDesiredState(tabletType, state, alsoAllow) - for { - curState, curTabletType := sm.State(), sm.Target().TabletType - if curState == StateNotConnected { - return stateChanged, vterrors.Errorf(vtrpcpb.Code_UNAVAILABLE, "MySQL is unavailable") - } - if curState == state && curTabletType == tabletType { - return stateChanged, nil - } - time.Sleep(10 * time.Millisecond) - if time.Since(startTime) > 1*time.Second { - return stateChanged, vterrors.Errorf(vtrpcpb.Code_DEADLINE_EXCEEDED, "State transition deadline exceeded") - } - } + stateChanged, errch := sm.setDesiredState(tabletType, state, alsoAllow) + err = <-errch + return stateChanged, err } func (sm *stateManager) CheckMySQL() { @@ -177,20 +156,15 @@ func (sm *stateManager) CheckMySQL() { // of executeTransition where it waits for 1s and retries. 
sm.closeAll() time.Sleep(1 * time.Second) - go sm.executeTransition() + go sm.executeTransition(make(chan error, 1)) }() } func (sm *stateManager) StopService() { defer close(sm.setTimeBomb()) + log.Info("Stopping TabletServer") sm.SetServingType(sm.Target().TabletType, StateNotConnected, nil) - for { - if sm.State() == StateNotConnected { - return - } - time.Sleep(10 * time.Millisecond) - } } // StartRequest validates the current state and target and registers @@ -255,10 +229,12 @@ func (sm *stateManager) VerifyTarget(ctx context.Context, target *querypb.Target return nil } -func (sm *stateManager) setDesiredState(tabletType topodatapb.TabletType, state int64, alsoAllow []topodatapb.TabletType) bool { +func (sm *stateManager) setDesiredState(tabletType topodatapb.TabletType, state int64, alsoAllow []topodatapb.TabletType) (bool, <-chan error) { sm.mu.Lock() defer sm.mu.Unlock() + ch := make(chan error, 1) + stateChanged := false if sm.wantTabletType != tabletType { stateChanged = true @@ -270,42 +246,49 @@ func (sm *stateManager) setDesiredState(tabletType topodatapb.TabletType, state } sm.alsoAllow = alsoAllow if sm.transitioning { - return stateChanged + ch <- nil + return stateChanged, ch } if sm.wantState == sm.state && sm.wantTabletType == sm.target.TabletType { - return stateChanged + ch <- nil + return stateChanged, ch } sm.transitioning = true - go sm.executeTransition() - return stateChanged + go sm.executeTransition(ch) + return stateChanged, ch } // executeTransition must be invoked after setting sm.transitioning to true. // If the flag is already set, it must not be called. The function will // reset the flag to false when it returns. -func (sm *stateManager) executeTransition() { +func (sm *stateManager) executeTransition(ch chan<- error) { // Repeat until desired state is reached. 
errorReported := false for { ok, wantTabletType, wantState := sm.transitionDone() if ok { + if !errorReported { + ch <- nil + } return } var err error - switch wantTabletType { - case topodatapb.TabletType_MASTER: - if wantState == StateServing { + switch wantState { + case StateServing: + if wantTabletType == topodatapb.TabletType_MASTER { err = sm.serveMaster() } else { - err = sm.unserveMaster() - } - default: - if wantState == StateServing { err = sm.serveNonMaster(wantTabletType) + } + case StateNotServing: + if wantTabletType == topodatapb.TabletType_MASTER { + err = sm.unserveMaster() } else { err = sm.unserveNonMaster(wantTabletType) } + case StateNotConnected: + sm.closeAll() } // If there was an error, shut everything down // and retry after a delay. @@ -316,6 +299,7 @@ func (sm *stateManager) executeTransition() { if err != nil { if !errorReported { errorReported = true + ch <- err log.Errorf("Error transitioning to the desired state: %v, %v, will keep retrying: %v", wantTabletType, stateName[wantState], err) } sm.closeAll() diff --git a/go/vt/vttablet/tabletserver/state_manager_test.go b/go/vt/vttablet/tabletserver/state_manager_test.go new file mode 100644 index 00000000000..9a7909274dd --- /dev/null +++ b/go/vt/vttablet/tabletserver/state_manager_test.go @@ -0,0 +1,42 @@ +/* +Copyright 2020 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package tabletserver + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestTabletServerGetState(t *testing.T) { + states := []int64{ + StateNotConnected, + StateNotServing, + StateServing, + } + // Don't reuse stateName. + names := []string{ + "NOT_SERVING", + "NOT_SERVING", + "SERVING", + } + sm := &stateManager{} + for i, state := range states { + sm.state = state + require.Equal(t, names[i], sm.StateByName(), "StateByName") + } +} diff --git a/go/vt/vttablet/tabletserver/tabletserver_test.go b/go/vt/vttablet/tabletserver/tabletserver_test.go index 7ad74eb75a3..ae4927989c9 100644 --- a/go/vt/vttablet/tabletserver/tabletserver_test.go +++ b/go/vt/vttablet/tabletserver/tabletserver_test.go @@ -32,16 +32,15 @@ import ( "vitess.io/vitess/go/vt/callerid" + "vitess.io/vitess/go/mysql/fakesqldb" "vitess.io/vitess/go/test/utils" "github.com/stretchr/testify/assert" - "github.com/golang/protobuf/proto" "github.com/stretchr/testify/require" "golang.org/x/net/context" "vitess.io/vitess/go/mysql" - "vitess.io/vitess/go/mysql/fakesqldb" "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/sqlparser" @@ -56,419 +55,37 @@ import ( vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" ) -func TestTabletServerGetState(t *testing.T) { - states := []int64{ - StateNotConnected, - StateNotServing, - StateServing, - } - // Don't reuse stateName. 
- names := []string{ - "NOT_SERVING", - "NOT_SERVING", - "SERVING", - } - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - for i, state := range states { - tsv.sm.setState(state) - require.Equal(t, names[i], tsv.sm.StateByName(), "StateByName") - } - tsv.EnterLameduck() - require.Equal(t, "NOT_SERVING", tsv.sm.StateByName(), "StateByName") -} - -func TestTabletServerAllowQueriesFailBadConn(t *testing.T) { - db := setUpTabletServerTest(t) - defer db.Close() - db.EnableConnFail() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - checkTabletServerState(t, tsv, StateNotConnected) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - require.Error(t, err, "TabletServer.StartService should fail") - checkTabletServerState(t, tsv, StateNotConnected) -} - -func TestTabletServerAllowQueries(t *testing.T) { - db := setUpTabletServerTest(t) - defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - checkTabletServerState(t, tsv, StateNotConnected) - dbcfgs := newDBConfigs(db) - tsv.sm.setState(StateServing) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - tsv.StopService() - want := "InitDBConfig failed" - require.Error(t, err) - assert.Contains(t, err.Error(), want) - tsv.sm.setState(StateShuttingDown) - err = tsv.StartService(target, dbcfgs) - require.Error(t, err, "TabletServer.StartService should fail") - tsv.StopService() -} - -func TestTabletServerInitDBConfig(t *testing.T) { - db := setUpTabletServerTest(t) - defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, 
memorytopo.NewServer(""), topodatapb.TabletAlias{}) - tsv.sm.setState(StateServing) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - dbcfgs := newDBConfigs(db) - err := tsv.InitDBConfig(target, dbcfgs) - want := "InitDBConfig failed" - require.Error(t, err) - assert.Contains(t, err.Error(), want) - tsv.sm.setState(StateNotConnected) - err = tsv.InitDBConfig(target, dbcfgs) - require.NoError(t, err) -} - -func TestDecideAction(t *testing.T) { - db := setUpTabletServerTest(t) - defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - dbcfgs := newDBConfigs(db) - err := tsv.InitDBConfig(target, dbcfgs) - require.NoError(t, err) - - tsv.sm.setState(StateNotConnected) - action, err := tsv.sm.decideAction(topodatapb.TabletType_MASTER, false, nil) - require.NoError(t, err) - if action != actionNone { - t.Errorf("decideAction: %v, want %v", action, actionNone) - } - - tsv.sm.setState(StateNotConnected) - action, err = tsv.sm.decideAction(topodatapb.TabletType_MASTER, true, nil) - require.NoError(t, err) - if action != actionFullStart { - t.Errorf("decideAction: %v, want %v", action, actionFullStart) - } - if tsv.sm.State() != StateTransitioning { - t.Errorf("tsv.sm.state: %v, want %v", tsv.sm.State(), StateTransitioning) - } - - tsv.sm.setState(StateNotServing) - action, err = tsv.sm.decideAction(topodatapb.TabletType_MASTER, false, nil) - require.NoError(t, err) - if action != actionNone { - t.Errorf("decideAction: %v, want %v", action, actionNone) - } - - tsv.sm.setState(StateNotServing) - action, err = tsv.sm.decideAction(topodatapb.TabletType_MASTER, true, nil) - require.NoError(t, err) - if action != actionServeNewType { - t.Errorf("decideAction: %v, want %v", action, actionServeNewType) - } - if tsv.sm.State() != StateTransitioning { - t.Errorf("tsv.sm.state: %v, want 
%v", tsv.sm.State(), StateTransitioning) - } - - tsv.sm.setState(StateServing) - action, err = tsv.sm.decideAction(topodatapb.TabletType_MASTER, false, nil) - require.NoError(t, err) - if action != actionGracefulStop { - t.Errorf("decideAction: %v, want %v", action, actionGracefulStop) - } - if tsv.sm.State() != StateShuttingDown { - t.Errorf("tsv.sm.state: %v, want %v", tsv.sm.State(), StateShuttingDown) - } - - tsv.sm.setState(StateServing) - action, err = tsv.sm.decideAction(topodatapb.TabletType_REPLICA, true, nil) - require.NoError(t, err) - if action != actionServeNewType { - t.Errorf("decideAction: %v, want %v", action, actionServeNewType) - } - if tsv.sm.State() != StateTransitioning { - t.Errorf("tsv.sm.state: %v, want %v", tsv.sm.State(), StateTransitioning) - } - tsv.sm.target.TabletType = topodatapb.TabletType_MASTER - - tsv.sm.setState(StateServing) - action, err = tsv.sm.decideAction(topodatapb.TabletType_MASTER, true, nil) - require.NoError(t, err) - if action != actionNone { - t.Errorf("decideAction: %v, want %v", action, actionNone) - } - if tsv.sm.State() != StateServing { - t.Errorf("tsv.sm.state: %v, want %v", tsv.sm.State(), StateServing) - } - - tsv.sm.setState(StateTransitioning) - _, err = tsv.sm.decideAction(topodatapb.TabletType_MASTER, false, nil) - want := "cannot SetServingType" - require.Error(t, err) - assert.Contains(t, err.Error(), want) - - tsv.sm.setState(StateShuttingDown) - _, err = tsv.sm.decideAction(topodatapb.TabletType_MASTER, false, nil) - want = "cannot SetServingType" - require.Error(t, err) - assert.Contains(t, err.Error(), want) -} - -func TestSetServingType(t *testing.T) { - db := setUpTabletServerTest(t) - defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.InitDBConfig(target, dbcfgs) - 
require.NoError(t, err) - - stateChanged, err := tsv.SetServingType(topodatapb.TabletType_REPLICA, false, nil) - if stateChanged != false { - t.Errorf("SetServingType() should NOT have changed the QueryService state, but did") - } - require.NoError(t, err) - checkTabletServerState(t, tsv, StateNotConnected) - - stateChanged, err = tsv.SetServingType(topodatapb.TabletType_REPLICA, true, nil) - if stateChanged != true { - t.Errorf("SetServingType() should have changed the QueryService state, but did not") - } - require.NoError(t, err) - checkTabletServerState(t, tsv, StateServing) - - stateChanged, err = tsv.SetServingType(topodatapb.TabletType_RDONLY, true, nil) - if stateChanged != true { - t.Errorf("SetServingType() should have changed the tablet type, but did not") - } - require.NoError(t, err) - checkTabletServerState(t, tsv, StateServing) - - stateChanged, err = tsv.SetServingType(topodatapb.TabletType_SPARE, false, nil) - if stateChanged != true { - t.Errorf("SetServingType() should have changed the QueryService state, but did not") - } - require.NoError(t, err) - checkTabletServerState(t, tsv, StateNotServing) - - // Verify that we exit lameduck when SetServingType is called. 
- tsv.EnterLameduck() - if stateName := tsv.sm.StateByName(); stateName != "NOT_SERVING" { - t.Errorf("StateByName: %s, want NOT_SERVING", stateName) - } - stateChanged, err = tsv.SetServingType(topodatapb.TabletType_REPLICA, true, nil) - if stateChanged != true { - t.Errorf("SetServingType() should have changed the QueryService state, but did not") - } - require.NoError(t, err) - checkTabletServerState(t, tsv, StateServing) - if stateName := tsv.sm.StateByName(); stateName != "SERVING" { - t.Errorf("StateByName: %s, want SERVING", stateName) - } - - tsv.StopService() - checkTabletServerState(t, tsv, StateNotConnected) -} - -func TestTabletServerSingleSchemaFailure(t *testing.T) { - db := setUpTabletServerTest(t) - defer db.Close() - - want := &sqltypes.Result{ - Fields: mysql.BaseShowTablesFields, - Rows: [][]sqltypes.Value{ - mysql.BaseShowTablesRow("test_table", false, ""), - // Return a table that tabletserver can't access (the mock will reject all queries to it). - mysql.BaseShowTablesRow("rejected_table", false, ""), - }, - } - db.AddQuery(mysql.BaseShowTables, want) - - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - assert.Error(t, err) -} - -func TestTabletServerCheckMysql(t *testing.T) { - db := setUpTabletServerTest(t) - defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - defer tsv.StopService() - require.NoError(t, err) - if !tsv.sm.isMySQLReachable() { - t.Error("isMySQLReachable should return true") - } - stateChanged, err := 
tsv.SetServingType(topodatapb.TabletType_SPARE, false, nil) - require.NoError(t, err) - if stateChanged != true { - t.Errorf("SetServingType() should have changed the QueryService state, but did not") - } - if !tsv.sm.isMySQLReachable() { - t.Error("isMySQLReachable should return true") - } - checkTabletServerState(t, tsv, StateNotServing) -} - -func TestTabletServerReconnect(t *testing.T) { - db := setUpTabletServerTest(t) - defer db.Close() - query := "select addr from test_table where pk = 1 limit 1000" - want := &sqltypes.Result{} - db.AddQuery(query, want) - db.AddQuery("select addr from test_table where 1 != 1", &sqltypes.Result{}) - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) +func TestBeginOnReplica(t *testing.T) { + db, tsv := setupTabletServerTest(t) defer tsv.StopService() - - if tsv.sm.StateByName() != "SERVING" { - t.Errorf("StateByName: %s, must be SERVING", tsv.sm.StateByName()) - } - if err != nil { - t.Fatalf("TabletServer.StartService should success but get error: %v", err) - } - _, err = tsv.Execute(context.Background(), &target, query, nil, 0, 0, nil) - require.NoError(t, err) - - // make mysql conn fail - db.Close() - _, err = tsv.Execute(context.Background(), &target, query, nil, 0, 0, nil) - if err == nil { - t.Error("Execute: want error, got nil") - } - time.Sleep(50 * time.Millisecond) - if tsv.sm.StateByName() == "SERVING" { - t.Error("StateByName is still SERVING, must be NOT_SERVING") - } - - // make mysql conn work - db = setUpTabletServerTest(t) - db.AddQuery(query, want) - db.AddQuery("select addr from test_table where 1 != 1", &sqltypes.Result{}) - dbcfgs = newDBConfigs(db) - err = tsv.StartService(target, dbcfgs) - require.NoError(t, err) - _, err = tsv.Execute(context.Background(), &target, 
query, nil, 0, 0, nil) - require.NoError(t, err) -} - -func TestTabletServerTarget(t *testing.T) { - db := setUpTabletServerTest(t) defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target1 := querypb.Target{ - Keyspace: "test_keyspace", - Shard: "test_shard", - TabletType: topodatapb.TabletType_MASTER, - } - err := tsv.StartService(target1, dbcfgs) - if err != nil { - t.Fatalf("StartService failed: %v", err) - } - defer tsv.StopService() - - // query that works - db.AddQuery("select * from test_table limit 1000", &sqltypes.Result{}) - _, err = tsv.Execute(ctx, &target1, "select * from test_table limit 1000", nil, 0, 0, nil) - require.NoError(t, err) - // wrong tablet type - target2 := proto.Clone(&target1).(*querypb.Target) - target2.TabletType = topodatapb.TabletType_REPLICA - _, err = tsv.Execute(ctx, target2, "select * from test_table limit 1000", nil, 0, 0, nil) - want := "invalid tablet type" - require.Error(t, err) - assert.Contains(t, err.Error(), want) - - // set expected target type to MASTER, but also accept REPLICA - tsv.SetServingType(topodatapb.TabletType_MASTER, true, []topodatapb.TabletType{topodatapb.TabletType_REPLICA}) - _, err = tsv.Execute(ctx, &target1, "select * from test_table limit 1000", nil, 0, 0, nil) - require.NoError(t, err) - _, err = tsv.Execute(ctx, target2, "select * from test_table limit 1000", nil, 0, 0, nil) - require.NoError(t, err) - - // wrong keyspace - target2 = proto.Clone(&target1).(*querypb.Target) - target2.Keyspace = "bad" - _, err = tsv.Execute(ctx, target2, "select * from test_table limit 1000", nil, 0, 0, nil) - want = "invalid keyspace bad" - require.Error(t, err) - assert.Contains(t, err.Error(), want) - - // wrong shard - target2 = proto.Clone(&target1).(*querypb.Target) - target2.Shard = "bad" - _, err = tsv.Execute(ctx, target2, "select * from test_table limit 1000", 
nil, 0, 0, nil) - want = "invalid shard bad" - require.Error(t, err) - assert.Contains(t, err.Error(), want) - - // no target - _, err = tsv.Execute(ctx, nil, "select * from test_table limit 1000", nil, 0, 0, nil) - want = "No target" - require.Error(t, err) - assert.Contains(t, err.Error(), want) - - // Disallow all if service is stopped. - tsv.StopService() - _, err = tsv.Execute(ctx, &target1, "select * from test_table limit 1000", nil, 0, 0, nil) - want = "operation not allowed in state NOT_SERVING" - require.Error(t, err) - assert.Contains(t, err.Error(), want) -} - -func TestBeginOnReplica(t *testing.T) { - db := setUpTabletServerTest(t) db.AddQueryPattern(".*", &sqltypes.Result{}) - defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target1 := querypb.Target{ - Keyspace: "test_keyspace", - Shard: "test_shard", - TabletType: topodatapb.TabletType_REPLICA, - } - err := tsv.StartService(target1, dbcfgs) + target := querypb.Target{TabletType: topodatapb.TabletType_REPLICA} + _, err := tsv.SetServingType(topodatapb.TabletType_REPLICA, true, nil) require.NoError(t, err) - defer tsv.StopService() - tsv.SetServingType(topodatapb.TabletType_REPLICA, true, nil) options := querypb.ExecuteOptions{ TransactionIsolation: querypb.ExecuteOptions_CONSISTENT_SNAPSHOT_READ_ONLY, } - txID, alias, err := tsv.Begin(ctx, &target1, &options) + txID, alias, err := tsv.Begin(ctx, &target, &options) require.NoError(t, err, "failed to create read only tx on replica") assert.Equal(t, tsv.alias, *alias, "Wrong tablet alias from Begin") - _, err = tsv.Rollback(ctx, &target1, txID) + _, err = tsv.Rollback(ctx, &target, txID) require.NoError(t, err, "failed to rollback read only tx") // test that we can still create transactions even in read-only mode options = querypb.ExecuteOptions{} - txID, _, err = tsv.Begin(ctx, &target1, &options) + txID, _, 
err = tsv.Begin(ctx, &target, &options) require.NoError(t, err, "expected write tx to be allowed") - _, err = tsv.Rollback(ctx, &target1, txID) + _, err = tsv.Rollback(ctx, &target, txID) require.NoError(t, err) } func TestTabletServerMasterToReplica(t *testing.T) { // Reuse code from tx_executor_test. _, tsv, db := newTestTxExecutor(t) + defer tsv.StopService() defer db.Close() target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} txid1, _, err := tsv.Begin(ctx, &target, nil) @@ -507,8 +124,8 @@ func TestTabletServerMasterToReplica(t *testing.T) { func TestTabletServerRedoLogIsKeptBetweenRestarts(t *testing.T) { // Reuse code from tx_executor_test. _, tsv, db := newTestTxExecutor(t) - defer db.Close() defer tsv.StopService() + defer db.Close() tsv.SetServingType(topodatapb.TabletType_REPLICA, true, nil) turnOnTxEngine := func() { @@ -590,8 +207,8 @@ func TestTabletServerRedoLogIsKeptBetweenRestarts(t *testing.T) { func TestTabletServerCreateTransaction(t *testing.T) { _, tsv, db := newTestTxExecutor(t) - defer db.Close() defer tsv.StopService() + defer db.Close() target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} db.AddQueryPattern(fmt.Sprintf("insert into _vt\\.dt_state\\(dtid, state, time_created\\) values \\('aa', %d,.*", int(querypb.TransactionState_PREPARE)), &sqltypes.Result{}) @@ -605,8 +222,8 @@ func TestTabletServerCreateTransaction(t *testing.T) { func TestTabletServerStartCommit(t *testing.T) { _, tsv, db := newTestTxExecutor(t) - defer db.Close() defer tsv.StopService() + defer db.Close() target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} commitTransition := fmt.Sprintf("update _vt.dt_state set state = %d where dtid = 'aa' and state = %d", int(querypb.TransactionState_COMMIT), int(querypb.TransactionState_PREPARE)) @@ -623,8 +240,8 @@ func TestTabletServerStartCommit(t *testing.T) { func TestTabletserverSetRollback(t *testing.T) { _, tsv, db := newTestTxExecutor(t) - defer db.Close() defer tsv.StopService() 
+ defer db.Close() target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} rollbackTransition := fmt.Sprintf("update _vt.dt_state set state = %d where dtid = 'aa' and state = %d", int(querypb.TransactionState_ROLLBACK), int(querypb.TransactionState_PREPARE)) @@ -641,8 +258,8 @@ func TestTabletserverSetRollback(t *testing.T) { func TestTabletServerReadTransaction(t *testing.T) { _, tsv, db := newTestTxExecutor(t) - defer db.Close() defer tsv.StopService() + defer db.Close() target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} db.AddQuery("select dtid, state, time_created from _vt.dt_state where dtid = 'aa'", &sqltypes.Result{}) @@ -734,8 +351,8 @@ func TestTabletServerReadTransaction(t *testing.T) { func TestTabletServerConcludeTransaction(t *testing.T) { _, tsv, db := newTestTxExecutor(t) - defer db.Close() defer tsv.StopService() + defer db.Close() target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} db.AddQuery("delete from _vt.dt_state where dtid = 'aa'", &sqltypes.Result{}) @@ -745,27 +362,25 @@ func TestTabletServerConcludeTransaction(t *testing.T) { } func TestTabletServerBeginFail(t *testing.T) { - db := setUpTabletServerTest(t) - defer db.Close() config := tabletenv.NewDefaultConfig() config.TxPool.Size = 1 - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - require.NoError(t, err) + db, tsv := setupTabletServerTestCustom(t, config) defer tsv.StopService() + defer db.Close() + + target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} ctx, cancel := context.WithTimeout(context.Background(), 1*time.Nanosecond) defer cancel() tsv.Begin(ctx, &target, nil) - _, _, err = tsv.Begin(ctx, &target, nil) + _, _, err := tsv.Begin(ctx, &target, nil) require.EqualError(t, err, "transaction pool aborting request due to already 
expired context", "Begin err") } func TestTabletServerCommitTransaction(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() - // sql that will be executed in this test + executeSQL := "select * from test_table limit 1000" executeSQLResult := &sqltypes.Result{ Fields: []*querypb.Field{ @@ -776,54 +391,34 @@ func TestTabletServerCommitTransaction(t *testing.T) { }, } db.AddQuery(executeSQL, executeSQLResult) - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) + target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - if err != nil { - t.Fatalf("StartService failed: %v", err) - } - defer tsv.StopService() transactionID, _, err := tsv.Begin(ctx, &target, nil) - if err != nil { - t.Fatalf("call TabletServer.Begin failed: %v", err) - } - if _, err := tsv.Execute(ctx, &target, executeSQL, nil, transactionID, 0, nil); err != nil { - t.Fatalf("failed to execute query: %s: %s", executeSQL, err) - } - if _, err := tsv.Commit(ctx, &target, transactionID); err != nil { - t.Fatalf("call TabletServer.Commit failed: %v", err) - } + require.NoError(t, err) + _, err = tsv.Execute(ctx, &target, executeSQL, nil, transactionID, 0, nil) + require.NoError(t, err) + _, err = tsv.Commit(ctx, &target, transactionID) + require.NoError(t, err) } -func TestTabletServerCommitRollbackFail(t *testing.T) { - db := setUpTabletServerTest(t) +func TestTabletServerCommiRollbacktFail(t *testing.T) { + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) + target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, 
dbcfgs) - if err != nil { - t.Fatalf("StartService failed: %v", err) - } - defer tsv.StopService() - _, err = tsv.Commit(ctx, &target, -1) + _, err := tsv.Commit(ctx, &target, -1) want := "transaction -1: not found" - if err == nil || err.Error() != want { - t.Fatalf("Commit err: %v, want %v", err, want) - } + require.Equal(t, want, err.Error()) _, err = tsv.Rollback(ctx, &target, -1) - if err == nil || err.Error() != want { - t.Fatalf("Commit err: %v, want %v", err, want) - } + require.Equal(t, want, err.Error()) } func TestTabletServerRollback(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() - // sql that will be executed in this test + executeSQL := "select * from test_table limit 1000" executeSQLResult := &sqltypes.Result{ Fields: []*querypb.Field{ @@ -834,32 +429,24 @@ func TestTabletServerRollback(t *testing.T) { }, } db.AddQuery(executeSQL, executeSQLResult) - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) + target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - if err != nil { - t.Fatalf("StartService failed: %v", err) - } - defer tsv.StopService() transactionID, _, err := tsv.Begin(ctx, &target, nil) + require.NoError(t, err) if err != nil { t.Fatalf("call TabletServer.Begin failed: %v", err) } - if _, err := tsv.Execute(ctx, &target, executeSQL, nil, transactionID, 0, nil); err != nil { - t.Fatalf("failed to execute query: %s: %v", executeSQL, err) - } - if _, err := tsv.Rollback(ctx, &target, transactionID); err != nil { - t.Fatalf("call TabletServer.Rollback failed: %v", err) - } + _, err = tsv.Execute(ctx, &target, executeSQL, nil, transactionID, 0, nil) + require.NoError(t, err) + _, err = tsv.Rollback(ctx, &target, transactionID) + require.NoError(t, err) } func TestTabletServerPrepare(t 
*testing.T) { // Reuse code from tx_executor_test. _, tsv, db := newTestTxExecutor(t) - defer db.Close() defer tsv.StopService() + defer db.Close() target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} transactionID, _, err := tsv.Begin(ctx, &target, nil) require.NoError(t, err) @@ -873,8 +460,8 @@ func TestTabletServerPrepare(t *testing.T) { func TestTabletServerCommitPrepared(t *testing.T) { // Reuse code from tx_executor_test. _, tsv, db := newTestTxExecutor(t) - defer db.Close() defer tsv.StopService() + defer db.Close() target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} transactionID, _, err := tsv.Begin(ctx, &target, nil) require.NoError(t, err) @@ -888,17 +475,12 @@ func TestTabletServerCommitPrepared(t *testing.T) { } func TestTabletServerReserveConnection(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() db.AddQueryPattern(".*", &sqltypes.Result{}) - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - require.NoError(t, err) - defer tsv.StopService() options := &querypb.ExecuteOptions{} // reserve a connection @@ -915,73 +497,53 @@ func TestTabletServerReserveConnection(t *testing.T) { } func TestTabletServerExecNonExistentConnection(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() db.AddQueryPattern(".*", &sqltypes.Result{}) - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - require.NoError(t, err) - defer 
tsv.StopService() options := &querypb.ExecuteOptions{} // run a query with a non-existent reserved id - _, err = tsv.Execute(ctx, &target, "select 42", nil, 0, 123456, options) + _, err := tsv.Execute(ctx, &target, "select 42", nil, 0, 123456, options) require.Error(t, err) } func TestTabletServerReleaseNonExistentConnection(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() db.AddQueryPattern(".*", &sqltypes.Result{}) - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - require.NoError(t, err) - defer tsv.StopService() // run a query with a non-existent reserved id - err = tsv.Release(ctx, &target, 0, 123456) + err := tsv.Release(ctx, &target, 0, 123456) require.Error(t, err) } func TestMakeSureToCloseDbConnWhenBeginQueryFails(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() db.AddRejectedQuery("begin", errors.New("it broke")) - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - require.NoError(t, err) - defer tsv.StopService() options := &querypb.ExecuteOptions{} // run a query with a non-existent reserved id - _, _, _, _, err = tsv.ReserveBeginExecute(ctx, &target, "select 42", []string{}, nil, options) + _, _, _, _, err := tsv.ReserveBeginExecute(ctx, &target, "select 42", []string{}, nil, options) require.Error(t, err) } func TestTabletServerReserveAndBeginCommit(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := 
setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() db.AddQueryPattern(".*", &sqltypes.Result{}) - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - require.NoError(t, err) - defer tsv.StopService() options := &querypb.ExecuteOptions{} // reserve a connection and a transaction @@ -1031,8 +593,8 @@ func TestTabletServerReserveAndBeginCommit(t *testing.T) { func TestTabletServerRollbackPrepared(t *testing.T) { // Reuse code from tx_executor_test. _, tsv, db := newTestTxExecutor(t) - defer db.Close() defer tsv.StopService() + defer db.Close() target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} transactionID, _, err := tsv.Begin(ctx, &target, nil) require.NoError(t, err) @@ -1045,9 +607,10 @@ func TestTabletServerRollbackPrepared(t *testing.T) { } func TestTabletServerStreamExecute(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() - // sql that will be executed in this test + executeSQL := "select * from test_table limit 1000" executeSQLResult := &sqltypes.Result{ Fields: []*querypb.Field{ @@ -1059,15 +622,7 @@ func TestTabletServerStreamExecute(t *testing.T) { } db.AddQuery(executeSQL, executeSQLResult) - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - if err != nil { - t.Fatalf("StartService failed: %v", err) - } - defer tsv.StopService() callback := func(*sqltypes.Result) error { return nil } if err := tsv.StreamExecute(ctx, &target, executeSQL, nil, 0, nil, callback); err != nil { 
t.Fatalf("TabletServer.StreamExecute should success: %s, but get error: %v", @@ -1076,9 +631,10 @@ func TestTabletServerStreamExecute(t *testing.T) { } func TestTabletServerStreamExecuteComments(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() - // sql that will be executed in this test + executeSQL := "/* leading */ select * from test_table limit 1000 /* trailing */" executeSQLResult := &sqltypes.Result{ Fields: []*querypb.Field{ @@ -1090,15 +646,7 @@ func TestTabletServerStreamExecuteComments(t *testing.T) { } db.AddQuery(executeSQL, executeSQLResult) - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - if err != nil { - t.Fatalf("StartService failed: %v", err) - } - defer tsv.StopService() callback := func(*sqltypes.Result) error { return nil } ch := tabletenv.StatsLogger.Subscribe("test stats logging") @@ -1125,23 +673,17 @@ func TestTabletServerStreamExecuteComments(t *testing.T) { } } func TestTabletServerExecuteBatch(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() + sql := "insert into test_table values (1, 2, 'addr', 'name')" sqlResult := &sqltypes.Result{} expandedSQL := "insert into test_table(pk, name, addr, name_string) values (1, 2, 'addr', 'name') /* _stream test_table (pk ) (1 ); */" db.AddQuery(sql, sqlResult) db.AddQuery(expandedSQL, sqlResult) - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - if err != nil { - 
t.Fatalf("StartService failed: %v", err) - } - defer tsv.StopService() if _, err := tsv.ExecuteBatch(ctx, &target, []*querypb.BoundQuery{ { Sql: sql, @@ -1153,36 +695,22 @@ func TestTabletServerExecuteBatch(t *testing.T) { } func TestTabletServerExecuteBatchFailEmptyQueryList(t *testing.T) { - db := setUpTabletServerTest(t) - defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - if err != nil { - t.Fatalf("StartService failed: %v", err) - } + db, tsv := setupTabletServerTest(t) defer tsv.StopService() - _, err = tsv.ExecuteBatch(ctx, nil, []*querypb.BoundQuery{}, false, 0, nil) + defer db.Close() + + _, err := tsv.ExecuteBatch(ctx, nil, []*querypb.BoundQuery{}, false, 0, nil) want := "Empty query list" require.Error(t, err) assert.Contains(t, err.Error(), want) } func TestTabletServerExecuteBatchFailAsTransaction(t *testing.T) { - db := setUpTabletServerTest(t) - defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - if err != nil { - t.Fatalf("StartService failed: %v", err) - } + db, tsv := setupTabletServerTest(t) defer tsv.StopService() - _, err = tsv.ExecuteBatch(ctx, nil, []*querypb.BoundQuery{ + defer db.Close() + + _, err := tsv.ExecuteBatch(ctx, nil, []*querypb.BoundQuery{ { Sql: "begin", BindVariables: nil, @@ -1194,19 +722,12 @@ func TestTabletServerExecuteBatchFailAsTransaction(t *testing.T) { } func TestTabletServerExecuteBatchBeginFail(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer 
db.Close() + // make "begin" query fail db.AddRejectedQuery("begin", errRejected) - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - if err != nil { - t.Fatalf("StartService failed: %v", err) - } - defer tsv.StopService() if _, err := tsv.ExecuteBatch(ctx, nil, []*querypb.BoundQuery{ { Sql: "begin", @@ -1218,19 +739,12 @@ func TestTabletServerExecuteBatchBeginFail(t *testing.T) { } func TestTabletServerExecuteBatchCommitFail(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() + // make "commit" query fail db.AddRejectedQuery("commit", errRejected) - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - if err != nil { - t.Fatalf("StartService failed: %v", err) - } - defer tsv.StopService() if _, err := tsv.ExecuteBatch(ctx, nil, []*querypb.BoundQuery{ { Sql: "begin", @@ -1246,8 +760,10 @@ func TestTabletServerExecuteBatchCommitFail(t *testing.T) { } func TestTabletServerExecuteBatchSqlExecFailInTransaction(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() + sql := "insert into test_table values (1, 2)" sqlResult := &sqltypes.Result{} expandedSQL := "insert into test_table values (1, 2) /* _stream test_table (pk ) (1 ); */" @@ -1259,15 +775,7 @@ func TestTabletServerExecuteBatchSqlExecFailInTransaction(t *testing.T) { db.AddRejectedQuery(sql, errRejected) db.AddRejectedQuery(expandedSQL, errRejected) - config := tabletenv.NewDefaultConfig() - tsv := 
NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - if err != nil { - t.Fatalf("StartService failed: %v", err) - } - defer tsv.StopService() if db.GetQueryCalledNum("rollback") != 0 { t.Fatalf("rollback should not be executed.") } @@ -1287,17 +795,10 @@ func TestTabletServerExecuteBatchSqlExecFailInTransaction(t *testing.T) { } func TestTabletServerExecuteBatchCallCommitWithoutABegin(t *testing.T) { - db := setUpTabletServerTest(t) - defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - if err != nil { - t.Fatalf("StartService failed: %v", err) - } + db, tsv := setupTabletServerTest(t) defer tsv.StopService() + defer db.Close() + if _, err := tsv.ExecuteBatch(ctx, nil, []*querypb.BoundQuery{ { Sql: "commit", @@ -1309,23 +810,16 @@ func TestTabletServerExecuteBatchCallCommitWithoutABegin(t *testing.T) { } func TestExecuteBatchNestedTransaction(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() + sql := "insert into test_table values (1, 2)" sqlResult := &sqltypes.Result{} expandedSQL := "insert into test_table values (1, 2) /* _stream test_table (pk ) (1 ); */" db.AddQuery(sql, sqlResult) db.AddQuery(expandedSQL, sqlResult) - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - if err != nil { - t.Fatalf("StartService failed: %v", 
err) - } - defer tsv.StopService() if _, err := tsv.ExecuteBatch(ctx, nil, []*querypb.BoundQuery{ { Sql: "begin", @@ -1361,20 +855,16 @@ func TestSerializeTransactionsSameRow(t *testing.T) { // The actual execution looks like this: // tx1 | tx3 // tx2 - db := setUpTabletServerTest(t) - defer db.Close() config := tabletenv.NewDefaultConfig() config.HotRowProtection.Mode = tabletenv.Enable config.HotRowProtection.MaxConcurrency = 1 // Reduce the txpool to 2 because we should never consume more than two slots. config.TxPool.Size = 2 - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - if err := tsv.StartService(target, dbcfgs); err != nil { - t.Fatalf("StartService failed: %v", err) - } + db, tsv := setupTabletServerTestCustom(t, config) defer tsv.StopService() + defer db.Close() + + target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} countStart := tsv.stats.WaitTimings.Counts()["TabletServerTest.TxSerializer"] // Fake data. 
@@ -1471,19 +961,15 @@ func TestSerializeTransactionsSameRow(t *testing.T) { } func TestDMLQueryWithoutWhereClause(t *testing.T) { - db := setUpTabletServerTest(t) - defer db.Close() config := tabletenv.NewDefaultConfig() config.HotRowProtection.Mode = tabletenv.Enable config.HotRowProtection.MaxConcurrency = 1 - config.TxPool.Size = 2 - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - require.NoError(t, err) + db, tsv := setupTabletServerTestCustom(t, config) defer tsv.StopService() + defer db.Close() + + target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} q := "delete from test_table" db.AddQuery(q+" limit 10001", &sqltypes.Result{}) @@ -1510,20 +996,16 @@ func TestSerializeTransactionsSameRow_ExecuteBatchAsTransaction(t *testing.T) { // The actual execution looks like this: // tx1 | tx3 // tx2 - db := setUpTabletServerTest(t) - defer db.Close() config := tabletenv.NewDefaultConfig() config.HotRowProtection.Mode = tabletenv.Enable config.HotRowProtection.MaxConcurrency = 1 // Reduce the txpool to 2 because we should never consume more than two slots. config.TxPool.Size = 2 - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - if err := tsv.StartService(target, dbcfgs); err != nil { - t.Fatalf("StartService failed: %v", err) - } + db, tsv := setupTabletServerTestCustom(t, config) defer tsv.StopService() + defer db.Close() + + target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} countStart := tsv.stats.WaitTimings.Counts()["TabletServerTest.TxSerializer"] // Fake data. 
@@ -1626,20 +1108,16 @@ func TestSerializeTransactionsSameRow_ConcurrentTransactions(t *testing.T) { // Out of these three, two can run in parallel because we increased the // ConcurrentTransactions limit to 2. // One out of the three transaction will always get serialized though. - db := setUpTabletServerTest(t) - defer db.Close() config := tabletenv.NewDefaultConfig() config.HotRowProtection.Mode = tabletenv.Enable config.HotRowProtection.MaxConcurrency = 2 // Reduce the txpool to 2 because we should never consume more than two slots. config.TxPool.Size = 2 - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - if err := tsv.StartService(target, dbcfgs); err != nil { - t.Fatalf("StartService failed: %v", err) - } + db, tsv := setupTabletServerTestCustom(t, config) defer tsv.StopService() + defer db.Close() + + target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} countStart := tsv.stats.WaitTimings.Counts()["TabletServerTest.TxSerializer"] // Fake data. @@ -1765,19 +1243,15 @@ func TestSerializeTransactionsSameRow_TooManyPendingRequests(t *testing.T) { // serialized. // Since we start to queue before the transaction pool would queue, we need // to enforce an upper limit as well to protect vttablet. 
- db := setUpTabletServerTest(t) - defer db.Close() config := tabletenv.NewDefaultConfig() config.HotRowProtection.Mode = tabletenv.Enable config.HotRowProtection.MaxQueueSize = 1 config.HotRowProtection.MaxConcurrency = 1 - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - if err := tsv.StartService(target, dbcfgs); err != nil { - t.Fatalf("StartService failed: %v", err) - } + db, tsv := setupTabletServerTestCustom(t, config) defer tsv.StopService() + defer db.Close() + + target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} countStart := tsv.stats.WaitTimings.Counts()["TabletServerTest.TxSerializer"] // Fake data. @@ -1852,19 +1326,15 @@ func TestSerializeTransactionsSameRow_TooManyPendingRequests(t *testing.T) { func TestSerializeTransactionsSameRow_TooManyPendingRequests_ExecuteBatchAsTransaction(t *testing.T) { // This test rejects queries if more than one transaction is currently in // progress for the hot row i.e. we check that tx2 actually fails. - db := setUpTabletServerTest(t) - defer db.Close() config := tabletenv.NewDefaultConfig() config.HotRowProtection.Mode = tabletenv.Enable config.HotRowProtection.MaxQueueSize = 1 config.HotRowProtection.MaxConcurrency = 1 - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - if err := tsv.StartService(target, dbcfgs); err != nil { - t.Fatalf("StartService failed: %v", err) - } + db, tsv := setupTabletServerTestCustom(t, config) defer tsv.StopService() + defer db.Close() + + target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} countStart := tsv.stats.WaitTimings.Counts()["TabletServerTest.TxSerializer"] // Fake data. 
@@ -1943,18 +1413,14 @@ func TestSerializeTransactionsSameRow_RequestCanceled(t *testing.T) { // tx1 and tx2 run against the same row. // tx2 is blocked on tx1. Eventually, tx2 is canceled and its request fails. // Only after that tx1 commits and finishes. - db := setUpTabletServerTest(t) - defer db.Close() config := tabletenv.NewDefaultConfig() config.HotRowProtection.Mode = tabletenv.Enable config.HotRowProtection.MaxConcurrency = 1 - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - if err := tsv.StartService(target, dbcfgs); err != nil { - t.Fatalf("StartService failed: %v", err) - } + db, tsv := setupTabletServerTestCustom(t, config) defer tsv.StopService() + defer db.Close() + + target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} countStart := tsv.stats.WaitTimings.Counts()["TabletServerTest.TxSerializer"] // Fake data. 
@@ -2434,17 +1900,9 @@ func TestACLHUP(t *testing.T) { } func TestConfigChanges(t *testing.T) { - db := setUpTabletServerTest(t) - defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) - target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - if err != nil { - t.Fatalf("StartService failed: %v", err) - } + db, tsv := setupTabletServerTest(t) defer tsv.StopService() + defer db.Close() newSize := 10 newDuration := time.Duration(10 * time.Millisecond) @@ -2507,15 +1965,10 @@ func TestConfigChanges(t *testing.T) { } func TestReserveBeginExecute(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - require.NoError(t, err) - defer tsv.StopService() _, transactionID, reservedID, _, err := tsv.ReserveBeginExecute(ctx, &target, "select 42", []string{"select 43"}, nil, &querypb.ExecuteOptions{}) require.NoError(t, err) @@ -2534,15 +1987,11 @@ func TestReserveBeginExecute(t *testing.T) { } func TestReserveExecute_WithoutTx(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) + target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - require.NoError(t, err) - defer tsv.StopService() _, reservedID, _, err := tsv.ReserveExecute(ctx, 
&target, "select 42", []string{"select 43"}, nil, 0, &querypb.ExecuteOptions{}) require.NoError(t, err) @@ -2558,15 +2007,10 @@ func TestReserveExecute_WithoutTx(t *testing.T) { } func TestReserveExecute_WithTx(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - require.NoError(t, err) - defer tsv.StopService() transactionID, _, err := tsv.Begin(ctx, &target, &querypb.ExecuteOptions{}) require.NoError(t, err) @@ -2620,17 +2064,13 @@ func TestRelease(t *testing.T) { name += " reserve" } t.Run(name, func(t *testing.T) { - db := setUpTabletServerTest(t) - db.AddQueryPattern(".*", &sqltypes.Result{}) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) + db.AddQueryPattern(".*", &sqltypes.Result{}) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - require.NoError(t, err) - defer tsv.StopService() + var err error var transactionID, reservedID int64 switch { @@ -2661,15 +2101,11 @@ func TestRelease(t *testing.T) { } func TestReserveStats(t *testing.T) { - db := setUpTabletServerTest(t) + db, tsv := setupTabletServerTest(t) + defer tsv.StopService() defer db.Close() - config := tabletenv.NewDefaultConfig() - tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - dbcfgs := newDBConfigs(db) + target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} - err := tsv.StartService(target, dbcfgs) - 
require.NoError(t, err) - defer tsv.StopService() callerID := &querypb.VTGateCallerID{ Username: "test", @@ -2718,7 +2154,23 @@ func TestReserveStats(t *testing.T) { assert.NotEmpty(t, tsv.te.txPool.env.Stats().UserReservedTimesNs.Counts()["test"]) } -func setUpTabletServerTest(t *testing.T) *fakesqldb.DB { +func setupTabletServerTest(t *testing.T) (*fakesqldb.DB, *TabletServer) { + config := tabletenv.NewDefaultConfig() + return setupTabletServerTestCustom(t, config) +} + +func setupTabletServerTestCustom(t *testing.T, config *tabletenv.TabletConfig) (*fakesqldb.DB, *TabletServer) { + db := setupFakeDB(t) + tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) + require.Equal(t, int64(StateNotConnected), tsv.sm.State()) + dbcfgs := newDBConfigs(db) + target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} + err := tsv.StartService(target, dbcfgs) + require.NoError(t, err) + return db, tsv +} + +func setupFakeDB(t *testing.T) *fakesqldb.DB { db := fakesqldb.New(t) for query, result := range getSupportedQueries() { db.AddQuery(query, result) @@ -2726,13 +2178,6 @@ func setUpTabletServerTest(t *testing.T) *fakesqldb.DB { return db } -func checkTabletServerState(t *testing.T, tsv *TabletServer, expectState int64) { - state := tsv.sm.State() - if state != expectState { - t.Fatalf("TabletServer should in state: %d, but get state: %d", expectState, state) - } -} - func getSupportedQueries() map[string]*sqltypes.Result { return map[string]*sqltypes.Result{ // Queries for how row protection test (txserializer). 
From 919ad400e35fd598c3bf0624f782c82fdfbe5ac6 Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Sun, 28 Jun 2020 21:06:23 -0700 Subject: [PATCH 12/19] vttablet: tests for stateManager Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/tabletserver/state_manager.go | 71 ++- .../tabletserver/state_manager_test.go | 549 +++++++++++++++++- 2 files changed, 589 insertions(+), 31 deletions(-) diff --git a/go/vt/vttablet/tabletserver/state_manager.go b/go/vt/vttablet/tabletserver/state_manager.go index 9fabe5cc922..9bc6f864d9d 100644 --- a/go/vt/vttablet/tabletserver/state_manager.go +++ b/go/vt/vttablet/tabletserver/state_manager.go @@ -43,6 +43,9 @@ const ( StateServing ) +// transitionRetryInterval is for tests. +var transitionRetryInterval = 1 * time.Second + // stateName names every state. The number of elements must // match the number of states. Names can overlap. var stateName = []string{ @@ -120,6 +123,7 @@ type txThrottler interface { } func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, state int64, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { + log.Infof("Starting transition to %v %v", tabletType, stateName[state]) stateChanged, errch := sm.setDesiredState(tabletType, state, alsoAllow) err = <-errch return stateChanged, err @@ -155,15 +159,13 @@ func (sm *stateManager) CheckMySQL() { // This code path emulates the error case at the end of the loop // of executeTransition where it waits for 1s and retries. sm.closeAll() - time.Sleep(1 * time.Second) + time.Sleep(transitionRetryInterval) go sm.executeTransition(make(chan error, 1)) }() } func (sm *stateManager) StopService() { defer close(sm.setTimeBomb()) - - log.Info("Stopping TabletServer") sm.SetServingType(sm.Target().TabletType, StateNotConnected, nil) } @@ -174,25 +176,31 @@ func (sm *stateManager) StartRequest(ctx context.Context, target *querypb.Target sm.mu.Lock() defer sm.mu.Unlock() - // All the checks below must pass. 
switch { case sm.state != StateServing: return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "operation not allowed in state %s", stateName[sm.state]) case sm.transitioning && !allowOnTransition: return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "operation not allowed in state %s", stateName[sm.state]) - case target == nil && !tabletenv.IsLocalContext(ctx): - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "No target") - case target.Keyspace != sm.target.Keyspace: - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid keyspace %v", target.Keyspace) - case target.Shard != sm.target.Shard: - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid shard %v", target.Shard) - case target.TabletType != sm.target.TabletType: - for _, otherType := range sm.alsoAllow { - if target.TabletType == otherType { - goto ok + } + + if target != nil { + switch { + case target.Keyspace != sm.target.Keyspace: + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid keyspace %v", target.Keyspace) + case target.Shard != sm.target.Shard: + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid shard %v", target.Shard) + case target.TabletType != sm.target.TabletType: + for _, otherType := range sm.alsoAllow { + if target.TabletType == otherType { + goto ok + } } + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet type: %v, want: %v or %v", target.TabletType, sm.target.TabletType, sm.alsoAllow) + } + } else { + if !tabletenv.IsLocalContext(ctx) { + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "No target") } - return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet type: %v, want: %v or %v", target.TabletType, sm.target.TabletType, sm.alsoAllow) } ok: @@ -210,21 +218,24 @@ func (sm *stateManager) EndRequest() { func (sm *stateManager) VerifyTarget(ctx context.Context, target *querypb.Target) error { sm.mu.Lock() defer sm.mu.Unlock() - - switch { - case target == nil && 
!tabletenv.IsLocalContext(ctx): - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "No target") - case target.Keyspace != sm.target.Keyspace: - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid keyspace %v", target.Keyspace) - case target.Shard != sm.target.Shard: - return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid shard %v", target.Shard) - case target.TabletType != sm.target.TabletType: - for _, otherType := range sm.alsoAllow { - if target.TabletType == otherType { - return nil + if target != nil { + switch { + case target.Keyspace != sm.target.Keyspace: + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid keyspace %v", target.Keyspace) + case target.Shard != sm.target.Shard: + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid shard %v", target.Shard) + case target.TabletType != sm.target.TabletType: + for _, otherType := range sm.alsoAllow { + if target.TabletType == otherType { + return nil + } } + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet type: %v, want: %v or %v", target.TabletType, sm.target.TabletType, sm.alsoAllow) + } + } else { + if !tabletenv.IsLocalContext(ctx) { + return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "No target") } - return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet type: %v, want: %v or %v", target.TabletType, sm.target.TabletType, sm.alsoAllow) } return nil } @@ -303,7 +314,7 @@ func (sm *stateManager) executeTransition(ch chan<- error) { log.Errorf("Error transitioning to the desired state: %v, %v, will keep retrying: %v", wantTabletType, stateName[wantState], err) } sm.closeAll() - time.Sleep(1 * time.Second) + time.Sleep(transitionRetryInterval) } } } @@ -389,7 +400,7 @@ func (sm *stateManager) unserveNonMaster(wantTabletType topodatapb.TabletType) e sm.hr.Open() sm.watcher.Open() - sm.setState(wantTabletType, StateServing) + sm.setState(wantTabletType, StateNotServing) return nil } diff --git 
a/go/vt/vttablet/tabletserver/state_manager_test.go b/go/vt/vttablet/tabletserver/state_manager_test.go index 9a7909274dd..f9e9229b2a0 100644 --- a/go/vt/vttablet/tabletserver/state_manager_test.go +++ b/go/vt/vttablet/tabletserver/state_manager_test.go @@ -17,12 +17,20 @@ limitations under the License. package tabletserver import ( + "errors" "testing" + "time" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "vitess.io/vitess/go/history" + "vitess.io/vitess/go/sync2" + querypb "vitess.io/vitess/go/vt/proto/query" + topodatapb "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" ) -func TestTabletServerGetState(t *testing.T) { +func TestStateManagerStateByName(t *testing.T) { states := []int64{ StateNotConnected, StateNotServing, @@ -39,4 +47,543 @@ func TestTabletServerGetState(t *testing.T) { sm.state = state require.Equal(t, names[i], sm.StateByName(), "StateByName") } + sm.EnterLameduck() + require.Equal(t, "NOT_SERVING", sm.StateByName(), "StateByName") +} + +func TestStateManagerServeMaster(t *testing.T) { + sm := newTestStateManager(t) + stateChanged, err := sm.SetServingType(topodatapb.TabletType_MASTER, StateServing, nil) + require.NoError(t, err) + assert.True(t, stateChanged) + + verifySubcomponent(t, sm.watcher, 1, testStateClosed) + verifySubcomponent(t, sm.hr, 2, testStateClosed) + + verifySubcomponent(t, sm.se, 3, testStateOpen) + verifySubcomponent(t, sm.vstreamer, 4, testStateOpen) + verifySubcomponent(t, sm.qe, 5, testStateOpen) + verifySubcomponent(t, sm.txThrottler, 6, testStateOpen) + verifySubcomponent(t, sm.hw, 7, testStateOpen) + verifySubcomponent(t, sm.tracker, 8, testStateOpen) + verifySubcomponent(t, sm.te, 9, testStateAcceptReadWrite) + verifySubcomponent(t, sm.messager, 10, testStateOpen) + + assert.False(t, sm.se.(*testSchemaEngine).nonMaster) + assert.True(t, sm.qe.(*testQueryEngine).isReachable) + assert.False(t, sm.qe.(*testQueryEngine).stopServing) + + 
assert.Equal(t, topodatapb.TabletType_MASTER, sm.target.TabletType) + assert.Equal(t, int64(StateServing), sm.state) +} + +func TestStateManagerServeNonMaster(t *testing.T) { + sm := newTestStateManager(t) + stateChanged, err := sm.SetServingType(topodatapb.TabletType_REPLICA, StateServing, nil) + require.NoError(t, err) + assert.True(t, stateChanged) + + verifySubcomponent(t, sm.messager, 1, testStateClosed) + verifySubcomponent(t, sm.tracker, 2, testStateClosed) + verifySubcomponent(t, sm.hw, 3, testStateClosed) + assert.True(t, sm.se.(*testSchemaEngine).nonMaster) + + verifySubcomponent(t, sm.se, 4, testStateOpen) + verifySubcomponent(t, sm.vstreamer, 5, testStateOpen) + verifySubcomponent(t, sm.qe, 6, testStateOpen) + verifySubcomponent(t, sm.txThrottler, 7, testStateOpen) + verifySubcomponent(t, sm.te, 8, testStateAcceptReadOnly) + verifySubcomponent(t, sm.hr, 9, testStateOpen) + verifySubcomponent(t, sm.watcher, 10, testStateOpen) + + assert.Equal(t, topodatapb.TabletType_REPLICA, sm.target.TabletType) + assert.Equal(t, int64(StateServing), sm.state) +} + +func TestStateManagerUnserveMaster(t *testing.T) { + sm := newTestStateManager(t) + stateChanged, err := sm.SetServingType(topodatapb.TabletType_MASTER, StateNotServing, nil) + require.NoError(t, err) + assert.True(t, stateChanged) + + verifySubcomponent(t, sm.messager, 1, testStateClosed) + verifySubcomponent(t, sm.te, 2, testStateClosed) + assert.True(t, sm.qe.(*testQueryEngine).stopServing) + + verifySubcomponent(t, sm.watcher, 3, testStateClosed) + verifySubcomponent(t, sm.hr, 4, testStateClosed) + + verifySubcomponent(t, sm.se, 5, testStateOpen) + verifySubcomponent(t, sm.vstreamer, 6, testStateOpen) + verifySubcomponent(t, sm.qe, 7, testStateOpen) + verifySubcomponent(t, sm.txThrottler, 8, testStateOpen) + + verifySubcomponent(t, sm.hw, 9, testStateOpen) + verifySubcomponent(t, sm.tracker, 10, testStateOpen) + + assert.Equal(t, topodatapb.TabletType_MASTER, sm.target.TabletType) + assert.Equal(t, 
int64(StateNotServing), sm.state) +} + +func TestStateManagerUnserveNonmaster(t *testing.T) { + sm := newTestStateManager(t) + stateChanged, err := sm.SetServingType(topodatapb.TabletType_RDONLY, StateNotServing, nil) + require.NoError(t, err) + assert.True(t, stateChanged) + + verifySubcomponent(t, sm.messager, 1, testStateClosed) + verifySubcomponent(t, sm.te, 2, testStateClosed) + assert.True(t, sm.qe.(*testQueryEngine).stopServing) + + verifySubcomponent(t, sm.tracker, 3, testStateClosed) + verifySubcomponent(t, sm.hw, 4, testStateClosed) + assert.True(t, sm.se.(*testSchemaEngine).nonMaster) + + verifySubcomponent(t, sm.se, 5, testStateOpen) + verifySubcomponent(t, sm.vstreamer, 6, testStateOpen) + verifySubcomponent(t, sm.qe, 7, testStateOpen) + verifySubcomponent(t, sm.txThrottler, 8, testStateOpen) + + verifySubcomponent(t, sm.hr, 9, testStateOpen) + verifySubcomponent(t, sm.watcher, 10, testStateOpen) + + assert.Equal(t, topodatapb.TabletType_RDONLY, sm.target.TabletType) + assert.Equal(t, int64(StateNotServing), sm.state) +} + +func TestStateManagerClose(t *testing.T) { + sm := newTestStateManager(t) + stateChanged, err := sm.SetServingType(topodatapb.TabletType_RDONLY, StateNotConnected, nil) + require.NoError(t, err) + assert.True(t, stateChanged) + + verifySubcomponent(t, sm.messager, 1, testStateClosed) + verifySubcomponent(t, sm.te, 2, testStateClosed) + assert.True(t, sm.qe.(*testQueryEngine).stopServing) + + verifySubcomponent(t, sm.txThrottler, 3, testStateClosed) + verifySubcomponent(t, sm.qe, 4, testStateClosed) + verifySubcomponent(t, sm.watcher, 5, testStateClosed) + verifySubcomponent(t, sm.tracker, 6, testStateClosed) + verifySubcomponent(t, sm.vstreamer, 7, testStateClosed) + verifySubcomponent(t, sm.hr, 8, testStateClosed) + verifySubcomponent(t, sm.hw, 9, testStateClosed) + verifySubcomponent(t, sm.se, 10, testStateClosed) + + assert.Equal(t, topodatapb.TabletType_RDONLY, sm.target.TabletType) + assert.Equal(t, int64(StateNotConnected), 
sm.state) +} + +func TestStateManagerStopService(t *testing.T) { + sm := newTestStateManager(t) + stateChanged, err := sm.SetServingType(topodatapb.TabletType_REPLICA, StateServing, nil) + require.NoError(t, err) + assert.True(t, stateChanged) + + assert.Equal(t, topodatapb.TabletType_REPLICA, sm.target.TabletType) + assert.Equal(t, int64(StateServing), sm.state) + + sm.StopService() + assert.Equal(t, topodatapb.TabletType_REPLICA, sm.target.TabletType) + assert.Equal(t, int64(StateNotConnected), sm.state) +} + +// testWatcher1 is used as a hook to invoke another transition +type testWatcher1 struct { + t *testing.T + sm *stateManager +} + +func (te *testWatcher1) Open() { +} + +func (te *testWatcher1) Close() { + stateChanged, err := te.sm.SetServingType(topodatapb.TabletType_RDONLY, StateNotServing, nil) + // We are transitioning. + // This should return immediately with no error. + require.NoError(te.t, err) + assert.True(te.t, stateChanged) +} + +func TestStateManagerSetServingTypeRace(t *testing.T) { + sm := newTestStateManager(t) + sm.watcher = &testWatcher1{ + t: t, + sm: sm, + } + stateChanged, err := sm.SetServingType(topodatapb.TabletType_MASTER, StateServing, nil) + require.NoError(t, err) + assert.True(t, stateChanged) + + // The watcher, being special, is not counted in the ordering. + verifySubcomponent(t, sm.messager, 10, testStateClosed) + verifySubcomponent(t, sm.te, 11, testStateClosed) + + verifySubcomponent(t, sm.tracker, 12, testStateClosed) + verifySubcomponent(t, sm.hw, 13, testStateClosed) + + verifySubcomponent(t, sm.se, 14, testStateOpen) + verifySubcomponent(t, sm.vstreamer, 15, testStateOpen) + verifySubcomponent(t, sm.qe, 16, testStateOpen) + verifySubcomponent(t, sm.txThrottler, 17, testStateOpen) + + verifySubcomponent(t, sm.hr, 18, testStateOpen) + + // End state should be the final desired state. 
+ assert.Equal(t, topodatapb.TabletType_RDONLY, sm.target.TabletType) + assert.Equal(t, int64(StateNotServing), sm.state) +} + +// testWatcher2 is used as a hook to invoke another transition +type testWatcher2 struct { + t *testing.T + sm *stateManager +} + +func (te *testWatcher2) Open() { +} + +func (te *testWatcher2) Close() { + stateChanged, err := te.sm.SetServingType(topodatapb.TabletType_MASTER, StateServing, nil) + // We are transitioning. + // This should return immediately with no error. + require.NoError(te.t, err) + assert.False(te.t, stateChanged) +} + +func TestStateManagerSetServingTypeNoChange(t *testing.T) { + sm := newTestStateManager(t) + sm.watcher = &testWatcher2{ + t: t, + sm: sm, + } + stateChanged, err := sm.SetServingType(topodatapb.TabletType_MASTER, StateServing, nil) + require.NoError(t, err) + assert.True(t, stateChanged) + + // End state should be the final desired state. + assert.Equal(t, topodatapb.TabletType_MASTER, sm.target.TabletType) + assert.Equal(t, int64(StateServing), sm.state) +} + +func TestStateManagerTransitionFailRetry(t *testing.T) { + defer func(saved time.Duration) { transitionRetryInterval = saved }(transitionRetryInterval) + transitionRetryInterval = 10 * time.Millisecond + + sm := newTestStateManager(t) + sm.qe.(*testQueryEngine).failMySQL = true + + stateChanged, err := sm.SetServingType(topodatapb.TabletType_MASTER, StateServing, nil) + require.Error(t, err) + assert.True(t, stateChanged) + + for { + sm.mu.Lock() + transitioning := sm.transitioning + sm.mu.Unlock() + if !transitioning { + break + } + time.Sleep(10 * time.Millisecond) + } + + assert.Equal(t, topodatapb.TabletType_MASTER, sm.Target().TabletType) + assert.Equal(t, int64(StateServing), sm.State()) +} + +func TestStateManagerCheckMySQL(t *testing.T) { + defer func(saved time.Duration) { transitionRetryInterval = saved }(transitionRetryInterval) + transitionRetryInterval = 10 * time.Millisecond + + sm := newTestStateManager(t) + + stateChanged, err := 
sm.SetServingType(topodatapb.TabletType_MASTER, StateServing, nil) + require.NoError(t, err) + assert.True(t, stateChanged) + + sm.qe.(*testQueryEngine).failMySQL = true + order.Set(0) + sm.CheckMySQL() + + // Wait for closeAll to get under way. + for { + if order.Get() >= 1 { + break + } + time.Sleep(10 * time.Millisecond) + } + + // Wait to get out of transitioning state. + for { + sm.mu.Lock() + transitioning := sm.transitioning + sm.mu.Unlock() + if !transitioning { + break + } + time.Sleep(10 * time.Millisecond) + } + + assert.Equal(t, topodatapb.TabletType_MASTER, sm.Target().TabletType) + assert.Equal(t, int64(StateServing), sm.State()) +} + +func TestStateManagerValidations(t *testing.T) { + sm := newTestStateManager(t) + target := &querypb.Target{TabletType: topodatapb.TabletType_MASTER} + sm.target = *target + + err := sm.StartRequest(ctx, target, false) + assert.Contains(t, err.Error(), "operation not allowed") + + sm.state = StateServing + sm.transitioning = true + err = sm.StartRequest(ctx, target, false) + assert.Contains(t, err.Error(), "operation not allowed") + + err = sm.StartRequest(ctx, target, true) + assert.NoError(t, err) + + sm.transitioning = false + target.Keyspace = "a" + err = sm.StartRequest(ctx, target, false) + assert.Contains(t, err.Error(), "invalid keyspace") + err = sm.VerifyTarget(ctx, target) + assert.Contains(t, err.Error(), "invalid keyspace") + + target.Keyspace = "" + target.Shard = "a" + err = sm.StartRequest(ctx, target, false) + assert.Contains(t, err.Error(), "invalid shard") + err = sm.VerifyTarget(ctx, target) + assert.Contains(t, err.Error(), "invalid shard") + + target.Shard = "" + target.TabletType = topodatapb.TabletType_REPLICA + err = sm.StartRequest(ctx, target, false) + assert.Contains(t, err.Error(), "invalid tablet type") + err = sm.VerifyTarget(ctx, target) + assert.Contains(t, err.Error(), "invalid tablet type") + + sm.alsoAllow = []topodatapb.TabletType{topodatapb.TabletType_REPLICA} + err = 
sm.StartRequest(ctx, target, false) + assert.NoError(t, err) + err = sm.VerifyTarget(ctx, target) + assert.NoError(t, err) + + err = sm.StartRequest(ctx, nil, false) + assert.Contains(t, err.Error(), "No target") + err = sm.VerifyTarget(ctx, nil) + assert.Contains(t, err.Error(), "No target") + + localctx := tabletenv.LocalContext() + err = sm.StartRequest(localctx, nil, false) + assert.NoError(t, err) + err = sm.VerifyTarget(localctx, nil) + assert.NoError(t, err) +} + +func TestStateManagerWaitForRequests(t *testing.T) { + sm := newTestStateManager(t) + target := &querypb.Target{TabletType: topodatapb.TabletType_MASTER} + sm.target = *target + sm.timebombDuration = 10 * time.Second + + _, err := sm.SetServingType(topodatapb.TabletType_MASTER, StateServing, nil) + require.NoError(t, err) + + err = sm.StartRequest(ctx, target, false) + require.NoError(t, err) + + // This will go into transition and wait. + // Wait for that state. + go sm.StopService() + for { + sm.mu.Lock() + transitioning := sm.transitioning + sm.mu.Unlock() + if !transitioning { + continue + } + time.Sleep(10 * time.Millisecond) + break + } + + // Verify that we're still transitioning. 
+ sm.mu.Lock() + assert.True(t, sm.transitioning) + sm.mu.Unlock() + + sm.EndRequest() + + for { + sm.mu.Lock() + transitioning := sm.transitioning + sm.mu.Unlock() + if transitioning { + time.Sleep(10 * time.Millisecond) + continue + } + break + } + assert.Equal(t, int64(StateNotConnected), sm.State()) +} + +func verifySubcomponent(t *testing.T, component interface{}, order int64, state testState) { + tos := component.(orderState) + assert.Equal(t, order, tos.Order()) + assert.Equal(t, state, tos.State()) +} + +func newTestStateManager(t *testing.T) *stateManager { + order.Set(0) + return &stateManager{ + se: &testSchemaEngine{}, + hw: &testSubcomponent{}, + hr: &testSubcomponent{}, + vstreamer: &testSubcomponent{}, + tracker: &testSubcomponent{}, + watcher: &testSubcomponent{}, + qe: &testQueryEngine{}, + txThrottler: &testTxThrottler{}, + te: &testTxEngine{}, + messager: &testSubcomponent{}, + + checkMySQLThrottler: sync2.NewSemaphore(1, 0), + history: history.New(10), + timebombDuration: time.Duration(10 * time.Millisecond), + } +} + +var order sync2.AtomicInt64 + +type testState int + +const ( + testStateUnknown = testState(iota) + testStateOpen + testStateClosed + testStateMakeNonMaster + testStateAcceptReadOnly + testStateAcceptReadWrite +) + +type orderState interface { + Order() int64 + State() testState +} + +type testOrderState struct { + order int64 + state testState +} + +func (tos testOrderState) Order() int64 { + return tos.order +} + +func (tos testOrderState) State() testState { + return tos.state +} + +type testSchemaEngine struct { + testOrderState + nonMaster bool +} + +func (te *testSchemaEngine) Open() error { + te.order = order.Add(1) + te.state = testStateOpen + return nil +} + +func (te *testSchemaEngine) MakeNonMaster() { + te.nonMaster = true +} + +func (te *testSchemaEngine) Close() { + te.order = order.Add(1) + te.state = testStateClosed +} + +type testQueryEngine struct { + testOrderState + isReachable bool + stopServing bool + + 
failMySQL bool +} + +func (te *testQueryEngine) Open() error { + te.order = order.Add(1) + te.state = testStateOpen + return nil +} + +func (te *testQueryEngine) IsMySQLReachable() error { + if te.failMySQL { + te.failMySQL = false + return errors.New("intentional error") + } + te.isReachable = true + return nil +} + +func (te *testQueryEngine) StopServing() { + te.stopServing = true +} + +func (te *testQueryEngine) Close() { + te.order = order.Add(1) + te.state = testStateClosed +} + +type testTxEngine struct { + testOrderState +} + +func (te *testTxEngine) AcceptReadWrite() error { + te.order = order.Add(1) + te.state = testStateAcceptReadWrite + return nil +} + +func (te *testTxEngine) AcceptReadOnly() error { + te.order = order.Add(1) + te.state = testStateAcceptReadOnly + return nil +} + +func (te *testTxEngine) Close() { + te.order = order.Add(1) + te.state = testStateClosed +} + +type testSubcomponent struct { + testOrderState +} + +func (te *testSubcomponent) Open() { + te.order = order.Add(1) + te.state = testStateOpen +} + +func (te *testSubcomponent) Close() { + te.order = order.Add(1) + te.state = testStateClosed +} + +type testTxThrottler struct { + testOrderState +} + +func (te *testTxThrottler) Open() error { + te.order = order.Add(1) + te.state = testStateOpen + return nil +} + +func (te *testTxThrottler) Close() { + te.order = order.Add(1) + te.state = testStateClosed } From 40ac47b91a5f3f01828fd418a5f2d572ee440eb4 Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Sun, 28 Jun 2020 22:27:48 -0700 Subject: [PATCH 13/19] vttablet: re-introduce ExitLameduck Looks like tm explicitly sets it expecting tablet server to unset it. We'll need to revisit this. 
Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/tabletserver/state_manager.go | 7 +++---- go/vt/vttablet/tabletserver/state_manager_test.go | 3 +++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/go/vt/vttablet/tabletserver/state_manager.go b/go/vt/vttablet/tabletserver/state_manager.go index 9bc6f864d9d..6e18c5ed1f2 100644 --- a/go/vt/vttablet/tabletserver/state_manager.go +++ b/go/vt/vttablet/tabletserver/state_manager.go @@ -123,6 +123,8 @@ type txThrottler interface { } func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, state int64, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { + defer sm.ExitLameduck() + log.Infof("Starting transition to %v %v", tabletType, stateName[state]) stateChanged, errch := sm.setDesiredState(tabletType, state, alsoAllow) err = <-errch @@ -508,10 +510,7 @@ func (sm *stateManager) StateByName() string { if sm.lameduck.Get() != 0 { return "NOT_SERVING" } - sm.mu.Lock() - defer sm.mu.Unlock() - name := stateName[sm.state] - return name + return stateName[sm.State()] } // stateInfo returns a string representation of the state and optional detail diff --git a/go/vt/vttablet/tabletserver/state_manager_test.go b/go/vt/vttablet/tabletserver/state_manager_test.go index f9e9229b2a0..abdc4c3421d 100644 --- a/go/vt/vttablet/tabletserver/state_manager_test.go +++ b/go/vt/vttablet/tabletserver/state_manager_test.go @@ -53,10 +53,13 @@ func TestStateManagerStateByName(t *testing.T) { func TestStateManagerServeMaster(t *testing.T) { sm := newTestStateManager(t) + sm.EnterLameduck() stateChanged, err := sm.SetServingType(topodatapb.TabletType_MASTER, StateServing, nil) require.NoError(t, err) assert.True(t, stateChanged) + assert.Equal(t, int32(0), sm.lameduck.Get()) + verifySubcomponent(t, sm.watcher, 1, testStateClosed) verifySubcomponent(t, sm.hr, 2, testStateClosed) From f25002e35e3575b897f60ba7b6950f035c00f892 Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Mon, 29 Jun 2020 
13:12:38 -0700 Subject: [PATCH 14/19] vttablet: detect shutting-down state in StartRequest Also address a few early comments. Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/tabletserver/state_manager.go | 41 ++++++++++++++++--- .../tabletserver/state_manager_test.go | 4 +- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/go/vt/vttablet/tabletserver/state_manager.go b/go/vt/vttablet/tabletserver/state_manager.go index 6e18c5ed1f2..bf6b7068b94 100644 --- a/go/vt/vttablet/tabletserver/state_manager.go +++ b/go/vt/vttablet/tabletserver/state_manager.go @@ -65,6 +65,20 @@ var stateDetail = []string{ // stateManager manages state transition for all the TabletServer // subcomponents. type stateManager struct { + // wantState and wantTabletType represent the desired state. + // If these values are changed and don't match the current + // state and target, transitioning is set to true, and executeTransition + // is invoked. This function returns after it transitions state + // and target to match the desired state, at which point it sets + // transitioning to false. + // wantState and wantTabletType can be changed if transitioning is true. + // executeTransition will check the latest values and continue transitioning + // until the state reaches the latest values. + // If a transition fails, execute transition will retry every second (dictated + // by transitionRetryInterval) until it reaches the desired state. + // If connection to MySQL is lost, the CheckMySQL function will launch + // executeTransition to make it retry until connection to MySQL is restored + // and the desired state is reached. mu sync.Mutex wantState int64 wantTabletType topodatapb.TabletType @@ -77,6 +91,9 @@ type stateManager struct { requests sync.WaitGroup lameduck sync2.AtomicInt32 + // Open must be done in forward order. + // Close must be done in reverse order. + // All Close functions must be called before Open.
se schemaEngine hw subComponent hr subComponent @@ -88,6 +105,8 @@ type stateManager struct { te txEngine messager subComponent + // checkMySQLThrottler ensures that CheckMysql + // doesn't get spammed. checkMySQLThrottler *sync2.Semaphore history *history.History timebombDuration time.Duration @@ -122,6 +141,13 @@ type txThrottler interface { Close() } +// SetServingType changes the state to the specified settings. +// If sm is in the middle of a transition, it accepts the values, but returns +// an error saying that it's in the middle of a transition. +// If the desired state is already reached, it returns no error. +// If the first attempt at transitioning fails, it returns the error +// from that transition, but sm continues to retry until the desired +// state is reached. func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, state int64, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { defer sm.ExitLameduck() @@ -166,6 +192,8 @@ func (sm *stateManager) CheckMySQL() { }() } +// StopService shuts down sm. If the shutdown doesn't complete +// within timeBombDuration, it crashes the process. func (sm *stateManager) StopService() { defer close(sm.setTimeBomb()) sm.SetServingType(sm.Target().TabletType, StateNotConnected, nil) @@ -178,13 +206,16 @@ func (sm *stateManager) StartRequest(ctx context.Context, target *querypb.Target sm.mu.Lock() defer sm.mu.Unlock() - switch { - case sm.state != StateServing: - return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "operation not allowed in state %s", stateName[sm.state]) - case sm.transitioning && !allowOnTransition: + if sm.state != StateServing { return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "operation not allowed in state %s", stateName[sm.state]) } + shuttingDown := sm.transitioning && sm.wantState != StateServing + if shuttingDown && !allowOnTransition { + // This specific error string needs to be returned for vtgate buffering to work. 
+ return vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "operation not allowed in state SHUTTING_DOWN") + } + if target != nil { switch { case target.Keyspace != sm.target.Keyspace: @@ -259,7 +290,7 @@ func (sm *stateManager) setDesiredState(tabletType topodatapb.TabletType, state } sm.alsoAllow = alsoAllow if sm.transitioning { - ch <- nil + ch <- vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "a transition is already in progress") return stateChanged, ch } if sm.wantState == sm.state && sm.wantTabletType == sm.target.TabletType { diff --git a/go/vt/vttablet/tabletserver/state_manager_test.go b/go/vt/vttablet/tabletserver/state_manager_test.go index abdc4c3421d..a0e10c7f6e2 100644 --- a/go/vt/vttablet/tabletserver/state_manager_test.go +++ b/go/vt/vttablet/tabletserver/state_manager_test.go @@ -204,7 +204,7 @@ func (te *testWatcher1) Close() { stateChanged, err := te.sm.SetServingType(topodatapb.TabletType_RDONLY, StateNotServing, nil) // We are transitioning. // This should return immediately with no error. - require.NoError(te.t, err) + assert.Contains(te.t, err.Error(), "a transition is already in progress") assert.True(te.t, stateChanged) } @@ -250,7 +250,7 @@ func (te *testWatcher2) Close() { stateChanged, err := te.sm.SetServingType(topodatapb.TabletType_MASTER, StateServing, nil) // We are transitioning. // This should return immediately with no error. - require.NoError(te.t, err) + assert.Contains(te.t, err.Error(), "a transition is already in progress") assert.False(te.t, stateChanged) } From 9691dea35bcad3a91c5b6b4d2cef7e5cd1971ef4 Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Mon, 29 Jun 2020 19:15:53 -0700 Subject: [PATCH 15/19] vttablet: special-case RESTORE type If the requested type is RESTORE, force StateNotConnected. 
Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/tabletserver/state_manager.go | 5 +++++ go/vt/vttablet/tabletserver/state_manager_test.go | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/go/vt/vttablet/tabletserver/state_manager.go b/go/vt/vttablet/tabletserver/state_manager.go index bf6b7068b94..3cd76f97eb2 100644 --- a/go/vt/vttablet/tabletserver/state_manager.go +++ b/go/vt/vttablet/tabletserver/state_manager.go @@ -151,6 +151,11 @@ type txThrottler interface { func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, state int64, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { defer sm.ExitLameduck() + if tabletType == topodatapb.TabletType_RESTORE { + // TODO(sougou): remove this code once tm can give us more accurate state requests. + state = StateNotConnected + } + log.Infof("Starting transition to %v %v", tabletType, stateName[state]) stateChanged, errch := sm.setDesiredState(tabletType, state, alsoAllow) err = <-errch diff --git a/go/vt/vttablet/tabletserver/state_manager_test.go b/go/vt/vttablet/tabletserver/state_manager_test.go index a0e10c7f6e2..4e2d7be0e94 100644 --- a/go/vt/vttablet/tabletserver/state_manager_test.go +++ b/go/vt/vttablet/tabletserver/state_manager_test.go @@ -294,6 +294,18 @@ func TestStateManagerTransitionFailRetry(t *testing.T) { assert.Equal(t, int64(StateServing), sm.State()) } +func TestStateManagerRestoreType(t *testing.T) { + sm := newTestStateManager(t) + sm.EnterLameduck() + stateChanged, err := sm.SetServingType(topodatapb.TabletType_RESTORE, StateNotServing, nil) + require.NoError(t, err) + assert.True(t, stateChanged) + + assert.Equal(t, topodatapb.TabletType_RESTORE, sm.target.TabletType) + // RESTORE can only be in StateNotConnected. 
+ assert.Equal(t, int64(StateNotConnected), sm.state) +} + func TestStateManagerCheckMySQL(t *testing.T) { defer func(saved time.Duration) { transitionRetryInterval = saved }(transitionRetryInterval) transitionRetryInterval = 10 * time.Millisecond From 084fa501bfa9c660d7b8f8a703cfb72c44bf854c Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Tue, 30 Jun 2020 16:06:43 -0700 Subject: [PATCH 16/19] vttablet: stateManager is more synchronous Some tests became flaky because they expect the state to change immediately after a call. This change brings the behavior to be closer to the existing one. But the tabletserver will continue to retry if a request failed. Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/tabletserver/state_manager.go | 248 ++++++++---------- .../tabletserver/state_manager_test.go | 129 +++++---- go/vt/vttablet/tabletserver/tabletserver.go | 47 ++-- 3 files changed, 200 insertions(+), 224 deletions(-) diff --git a/go/vt/vttablet/tabletserver/state_manager.go b/go/vt/vttablet/tabletserver/state_manager.go index 3cd76f97eb2..67a3e44b712 100644 --- a/go/vt/vttablet/tabletserver/state_manager.go +++ b/go/vt/vttablet/tabletserver/state_manager.go @@ -65,26 +65,27 @@ var stateDetail = []string{ // stateManager manages state transition for all the TabletServer // subcomponents. type stateManager struct { - // wantState and wantTabletType represent the desired state. - // If these values are changed and don't match the current - // state and target, transitioning is set to true, and executeTransition - // is invoked. This function returns after it transitions state - // and target to match the desired state, at which point it sets - // transitioning to false. - // wantState and wantTabletType can be changed if transitioning is true. - // executeTransition will check the latest values and continue transitioning - // until the state reaches the latest values.
- // If a transition fails, execute transition will retry every second (dictated - // by transitionRetryInterval) until it reaches the desired state. - // If connection to MySQL is lost, the CheckMySQL function will launch - // executeTransition to make it retry until connection to MySQL is restored - // and the desired state is reached. + // transitioning is a semaphore that must be obtained + // before attempting a state transition. To prevent deadlocks, + // this must be acquired before the mu lock. We use a semaphore + // because we need TryAcquire, which is not supported by sync.Mutex. + // If an acquire is successful, we must either Release explicitly + // or invoke execTransition, which will release once it's done. + transitioning *sync2.Semaphore + + // mu should be held to access the group of variables under it. + // It is required in spite of the transitioning semaphore. + // This is because other goroutines will still want + // to read the values while a transition is in progress. + // + // If a transition fails, we set retrying to true and launch + // retryTransition which loops until the state converges. mu sync.Mutex wantState int64 wantTabletType topodatapb.TabletType state int64 target querypb.Target - transitioning bool + retrying bool // TODO(sougou): deprecate alsoAllow alsoAllow []topodatapb.TabletType @@ -142,12 +143,12 @@ type txThrottler interface { } // SetServingType changes the state to the specified settings. -// If sm is in the middle of a transition, it accepts the values, but returns -// an error saying that it's in the middle of a transition. -// If the desired state is already reached, it returns no error. -// If the first attempt at transitioning fails, it returns the error -// from that transition, but sm continues to retry until the desired -// state is reached. +// If a transition is in progress, it waits and then executes the +// new request.
If the transition fails, it returns an error, and +// launches retryTransition to ensure that the request will eventually +// be honored. +// If sm is already in the requested state, it returns stateChanged as +// false. func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, state int64, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { defer sm.ExitLameduck() @@ -157,9 +158,90 @@ func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, state i } log.Infof("Starting transition to %v %v", tabletType, stateName[state]) - stateChanged, errch := sm.setDesiredState(tabletType, state, alsoAllow) - err = <-errch - return stateChanged, err + if sm.mustTransition(tabletType, state, alsoAllow) { + return true, sm.execTransition(tabletType, state) + } + return false, nil +} + +// mustTransition returns true if the requested state does not match the current +// state. If so, it acquires the semaphore and returns true. If a transition is +// already in progress, it waits. If the desired state is already reached, it +// returns false without acquiring the semaphore. 
+func (sm *stateManager) mustTransition(tabletType topodatapb.TabletType, state int64, alsoAllow []topodatapb.TabletType) bool { + sm.transitioning.Acquire() + sm.mu.Lock() + defer sm.mu.Unlock() + + sm.wantTabletType = tabletType + sm.wantState = state + sm.alsoAllow = alsoAllow + if sm.target.TabletType == tabletType && sm.state == state { + sm.transitioning.Release() + return false + } + return true +} + +func (sm *stateManager) execTransition(tabletType topodatapb.TabletType, state int64) error { + defer sm.transitioning.Release() + + var err error + switch state { + case StateServing: + if tabletType == topodatapb.TabletType_MASTER { + err = sm.serveMaster() + } else { + err = sm.serveNonMaster(tabletType) + } + case StateNotServing: + if tabletType == topodatapb.TabletType_MASTER { + err = sm.unserveMaster() + } else { + err = sm.unserveNonMaster(tabletType) + } + case StateNotConnected: + sm.closeAll() + } + if err != nil { + sm.retryTransition(fmt.Sprintf("Error transitioning to the desired state: %v, %v, will keep retrying: %v", tabletType, stateName[state], err)) + } + return err +} + +func (sm *stateManager) retryTransition(message string) { + sm.mu.Lock() + defer sm.mu.Unlock() + if sm.retrying { + return + } + sm.retrying = true + + log.Error(message) + go func() { + for { + time.Sleep(transitionRetryInterval) + if sm.recheckState() { + return + } + } + }() +} + +func (sm *stateManager) recheckState() bool { + if !sm.transitioning.TryAcquire() { + return false + } + sm.mu.Lock() + defer sm.mu.Unlock() + + if sm.wantState == sm.state && sm.wantTabletType == sm.target.TabletType { + sm.retrying = false + sm.transitioning.Release() + return true + } + go sm.execTransition(sm.wantTabletType, sm.wantState) + return false } func (sm *stateManager) CheckMySQL() { @@ -177,23 +259,14 @@ func (sm *stateManager) CheckMySQL() { return } - log.Errorf("Cannot connect to MySQL, shutting down query service: %v", err) - sm.mu.Lock() - // If we're already transitioning, 
don't interfere. - if sm.transitioning { - sm.mu.Unlock() + if !sm.transitioning.TryAcquire() { + // If we're already transitioning, don't interfere. return } - // Setting this flag will ensure that no one else will - // invoke sm.executeTransition while we sleep. - sm.transitioning = true - sm.mu.Unlock() + defer sm.transitioning.Release() - // This code path emulates the error case at the end of the loop - // of executeTransition where it waits for 1s and retries. sm.closeAll() - time.Sleep(transitionRetryInterval) - go sm.executeTransition(make(chan error, 1)) + sm.retryTransition(fmt.Sprintf("Cannot connect to MySQL, shutting down query service: %v", err)) }() } @@ -207,7 +280,7 @@ func (sm *stateManager) StopService() { // StartRequest validates the current state and target and registers // the request (a waitgroup) as started. Every StartRequest must be // ended with an EndRequest. -func (sm *stateManager) StartRequest(ctx context.Context, target *querypb.Target, allowOnTransition bool) (err error) { +func (sm *stateManager) StartRequest(ctx context.Context, target *querypb.Target, allowOnShutdown bool) (err error) { sm.mu.Lock() defer sm.mu.Unlock() @@ -215,8 +288,8 @@ func (sm *stateManager) StartRequest(ctx context.Context, target *querypb.Target return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "operation not allowed in state %s", stateName[sm.state]) } - shuttingDown := sm.transitioning && sm.wantState != StateServing - if shuttingDown && !allowOnTransition { + shuttingDown := sm.wantState != StateServing + if shuttingDown && !allowOnShutdown { // This specific error string needs to be returned for vtgate buffering to work. 
return vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "operation not allowed in state SHUTTING_DOWN") } @@ -278,100 +351,6 @@ func (sm *stateManager) VerifyTarget(ctx context.Context, target *querypb.Target return nil } -func (sm *stateManager) setDesiredState(tabletType topodatapb.TabletType, state int64, alsoAllow []topodatapb.TabletType) (bool, <-chan error) { - sm.mu.Lock() - defer sm.mu.Unlock() - - ch := make(chan error, 1) - - stateChanged := false - if sm.wantTabletType != tabletType { - stateChanged = true - sm.wantTabletType = tabletType - } - if sm.wantState != state { - stateChanged = true - sm.wantState = state - } - sm.alsoAllow = alsoAllow - if sm.transitioning { - ch <- vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "a transition is already in progress") - return stateChanged, ch - } - if sm.wantState == sm.state && sm.wantTabletType == sm.target.TabletType { - ch <- nil - return stateChanged, ch - } - sm.transitioning = true - go sm.executeTransition(ch) - return stateChanged, ch -} - -// executeTransition must be invoked after setting sm.transitioning to true. -// If the flag is already set, it must not be called. The function will -// reset the flag to false when it returns. -func (sm *stateManager) executeTransition(ch chan<- error) { - // Repeat until desired state is reached. - errorReported := false - for { - ok, wantTabletType, wantState := sm.transitionDone() - if ok { - if !errorReported { - ch <- nil - } - return - } - - var err error - switch wantState { - case StateServing: - if wantTabletType == topodatapb.TabletType_MASTER { - err = sm.serveMaster() - } else { - err = sm.serveNonMaster(wantTabletType) - } - case StateNotServing: - if wantTabletType == topodatapb.TabletType_MASTER { - err = sm.unserveMaster() - } else { - err = sm.unserveNonMaster(wantTabletType) - } - case StateNotConnected: - sm.closeAll() - } - // If there was an error, shut everything down - // and retry after a delay. 
- // If there was no error, we restart the loop - // which verifies that the desired state was - // not changed before returning. If it was changed, - // it executes a new transition. - if err != nil { - if !errorReported { - errorReported = true - ch <- err - log.Errorf("Error transitioning to the desired state: %v, %v, will keep retrying: %v", wantTabletType, stateName[wantState], err) - } - sm.closeAll() - time.Sleep(transitionRetryInterval) - } - } -} - -// transitionDone returns true if the desired state matches the current state. -// Otherwise, it returns false, the desired tablet type and state. -func (sm *stateManager) transitionDone() (bool, topodatapb.TabletType, int64) { - sm.mu.Lock() - defer sm.mu.Unlock() - - wantTabletType := sm.wantTabletType - wantState := sm.wantState - if wantState == sm.state && wantTabletType == sm.target.TabletType { - sm.transitioning = false - return true, wantTabletType, wantState - } - return false, wantTabletType, wantState -} - func (sm *stateManager) serveMaster() error { sm.watcher.Close() sm.hr.Close() @@ -497,6 +476,7 @@ func (sm *stateManager) setTimeBomb() chan struct{} { func (sm *stateManager) setState(tabletType topodatapb.TabletType, state int64) { sm.mu.Lock() defer sm.mu.Unlock() + if tabletType == topodatapb.TabletType_UNKNOWN { tabletType = sm.wantTabletType } diff --git a/go/vt/vttablet/tabletserver/state_manager_test.go b/go/vt/vttablet/tabletserver/state_manager_test.go index 4e2d7be0e94..b73d699fd86 100644 --- a/go/vt/vttablet/tabletserver/state_manager_test.go +++ b/go/vt/vttablet/tabletserver/state_manager_test.go @@ -18,6 +18,7 @@ package tabletserver import ( "errors" + "sync" "testing" "time" @@ -191,81 +192,70 @@ func TestStateManagerStopService(t *testing.T) { assert.Equal(t, int64(StateNotConnected), sm.state) } -// testWatcher1 is used as a hook to invoke another transition -type testWatcher1 struct { +// testWatcher is used as a hook to invoke another transition +type testWatcher struct { t 
*testing.T sm *stateManager + wg sync.WaitGroup } -func (te *testWatcher1) Open() { +func (te *testWatcher) Open() { } -func (te *testWatcher1) Close() { - stateChanged, err := te.sm.SetServingType(topodatapb.TabletType_RDONLY, StateNotServing, nil) - // We are transitioning. - // This should return immediately with no error. - assert.Contains(te.t, err.Error(), "a transition is already in progress") - assert.True(te.t, stateChanged) +func (te *testWatcher) Close() { + te.wg.Add(1) + go func() { + defer te.wg.Done() + + stateChanged, err := te.sm.SetServingType(topodatapb.TabletType_RDONLY, StateNotServing, nil) + assert.NoError(te.t, err) + assert.True(te.t, stateChanged) + }() } func TestStateManagerSetServingTypeRace(t *testing.T) { sm := newTestStateManager(t) - sm.watcher = &testWatcher1{ + te := &testWatcher{ t: t, sm: sm, } + sm.watcher = te stateChanged, err := sm.SetServingType(topodatapb.TabletType_MASTER, StateServing, nil) require.NoError(t, err) assert.True(t, stateChanged) - // The watcher, being special, is not counted in the ordering. - verifySubcomponent(t, sm.messager, 10, testStateClosed) - verifySubcomponent(t, sm.te, 11, testStateClosed) - - verifySubcomponent(t, sm.tracker, 12, testStateClosed) - verifySubcomponent(t, sm.hw, 13, testStateClosed) - - verifySubcomponent(t, sm.se, 14, testStateOpen) - verifySubcomponent(t, sm.vstreamer, 15, testStateOpen) - verifySubcomponent(t, sm.qe, 16, testStateOpen) - verifySubcomponent(t, sm.txThrottler, 17, testStateOpen) - - verifySubcomponent(t, sm.hr, 18, testStateOpen) + // Ensure the next call waits and then succeeds. + te.wg.Wait() // End state should be the final desired state. 
assert.Equal(t, topodatapb.TabletType_RDONLY, sm.target.TabletType) assert.Equal(t, int64(StateNotServing), sm.state) } -// testWatcher2 is used as a hook to invoke another transition -type testWatcher2 struct { - t *testing.T - sm *stateManager -} - -func (te *testWatcher2) Open() { -} - -func (te *testWatcher2) Close() { - stateChanged, err := te.sm.SetServingType(topodatapb.TabletType_MASTER, StateServing, nil) - // We are transitioning. - // This should return immediately with no error. - assert.Contains(te.t, err.Error(), "a transition is already in progress") - assert.False(te.t, stateChanged) -} - func TestStateManagerSetServingTypeNoChange(t *testing.T) { sm := newTestStateManager(t) - sm.watcher = &testWatcher2{ - t: t, - sm: sm, - } - stateChanged, err := sm.SetServingType(topodatapb.TabletType_MASTER, StateServing, nil) + stateChanged, err := sm.SetServingType(topodatapb.TabletType_REPLICA, StateServing, nil) require.NoError(t, err) assert.True(t, stateChanged) - // End state should be the final desired state. 
- assert.Equal(t, topodatapb.TabletType_MASTER, sm.target.TabletType) + stateChanged, err = sm.SetServingType(topodatapb.TabletType_REPLICA, StateServing, nil) + require.NoError(t, err) + assert.False(t, stateChanged) + + verifySubcomponent(t, sm.messager, 1, testStateClosed) + verifySubcomponent(t, sm.tracker, 2, testStateClosed) + verifySubcomponent(t, sm.hw, 3, testStateClosed) + assert.True(t, sm.se.(*testSchemaEngine).nonMaster) + + verifySubcomponent(t, sm.se, 4, testStateOpen) + verifySubcomponent(t, sm.vstreamer, 5, testStateOpen) + verifySubcomponent(t, sm.qe, 6, testStateOpen) + verifySubcomponent(t, sm.txThrottler, 7, testStateOpen) + verifySubcomponent(t, sm.te, 8, testStateAcceptReadOnly) + verifySubcomponent(t, sm.hr, 9, testStateOpen) + verifySubcomponent(t, sm.watcher, 10, testStateOpen) + + assert.Equal(t, topodatapb.TabletType_REPLICA, sm.target.TabletType) assert.Equal(t, int64(StateServing), sm.state) } @@ -282,9 +272,9 @@ func TestStateManagerTransitionFailRetry(t *testing.T) { for { sm.mu.Lock() - transitioning := sm.transitioning + retrying := sm.retrying sm.mu.Unlock() - if !transitioning { + if !retrying { break } time.Sleep(10 * time.Millisecond) @@ -329,11 +319,19 @@ func TestStateManagerCheckMySQL(t *testing.T) { } // Wait to get out of transitioning state. + for { + if !sm.isTransitioning() { + break + } + time.Sleep(10 * time.Millisecond) + } + + // Wait for retry to finish. 
for { sm.mu.Lock() - transitioning := sm.transitioning + retrying := sm.retrying sm.mu.Unlock() - if !transitioning { + if !retrying { break } time.Sleep(10 * time.Millisecond) @@ -352,14 +350,14 @@ func TestStateManagerValidations(t *testing.T) { assert.Contains(t, err.Error(), "operation not allowed") sm.state = StateServing - sm.transitioning = true + sm.wantState = StateNotServing err = sm.StartRequest(ctx, target, false) assert.Contains(t, err.Error(), "operation not allowed") err = sm.StartRequest(ctx, target, true) assert.NoError(t, err) - sm.transitioning = false + sm.wantState = StateServing target.Keyspace = "a" err = sm.StartRequest(ctx, target, false) assert.Contains(t, err.Error(), "invalid keyspace") @@ -414,28 +412,20 @@ func TestStateManagerWaitForRequests(t *testing.T) { // Wait for that state. go sm.StopService() for { - sm.mu.Lock() - transitioning := sm.transitioning - sm.mu.Unlock() - if !transitioning { + if !sm.isTransitioning() { + time.Sleep(10 * time.Millisecond) continue } - time.Sleep(10 * time.Millisecond) break } // Verify that we're still transitioning. 
- sm.mu.Lock() - assert.True(t, sm.transitioning) - sm.mu.Unlock() + assert.True(t, sm.isTransitioning()) sm.EndRequest() for { - sm.mu.Lock() - transitioning := sm.transitioning - sm.mu.Unlock() - if transitioning { + if sm.isTransitioning() { time.Sleep(10 * time.Millisecond) continue } @@ -464,12 +454,21 @@ func newTestStateManager(t *testing.T) *stateManager { te: &testTxEngine{}, messager: &testSubcomponent{}, + transitioning: sync2.NewSemaphore(1, 0), checkMySQLThrottler: sync2.NewSemaphore(1, 0), history: history.New(10), timebombDuration: time.Duration(10 * time.Millisecond), } } +func (sm *stateManager) isTransitioning() bool { + if sm.transitioning.TryAcquire() { + sm.transitioning.Release() + return false + } + return true +} + var order sync2.AtomicInt64 type testState int diff --git a/go/vt/vttablet/tabletserver/tabletserver.go b/go/vt/vttablet/tabletserver/tabletserver.go index 96932287cef..bfe03e36075 100644 --- a/go/vt/vttablet/tabletserver/tabletserver.go +++ b/go/vt/vttablet/tabletserver/tabletserver.go @@ -153,10 +153,6 @@ func NewTabletServer(name string, config *tabletenv.TabletConfig, topoServer *to tsOnce.Do(func() { srvTopoServer = srvtopo.NewResilientServer(topoServer, "TabletSrvTopo") }) - // The following services should generally be opened in the order - // of initialization below, and closed in reverse order. - // However, gracefulStop is slightly different because only - // some services must be closed, while others should remain open. 
tsv.se = schema.NewEngine(tsv) tsv.hw = heartbeat.NewWriter(tsv, alias) tsv.hr = heartbeat.NewReader(tsv) @@ -180,6 +176,7 @@ func NewTabletServer(name string, config *tabletenv.TabletConfig, topoServer *to te: tsv.te, messager: tsv.messager, + transitioning: sync2.NewSemaphore(1, 0), checkMySQLThrottler: sync2.NewSemaphore(1, 0), history: history.New(10), timebombDuration: time.Duration(config.OltpReadPool.TimeoutSeconds * 10), @@ -393,7 +390,7 @@ func (tsv *TabletServer) begin(ctx context.Context, target *querypb.Target, preQ err = tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "Begin", "begin", nil, - target, options, false, /* allowOnTransition */ + target, options, false, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { startTime := time.Now() if tsv.txThrottler.Throttle() { @@ -425,7 +422,7 @@ func (tsv *TabletServer) Commit(ctx context.Context, target *querypb.Target, tra err = tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "Commit", "commit", nil, - target, nil, true, /* allowOnTransition */ + target, nil, true, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { startTime := time.Now() logStats.TransactionID = transactionID @@ -456,7 +453,7 @@ func (tsv *TabletServer) Rollback(ctx context.Context, target *querypb.Target, t err = tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "Rollback", "rollback", nil, - target, nil, true, /* allowOnTransition */ + target, nil, true, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { defer tsv.stats.QueryTimings.Record("ROLLBACK", time.Now()) logStats.TransactionID = transactionID @@ -476,7 +473,7 @@ func (tsv *TabletServer) Prepare(ctx context.Context, target *querypb.Target, tr return tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "Prepare", "prepare", nil, - target, nil, true, /* allowOnTransition */ + target, nil, true, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { txe := 
&TxExecutor{ ctx: ctx, @@ -493,7 +490,7 @@ func (tsv *TabletServer) CommitPrepared(ctx context.Context, target *querypb.Tar return tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "CommitPrepared", "commit_prepared", nil, - target, nil, true, /* allowOnTransition */ + target, nil, true, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { txe := &TxExecutor{ ctx: ctx, @@ -510,7 +507,7 @@ func (tsv *TabletServer) RollbackPrepared(ctx context.Context, target *querypb.T return tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "RollbackPrepared", "rollback_prepared", nil, - target, nil, true, /* allowOnTransition */ + target, nil, true, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { txe := &TxExecutor{ ctx: ctx, @@ -527,7 +524,7 @@ func (tsv *TabletServer) CreateTransaction(ctx context.Context, target *querypb. return tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "CreateTransaction", "create_transaction", nil, - target, nil, true, /* allowOnTransition */ + target, nil, true, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { txe := &TxExecutor{ ctx: ctx, @@ -545,7 +542,7 @@ func (tsv *TabletServer) StartCommit(ctx context.Context, target *querypb.Target return tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "StartCommit", "start_commit", nil, - target, nil, true, /* allowOnTransition */ + target, nil, true, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { txe := &TxExecutor{ ctx: ctx, @@ -563,7 +560,7 @@ func (tsv *TabletServer) SetRollback(ctx context.Context, target *querypb.Target return tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "SetRollback", "set_rollback", nil, - target, nil, true, /* allowOnTransition */ + target, nil, true, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { txe := &TxExecutor{ ctx: ctx, @@ -581,7 +578,7 @@ func (tsv *TabletServer) ConcludeTransaction(ctx context.Context, 
target *queryp return tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "ConcludeTransaction", "conclude_transaction", nil, - target, nil, true, /* allowOnTransition */ + target, nil, true, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { txe := &TxExecutor{ ctx: ctx, @@ -598,7 +595,7 @@ func (tsv *TabletServer) ReadTransaction(ctx context.Context, target *querypb.Ta err = tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "ReadTransaction", "read_transaction", nil, - target, nil, true, /* allowOnTransition */ + target, nil, true, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { txe := &TxExecutor{ ctx: ctx, @@ -622,11 +619,11 @@ func (tsv *TabletServer) Execute(ctx context.Context, target *querypb.Target, sq return nil, vterrors.New(vtrpcpb.Code_INTERNAL, "transactionID and reserveID must match if both are non-zero") } - allowOnTransition := transactionID != 0 + allowOnShutdown := transactionID != 0 err = tsv.execRequest( ctx, tsv.QueryTimeout.Get(), "Execute", sql, bindVariables, - target, options, allowOnTransition, + target, options, allowOnShutdown, func(ctx context.Context, logStats *tabletenv.LogStats) error { if bindVariables == nil { bindVariables = make(map[string]*querypb.BindVariable) @@ -673,7 +670,7 @@ func (tsv *TabletServer) StreamExecute(ctx context.Context, target *querypb.Targ return tsv.execRequest( ctx, 0, "StreamExecute", sql, bindVariables, - target, options, false, /* allowOnTransition */ + target, options, false, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { if bindVariables == nil { bindVariables = make(map[string]*querypb.BindVariable) @@ -730,14 +727,14 @@ func (tsv *TabletServer) ExecuteBatch(ctx context.Context, target *querypb.Targe } } - allowOnTransition := transactionID != 0 + allowOnShutdown := transactionID != 0 // TODO(sougou): Convert StartRequest/EndRequest pattern to use wrapper // function tsv.execRequest() instead. 
// Note that below we always return "err" right away and do not call // tsv.convertAndLogError. That's because the methods which returned "err", // e.g. tsv.Execute(), already called that function and therefore already // converted and logged the error. - if err = tsv.sm.StartRequest(ctx, target, allowOnTransition); err != nil { + if err = tsv.sm.StartRequest(ctx, target, allowOnShutdown); err != nil { return nil, err } defer tsv.sm.EndRequest() @@ -832,7 +829,7 @@ func (tsv *TabletServer) beginWaitForSameRangeTransactions(ctx context.Context, // -queryserver-config-txpool-timeout (defaults to 1s) to limit the waiting. ctx, tsv.QueryTimeout.Get(), "", "waitForSameRangeTransactions", nil, - target, options, false, /* allowOnTransition */ + target, options, false, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { k, table := tsv.computeTxSerializerKey(ctx, logStats, sql, bindVariables) if k == "" { @@ -909,7 +906,7 @@ func (tsv *TabletServer) MessageStream(ctx context.Context, target *querypb.Targ return tsv.execRequest( ctx, 0, "MessageStream", "stream", nil, - target, nil, false, /* allowOnTransition */ + target, nil, false, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { plan, err := tsv.qe.GetMessageStreamPlan(name) if err != nil { @@ -961,7 +958,7 @@ func (tsv *TabletServer) PurgeMessages(ctx context.Context, target *querypb.Targ } func (tsv *TabletServer) execDML(ctx context.Context, target *querypb.Target, queryGenerator func() (string, map[string]*querypb.BindVariable, error)) (count int64, err error) { - if err = tsv.sm.StartRequest(ctx, target, false /* allowOnTransition */); err != nil { + if err = tsv.sm.StartRequest(ctx, target, false /* allowOnShutdown */); err != nil { return 0, err } defer tsv.sm.EndRequest() @@ -1115,7 +1112,7 @@ func (tsv *TabletServer) Release(ctx context.Context, target *querypb.Target, tr func (tsv *TabletServer) execRequest( ctx context.Context, timeout 
time.Duration, requestName, sql string, bindVariables map[string]*querypb.BindVariable, - target *querypb.Target, options *querypb.ExecuteOptions, allowOnTransition bool, + target *querypb.Target, options *querypb.ExecuteOptions, allowOnShutdown bool, exec func(ctx context.Context, logStats *tabletenv.LogStats) error, ) (err error) { span, ctx := trace.NewSpan(ctx, "TabletServer."+requestName) @@ -1135,7 +1132,7 @@ func (tsv *TabletServer) execRequest( logStats.OriginalSQL = sql logStats.BindVariables = bindVariables defer tsv.handlePanicAndSendLogStats(sql, bindVariables, logStats) - if err = tsv.sm.StartRequest(ctx, target, allowOnTransition); err != nil { + if err = tsv.sm.StartRequest(ctx, target, allowOnShutdown); err != nil { return err } From e23cf2aa803d08f956bc88854b61ac179ceb68b7 Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Tue, 30 Jun 2020 16:24:57 -0700 Subject: [PATCH 17/19] vttablet: address review comments Signed-off-by: Sugu Sougoumarane --- go/vt/dbconfigs/dbconfigs.go | 5 +- go/vt/vttablet/tabletserver/state_manager.go | 25 +-- .../tabletserver/state_manager_test.go | 150 +++++++++--------- go/vt/vttablet/tabletserver/tabletserver.go | 4 +- .../tabletserver/tabletserver_test.go | 2 +- 5 files changed, 96 insertions(+), 90 deletions(-) diff --git a/go/vt/dbconfigs/dbconfigs.go b/go/vt/dbconfigs/dbconfigs.go index 8caef96abd2..98dba288d78 100644 --- a/go/vt/dbconfigs/dbconfigs.go +++ b/go/vt/dbconfigs/dbconfigs.go @@ -24,10 +24,11 @@ import ( "context" "encoding/json" "flag" - "fmt" "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/vt/log" + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" + "vitess.io/vitess/go/vt/vterrors" "vitess.io/vitess/go/yaml2" ) @@ -193,7 +194,7 @@ func (c Connector) Connect(ctx context.Context) (*mysql.Conn, error) { func (c Connector) MysqlParams() (*mysql.ConnParams, error) { if c.connParams == nil { // This is only possible during tests. 
- return nil, fmt.Errorf("parameters are empty") + return nil, vterrors.New(vtrpcpb.Code_INVALID_ARGUMENT, "parameters are empty") } params, err := withCredentials(c.connParams) if err != nil { diff --git a/go/vt/vttablet/tabletserver/state_manager.go b/go/vt/vttablet/tabletserver/state_manager.go index 67a3e44b712..832df33668d 100644 --- a/go/vt/vttablet/tabletserver/state_manager.go +++ b/go/vt/vttablet/tabletserver/state_manager.go @@ -17,11 +17,11 @@ limitations under the License. package tabletserver import ( + "context" "fmt" "sync" "time" - "golang.org/x/net/context" "vitess.io/vitess/go/history" "vitess.io/vitess/go/sync2" "vitess.io/vitess/go/vt/log" @@ -32,10 +32,12 @@ import ( "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" ) +type servingState int64 + const ( // StateNotConnected is the state where tabletserver is not // connected to an underlying mysql instance. - StateNotConnected = iota + StateNotConnected = servingState(iota) // StateNotServing is the state where tabletserver is connected // to an underlying mysql instance, but is not serving queries. StateNotServing @@ -81,9 +83,9 @@ type stateManager struct { // If a transition fails, we set retrying to true and launch // retryTransition which loops until the state converges. mu sync.Mutex - wantState int64 + wantState servingState wantTabletType topodatapb.TabletType - state int64 + state servingState target querypb.Target retrying bool // TODO(sougou): deprecate alsoAllow @@ -149,7 +151,7 @@ type txThrottler interface { // be honored. // If sm is already in the requested state, it returns stateChanged as // false. 
-func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, state int64, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { +func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, state servingState, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { defer sm.ExitLameduck() if tabletType == topodatapb.TabletType_RESTORE { @@ -168,7 +170,7 @@ func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, state i // state. If so, it acquires the semaphore and returns true. If a transition is // already in progress, it waits. If the desired state is already reached, it // returns false without acquiring the semaphore. -func (sm *stateManager) mustTransition(tabletType topodatapb.TabletType, state int64, alsoAllow []topodatapb.TabletType) bool { +func (sm *stateManager) mustTransition(tabletType topodatapb.TabletType, state servingState, alsoAllow []topodatapb.TabletType) bool { sm.transitioning.Acquire() sm.mu.Lock() defer sm.mu.Unlock() @@ -183,7 +185,7 @@ func (sm *stateManager) mustTransition(tabletType topodatapb.TabletType, state i return true } -func (sm *stateManager) execTransition(tabletType topodatapb.TabletType, state int64) error { +func (sm *stateManager) execTransition(tabletType topodatapb.TabletType, state servingState) error { defer sm.transitioning.Release() var err error @@ -244,6 +246,9 @@ func (sm *stateManager) recheckState() bool { return false } +// CheckMySQL verifies that we can connect to mysql. +// If it fails, then we shutdown the service and initiate +// the retry loop. func (sm *stateManager) CheckMySQL() { if !sm.checkMySQLThrottler.TryAcquire() { return @@ -473,7 +478,7 @@ func (sm *stateManager) setTimeBomb() chan struct{} { } // setState changes the state and logs the event. 
-func (sm *stateManager) setState(tabletType topodatapb.TabletType, state int64) { +func (sm *stateManager) setState(tabletType topodatapb.TabletType, state servingState) { sm.mu.Lock() defer sm.mu.Unlock() @@ -508,7 +513,7 @@ func (sm *stateManager) IsServing() bool { return sm.StateByName() == "SERVING" } -func (sm *stateManager) State() int64 { +func (sm *stateManager) State() servingState { sm.mu.Lock() defer sm.mu.Unlock() return sm.state @@ -531,7 +536,7 @@ func (sm *stateManager) StateByName() string { // stateInfo returns a string representation of the state and optional detail // about the reason for the state transition -func stateInfo(state int64) string { +func stateInfo(state servingState) string { if state == StateServing { return "SERVING" } diff --git a/go/vt/vttablet/tabletserver/state_manager_test.go b/go/vt/vttablet/tabletserver/state_manager_test.go index b73d699fd86..585f473742e 100644 --- a/go/vt/vttablet/tabletserver/state_manager_test.go +++ b/go/vt/vttablet/tabletserver/state_manager_test.go @@ -32,7 +32,7 @@ import ( ) func TestStateManagerStateByName(t *testing.T) { - states := []int64{ + states := []servingState{ StateNotConnected, StateNotServing, StateServing, @@ -61,24 +61,24 @@ func TestStateManagerServeMaster(t *testing.T) { assert.Equal(t, int32(0), sm.lameduck.Get()) - verifySubcomponent(t, sm.watcher, 1, testStateClosed) - verifySubcomponent(t, sm.hr, 2, testStateClosed) + verifySubcomponent(t, 1, sm.watcher, testStateClosed) + verifySubcomponent(t, 2, sm.hr, testStateClosed) - verifySubcomponent(t, sm.se, 3, testStateOpen) - verifySubcomponent(t, sm.vstreamer, 4, testStateOpen) - verifySubcomponent(t, sm.qe, 5, testStateOpen) - verifySubcomponent(t, sm.txThrottler, 6, testStateOpen) - verifySubcomponent(t, sm.hw, 7, testStateOpen) - verifySubcomponent(t, sm.tracker, 8, testStateOpen) - verifySubcomponent(t, sm.te, 9, testStateAcceptReadWrite) - verifySubcomponent(t, sm.messager, 10, testStateOpen) + verifySubcomponent(t, 3, 
sm.se, testStateOpen) + verifySubcomponent(t, 4, sm.vstreamer, testStateOpen) + verifySubcomponent(t, 5, sm.qe, testStateOpen) + verifySubcomponent(t, 6, sm.txThrottler, testStateOpen) + verifySubcomponent(t, 7, sm.hw, testStateOpen) + verifySubcomponent(t, 8, sm.tracker, testStateOpen) + verifySubcomponent(t, 9, sm.te, testStateAcceptReadWrite) + verifySubcomponent(t, 10, sm.messager, testStateOpen) assert.False(t, sm.se.(*testSchemaEngine).nonMaster) assert.True(t, sm.qe.(*testQueryEngine).isReachable) assert.False(t, sm.qe.(*testQueryEngine).stopServing) assert.Equal(t, topodatapb.TabletType_MASTER, sm.target.TabletType) - assert.Equal(t, int64(StateServing), sm.state) + assert.Equal(t, StateServing, sm.state) } func TestStateManagerServeNonMaster(t *testing.T) { @@ -87,21 +87,21 @@ func TestStateManagerServeNonMaster(t *testing.T) { require.NoError(t, err) assert.True(t, stateChanged) - verifySubcomponent(t, sm.messager, 1, testStateClosed) - verifySubcomponent(t, sm.tracker, 2, testStateClosed) - verifySubcomponent(t, sm.hw, 3, testStateClosed) + verifySubcomponent(t, 1, sm.messager, testStateClosed) + verifySubcomponent(t, 2, sm.tracker, testStateClosed) + verifySubcomponent(t, 3, sm.hw, testStateClosed) assert.True(t, sm.se.(*testSchemaEngine).nonMaster) - verifySubcomponent(t, sm.se, 4, testStateOpen) - verifySubcomponent(t, sm.vstreamer, 5, testStateOpen) - verifySubcomponent(t, sm.qe, 6, testStateOpen) - verifySubcomponent(t, sm.txThrottler, 7, testStateOpen) - verifySubcomponent(t, sm.te, 8, testStateAcceptReadOnly) - verifySubcomponent(t, sm.hr, 9, testStateOpen) - verifySubcomponent(t, sm.watcher, 10, testStateOpen) + verifySubcomponent(t, 4, sm.se, testStateOpen) + verifySubcomponent(t, 5, sm.vstreamer, testStateOpen) + verifySubcomponent(t, 6, sm.qe, testStateOpen) + verifySubcomponent(t, 7, sm.txThrottler, testStateOpen) + verifySubcomponent(t, 8, sm.te, testStateAcceptReadOnly) + verifySubcomponent(t, 9, sm.hr, testStateOpen) + 
verifySubcomponent(t, 10, sm.watcher, testStateOpen) assert.Equal(t, topodatapb.TabletType_REPLICA, sm.target.TabletType) - assert.Equal(t, int64(StateServing), sm.state) + assert.Equal(t, StateServing, sm.state) } func TestStateManagerUnserveMaster(t *testing.T) { @@ -110,23 +110,23 @@ func TestStateManagerUnserveMaster(t *testing.T) { require.NoError(t, err) assert.True(t, stateChanged) - verifySubcomponent(t, sm.messager, 1, testStateClosed) - verifySubcomponent(t, sm.te, 2, testStateClosed) + verifySubcomponent(t, 1, sm.messager, testStateClosed) + verifySubcomponent(t, 2, sm.te, testStateClosed) assert.True(t, sm.qe.(*testQueryEngine).stopServing) - verifySubcomponent(t, sm.watcher, 3, testStateClosed) - verifySubcomponent(t, sm.hr, 4, testStateClosed) + verifySubcomponent(t, 3, sm.watcher, testStateClosed) + verifySubcomponent(t, 4, sm.hr, testStateClosed) - verifySubcomponent(t, sm.se, 5, testStateOpen) - verifySubcomponent(t, sm.vstreamer, 6, testStateOpen) - verifySubcomponent(t, sm.qe, 7, testStateOpen) - verifySubcomponent(t, sm.txThrottler, 8, testStateOpen) + verifySubcomponent(t, 5, sm.se, testStateOpen) + verifySubcomponent(t, 6, sm.vstreamer, testStateOpen) + verifySubcomponent(t, 7, sm.qe, testStateOpen) + verifySubcomponent(t, 8, sm.txThrottler, testStateOpen) - verifySubcomponent(t, sm.hw, 9, testStateOpen) - verifySubcomponent(t, sm.tracker, 10, testStateOpen) + verifySubcomponent(t, 9, sm.hw, testStateOpen) + verifySubcomponent(t, 10, sm.tracker, testStateOpen) assert.Equal(t, topodatapb.TabletType_MASTER, sm.target.TabletType) - assert.Equal(t, int64(StateNotServing), sm.state) + assert.Equal(t, StateNotServing, sm.state) } func TestStateManagerUnserveNonmaster(t *testing.T) { @@ -135,24 +135,24 @@ func TestStateManagerUnserveNonmaster(t *testing.T) { require.NoError(t, err) assert.True(t, stateChanged) - verifySubcomponent(t, sm.messager, 1, testStateClosed) - verifySubcomponent(t, sm.te, 2, testStateClosed) + verifySubcomponent(t, 1, 
sm.messager, testStateClosed) + verifySubcomponent(t, 2, sm.te, testStateClosed) assert.True(t, sm.qe.(*testQueryEngine).stopServing) - verifySubcomponent(t, sm.tracker, 3, testStateClosed) - verifySubcomponent(t, sm.hw, 4, testStateClosed) + verifySubcomponent(t, 3, sm.tracker, testStateClosed) + verifySubcomponent(t, 4, sm.hw, testStateClosed) assert.True(t, sm.se.(*testSchemaEngine).nonMaster) - verifySubcomponent(t, sm.se, 5, testStateOpen) - verifySubcomponent(t, sm.vstreamer, 6, testStateOpen) - verifySubcomponent(t, sm.qe, 7, testStateOpen) - verifySubcomponent(t, sm.txThrottler, 8, testStateOpen) + verifySubcomponent(t, 5, sm.se, testStateOpen) + verifySubcomponent(t, 6, sm.vstreamer, testStateOpen) + verifySubcomponent(t, 7, sm.qe, testStateOpen) + verifySubcomponent(t, 8, sm.txThrottler, testStateOpen) - verifySubcomponent(t, sm.hr, 9, testStateOpen) - verifySubcomponent(t, sm.watcher, 10, testStateOpen) + verifySubcomponent(t, 9, sm.hr, testStateOpen) + verifySubcomponent(t, 10, sm.watcher, testStateOpen) assert.Equal(t, topodatapb.TabletType_RDONLY, sm.target.TabletType) - assert.Equal(t, int64(StateNotServing), sm.state) + assert.Equal(t, StateNotServing, sm.state) } func TestStateManagerClose(t *testing.T) { @@ -161,21 +161,21 @@ func TestStateManagerClose(t *testing.T) { require.NoError(t, err) assert.True(t, stateChanged) - verifySubcomponent(t, sm.messager, 1, testStateClosed) - verifySubcomponent(t, sm.te, 2, testStateClosed) + verifySubcomponent(t, 1, sm.messager, testStateClosed) + verifySubcomponent(t, 2, sm.te, testStateClosed) assert.True(t, sm.qe.(*testQueryEngine).stopServing) - verifySubcomponent(t, sm.txThrottler, 3, testStateClosed) - verifySubcomponent(t, sm.qe, 4, testStateClosed) - verifySubcomponent(t, sm.watcher, 5, testStateClosed) - verifySubcomponent(t, sm.tracker, 6, testStateClosed) - verifySubcomponent(t, sm.vstreamer, 7, testStateClosed) - verifySubcomponent(t, sm.hr, 8, testStateClosed) - verifySubcomponent(t, sm.hw, 9, 
testStateClosed) - verifySubcomponent(t, sm.se, 10, testStateClosed) + verifySubcomponent(t, 3, sm.txThrottler, testStateClosed) + verifySubcomponent(t, 4, sm.qe, testStateClosed) + verifySubcomponent(t, 5, sm.watcher, testStateClosed) + verifySubcomponent(t, 6, sm.tracker, testStateClosed) + verifySubcomponent(t, 7, sm.vstreamer, testStateClosed) + verifySubcomponent(t, 8, sm.hr, testStateClosed) + verifySubcomponent(t, 9, sm.hw, testStateClosed) + verifySubcomponent(t, 10, sm.se, testStateClosed) assert.Equal(t, topodatapb.TabletType_RDONLY, sm.target.TabletType) - assert.Equal(t, int64(StateNotConnected), sm.state) + assert.Equal(t, StateNotConnected, sm.state) } func TestStateManagerStopService(t *testing.T) { @@ -185,11 +185,11 @@ func TestStateManagerStopService(t *testing.T) { assert.True(t, stateChanged) assert.Equal(t, topodatapb.TabletType_REPLICA, sm.target.TabletType) - assert.Equal(t, int64(StateServing), sm.state) + assert.Equal(t, StateServing, sm.state) sm.StopService() assert.Equal(t, topodatapb.TabletType_REPLICA, sm.target.TabletType) - assert.Equal(t, int64(StateNotConnected), sm.state) + assert.Equal(t, StateNotConnected, sm.state) } // testWatcher is used as a hook to invoke another transition @@ -229,7 +229,7 @@ func TestStateManagerSetServingTypeRace(t *testing.T) { // End state should be the final desired state. 
assert.Equal(t, topodatapb.TabletType_RDONLY, sm.target.TabletType) - assert.Equal(t, int64(StateNotServing), sm.state) + assert.Equal(t, StateNotServing, sm.state) } func TestStateManagerSetServingTypeNoChange(t *testing.T) { @@ -242,21 +242,21 @@ func TestStateManagerSetServingTypeNoChange(t *testing.T) { require.NoError(t, err) assert.False(t, stateChanged) - verifySubcomponent(t, sm.messager, 1, testStateClosed) - verifySubcomponent(t, sm.tracker, 2, testStateClosed) - verifySubcomponent(t, sm.hw, 3, testStateClosed) + verifySubcomponent(t, 1, sm.messager, testStateClosed) + verifySubcomponent(t, 2, sm.tracker, testStateClosed) + verifySubcomponent(t, 3, sm.hw, testStateClosed) assert.True(t, sm.se.(*testSchemaEngine).nonMaster) - verifySubcomponent(t, sm.se, 4, testStateOpen) - verifySubcomponent(t, sm.vstreamer, 5, testStateOpen) - verifySubcomponent(t, sm.qe, 6, testStateOpen) - verifySubcomponent(t, sm.txThrottler, 7, testStateOpen) - verifySubcomponent(t, sm.te, 8, testStateAcceptReadOnly) - verifySubcomponent(t, sm.hr, 9, testStateOpen) - verifySubcomponent(t, sm.watcher, 10, testStateOpen) + verifySubcomponent(t, 4, sm.se, testStateOpen) + verifySubcomponent(t, 5, sm.vstreamer, testStateOpen) + verifySubcomponent(t, 6, sm.qe, testStateOpen) + verifySubcomponent(t, 7, sm.txThrottler, testStateOpen) + verifySubcomponent(t, 8, sm.te, testStateAcceptReadOnly) + verifySubcomponent(t, 9, sm.hr, testStateOpen) + verifySubcomponent(t, 10, sm.watcher, testStateOpen) assert.Equal(t, topodatapb.TabletType_REPLICA, sm.target.TabletType) - assert.Equal(t, int64(StateServing), sm.state) + assert.Equal(t, StateServing, sm.state) } func TestStateManagerTransitionFailRetry(t *testing.T) { @@ -281,7 +281,7 @@ func TestStateManagerTransitionFailRetry(t *testing.T) { } assert.Equal(t, topodatapb.TabletType_MASTER, sm.Target().TabletType) - assert.Equal(t, int64(StateServing), sm.State()) + assert.Equal(t, StateServing, sm.State()) } func TestStateManagerRestoreType(t 
*testing.T) { @@ -293,7 +293,7 @@ func TestStateManagerRestoreType(t *testing.T) { assert.Equal(t, topodatapb.TabletType_RESTORE, sm.target.TabletType) // RESTORE can only be in StateNotConnected. - assert.Equal(t, int64(StateNotConnected), sm.state) + assert.Equal(t, StateNotConnected, sm.state) } func TestStateManagerCheckMySQL(t *testing.T) { @@ -338,7 +338,7 @@ func TestStateManagerCheckMySQL(t *testing.T) { } assert.Equal(t, topodatapb.TabletType_MASTER, sm.Target().TabletType) - assert.Equal(t, int64(StateServing), sm.State()) + assert.Equal(t, StateServing, sm.State()) } func TestStateManagerValidations(t *testing.T) { @@ -431,10 +431,10 @@ func TestStateManagerWaitForRequests(t *testing.T) { } break } - assert.Equal(t, int64(StateNotConnected), sm.State()) + assert.Equal(t, StateNotConnected, sm.State()) } -func verifySubcomponent(t *testing.T, component interface{}, order int64, state testState) { +func verifySubcomponent(t *testing.T, order int64, component interface{}, state testState) { tos := component.(orderState) assert.Equal(t, order, tos.Order()) assert.Equal(t, state, tos.State()) diff --git a/go/vt/vttablet/tabletserver/tabletserver.go b/go/vt/vttablet/tabletserver/tabletserver.go index bfe03e36075..6c9a8865210 100644 --- a/go/vt/vttablet/tabletserver/tabletserver.go +++ b/go/vt/vttablet/tabletserver/tabletserver.go @@ -182,7 +182,7 @@ func NewTabletServer(name string, config *tabletenv.TabletConfig, topoServer *to timebombDuration: time.Duration(config.OltpReadPool.TimeoutSeconds * 10), } - tsv.exporter.NewGaugeFunc("TabletState", "Tablet server state", tsv.sm.State) + tsv.exporter.NewGaugeFunc("TabletState", "Tablet server state", func() int64 { return int64(tsv.sm.State()) }) tsv.exporter.Publish("TabletStateName", stats.StringFunc(tsv.sm.StateByName)) // TabletServerState exports the same information as the above two stats (TabletState / TabletStateName), @@ -311,7 +311,7 @@ func (tsv *TabletServer) InitACL(tableACLConfigFile string, 
enforceTableACLConfi // should also be honored for serving. // Returns true if the state of QueryService or the tablet type changed. func (tsv *TabletServer) SetServingType(tabletType topodatapb.TabletType, serving bool, alsoAllow []topodatapb.TabletType) (stateChanged bool, err error) { - state := int64(StateNotServing) + state := StateNotServing if serving { state = StateServing } diff --git a/go/vt/vttablet/tabletserver/tabletserver_test.go b/go/vt/vttablet/tabletserver/tabletserver_test.go index ae4927989c9..718d08e9de9 100644 --- a/go/vt/vttablet/tabletserver/tabletserver_test.go +++ b/go/vt/vttablet/tabletserver/tabletserver_test.go @@ -2162,7 +2162,7 @@ func setupTabletServerTest(t *testing.T) (*fakesqldb.DB, *TabletServer) { func setupTabletServerTestCustom(t *testing.T, config *tabletenv.TabletConfig) (*fakesqldb.DB, *TabletServer) { db := setupFakeDB(t) tsv := NewTabletServer("TabletServerTest", config, memorytopo.NewServer(""), topodatapb.TabletAlias{}) - require.Equal(t, int64(StateNotConnected), tsv.sm.State()) + require.Equal(t, StateNotConnected, tsv.sm.State()) dbcfgs := newDBConfigs(db) target := querypb.Target{TabletType: topodatapb.TabletType_MASTER} err := tsv.StartService(target, dbcfgs) From dbc50a0c040546d4690ead74969620c37a4c06f2 Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Wed, 1 Jul 2020 19:04:18 -0700 Subject: [PATCH 18/19] fix golangci errors Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/heartbeat/writer.go | 1 - go/vt/vttablet/tabletserver/state_manager_test.go | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/go/vt/vttablet/heartbeat/writer.go b/go/vt/vttablet/heartbeat/writer.go index 10b9ae975d2..0c212133ff9 100644 --- a/go/vt/vttablet/heartbeat/writer.go +++ b/go/vt/vttablet/heartbeat/writer.go @@ -45,7 +45,6 @@ const ( ts BIGINT UNSIGNED NOT NULL ) engine=InnoDB` sqlUpsertHeartbeat = "INSERT INTO %s.heartbeat (ts, tabletUid, keyspaceShard) VALUES (%a, %a, %a) ON DUPLICATE KEY UPDATE ts=VALUES(ts), 
tabletUid=VALUES(tabletUid)" - sqlUpdateHeartbeat = "UPDATE %s.heartbeat SET ts=%a, tabletUid=%a WHERE keyspaceShard=%a" ) var withDDL = withddl.New([]string{ diff --git a/go/vt/vttablet/tabletserver/state_manager_test.go b/go/vt/vttablet/tabletserver/state_manager_test.go index 585f473742e..75ba1eba510 100644 --- a/go/vt/vttablet/tabletserver/state_manager_test.go +++ b/go/vt/vttablet/tabletserver/state_manager_test.go @@ -474,10 +474,9 @@ var order sync2.AtomicInt64 type testState int const ( - testStateUnknown = testState(iota) + _ = testState(iota) testStateOpen testStateClosed - testStateMakeNonMaster testStateAcceptReadOnly testStateAcceptReadWrite ) From 5ebab3a4d1bdc395c755e20bb38fa9d43671e05b Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Mon, 13 Jul 2020 17:39:34 -0700 Subject: [PATCH 19/19] vttablet: address review comments Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/tabletserver/state_manager_test.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/go/vt/vttablet/tabletserver/state_manager_test.go b/go/vt/vttablet/tabletserver/state_manager_test.go index 75ba1eba510..89ee61cbe71 100644 --- a/go/vt/vttablet/tabletserver/state_manager_test.go +++ b/go/vt/vttablet/tabletserver/state_manager_test.go @@ -270,6 +270,16 @@ func TestStateManagerTransitionFailRetry(t *testing.T) { require.Error(t, err) assert.True(t, stateChanged) + // Calling retryTransition while retrying should be a no-op. + sm.retryTransition("") + + // Steal the lock and wait long enough for the retry + // to fail, and then release it. The retry will have + // to keep retrying. + sm.transitioning.Acquire() + time.Sleep(30 * time.Millisecond) + sm.transitioning.Release() + for { sm.mu.Lock() retrying := sm.retrying @@ -310,6 +320,9 @@ func TestStateManagerCheckMySQL(t *testing.T) { order.Set(0) sm.CheckMySQL() + // Rechecking immediately should be a no-op: + sm.CheckMySQL() + // Wait for closeAll to get under way. for { if order.Get() >= 1 {