From 11a8809c48351a3ea1533d4e5e86af3357ed240c Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Tue, 30 Nov 2021 20:41:17 -0500 Subject: [PATCH 1/4] Mark a replica as unhealthy when the repl lag is unknown This prevents us from serving queries from the replica that may be beyond the defined -unhealthy_threshold for the tablet. Signed-off-by: Matt Lord --- go/mysql/flavor.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/go/mysql/flavor.go b/go/mysql/flavor.go index 2902308d843..97265888bec 100644 --- a/go/mysql/flavor.go +++ b/go/mysql/flavor.go @@ -19,12 +19,14 @@ package mysql import ( "errors" "fmt" + "math" "strconv" "strings" "context" "vitess.io/vitess/go/sqltypes" + "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/proto/vtrpc" "vitess.io/vitess/go/vt/vterrors" ) @@ -315,7 +317,11 @@ func parseReplicationStatus(fields map[string]string) ReplicationStatus { status.SourcePort = int(parseInt) parseInt, _ = strconv.ParseInt(fields["Connect_Retry"], 10, 0) status.ConnectRetry = int(parseInt) - parseUint, _ := strconv.ParseUint(fields["Seconds_Behind_Master"], 10, 0) + parseUint, err := strconv.ParseUint(fields["Seconds_Behind_Master"], 10, 0) + if err != nil { + log.Errorf("Could not compute replica lag from seconds_behind_master value of '%s', this means that replication is unhealthy; setting replica lag to max value to prevent the serving of queries", fields["Seconds_Behind_Master"]) + parseUint = math.MaxUint32 + } status.ReplicationLagSeconds = uint(parseUint) parseUint, _ = strconv.ParseUint(fields["Master_Server_Id"], 10, 0) status.SourceServerID = uint(parseUint) From ef39eb8ed4084c7aa06d3d1740745c9ac4a6b645 Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Wed, 1 Dec 2021 12:18:13 -0500 Subject: [PATCH 2/4] Add e2e test for replica health when its source is gone Signed-off-by: Matt Lord --- go/test/endtoend/tabletmanager/main_test.go | 32 +++++----- .../tabletmanager/tablet_health_test.go | 58 +++++++++++++++++-- 2 files changed, 69 insertions(+), 21 deletions(-) diff --git a/go/test/endtoend/tabletmanager/main_test.go b/go/test/endtoend/tabletmanager/main_test.go index 08770665c49..5ac97fe4f20 100644 --- a/go/test/endtoend/tabletmanager/main_test.go +++ b/go/test/endtoend/tabletmanager/main_test.go @@ -35,21 +35,22 @@ import ( ) var ( - clusterInstance *cluster.LocalProcessCluster - tmClient *tmc.Client - primaryTabletParams mysql.ConnParams - replicaTabletParams mysql.ConnParams - primaryTablet cluster.Vttablet - replicaTablet cluster.Vttablet - rdonlyTablet cluster.Vttablet - hostname = "localhost" - keyspaceName = "ks" - shardName = "0" - keyspaceShard = "ks/" + shardName - dbName = "vt_" + keyspaceName - username = "vt_dba" - cell = "zone1" - sqlSchema = ` + clusterInstance *cluster.LocalProcessCluster + tmClient *tmc.Client + primaryTabletParams mysql.ConnParams + replicaTabletParams mysql.ConnParams + primaryTablet cluster.Vttablet + replicaTablet cluster.Vttablet + rdonlyTablet cluster.Vttablet + hostname = "localhost" + keyspaceName = "ks" + shardName = "0" + keyspaceShard = "ks/" + shardName + dbName = "vt_" + keyspaceName + username = "vt_dba" + cell = "zone1" + tabletHealthcheckRefreshInterval = 5 * time.Second + sqlSchema = ` create table t1( id bigint, value varchar(16), @@ -99,6 +100,7 @@ func TestMain(m *testing.M) { "-lock_tables_timeout", "5s", "-watch_replication_stream", "-enable_replication_reporter", + "-health_check_interval", tabletHealthcheckRefreshInterval.String(), } // We do not need semiSync for this test case. clusterInstance.EnableSemiSync = false diff --git a/go/test/endtoend/tabletmanager/tablet_health_test.go b/go/test/endtoend/tabletmanager/tablet_health_test.go index 81946a1edea..d7afdb44615 100644 --- a/go/test/endtoend/tabletmanager/tablet_health_test.go +++ b/go/test/endtoend/tabletmanager/tablet_health_test.go @@ -24,6 +24,7 @@ import ( "strings" "sync" "testing" + "time" "github.com/stretchr/testify/require" @@ -134,7 +135,7 @@ func TestHealthCheck(t *testing.T) { // make sure the health stream is updated result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("VtTabletStreamHealth", "-count", "1", rTablet.Alias) require.NoError(t, err) - verifyStreamHealth(t, result) + verifyStreamHealth(t, result, true) // then restart replication, make sure we stay healthy err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", rTablet.Alias) @@ -148,7 +149,47 @@ func TestHealthCheck(t *testing.T) { require.NoError(t, err) scanner := bufio.NewScanner(strings.NewReader(result)) for scanner.Scan() { - verifyStreamHealth(t, scanner.Text()) + verifyStreamHealth(t, scanner.Text(), true) + } + + // stop the replica's source mysqld instance to break replication + // and test that the tablet becomes unhealthy and non-serving + err = primaryTablet.MysqlctlProcess.Stop() + require.NoError(t, err) + + time.Sleep(tabletHealthcheckRefreshInterval) + + // now the replica's VtTabletStreamHealth should show it as unhealthy + result, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("VtTabletStreamHealth", "-count", "1", rTablet.Alias) + require.NoError(t, err) + scanner = bufio.NewScanner(strings.NewReader(result)) + for scanner.Scan() { + verifyStreamHealth(t, scanner.Text(), false) + } + + // start the primary tablet's mysqld back up + primaryTablet.MysqlctlProcess.InitMysql = false + err = primaryTablet.MysqlctlProcess.Start() + primaryTablet.MysqlctlProcess.InitMysql = true + require.NoError(t, err) + + // explicitly start replication on all of the replicas to avoid any test flakiness as they were all + // replicating from the primary instance + err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", rTablet.Alias) + require.NoError(t, err) + err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", replicaTablet.Alias) + require.NoError(t, err) + err = clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", rdonlyTablet.Alias) + require.NoError(t, err) + + time.Sleep(tabletHealthcheckRefreshInterval) + + // now the replica's VtTabletStreamHealth should show it as healthy again + result, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("VtTabletStreamHealth", "-count", "1", rTablet.Alias) + require.NoError(t, err) + scanner = bufio.NewScanner(strings.NewReader(result)) + for scanner.Scan() { + verifyStreamHealth(t, scanner.Text(), true) } // Manual cleanup of processes @@ -183,7 +224,7 @@ func checkTabletType(t *testing.T, tabletAlias string, typeWant string) { assert.Equal(t, want, got) } -func verifyStreamHealth(t *testing.T, result string) { +func verifyStreamHealth(t *testing.T, result string, expectHealthy bool) { var streamHealthResponse querypb.StreamHealthResponse err := json2.Unmarshal([]byte(result), &streamHealthResponse) require.NoError(t, err) @@ -191,10 +232,15 @@ func verifyStreamHealth(t *testing.T, result string) { UID := streamHealthResponse.GetTabletAlias().GetUid() realTimeStats := streamHealthResponse.GetRealtimeStats() replicationLagSeconds := realTimeStats.GetReplicationLagSeconds() - assert.True(t, serving, "Tablet should be in serving state") assert.True(t, UID > 0, "Tablet should contain uid") - // replicationLagSeconds varies till 7200 so setting safe limit - assert.True(t, replicationLagSeconds < 10000, "replica should not be behind primary") + if expectHealthy { + assert.True(t, serving, "Tablet should be in serving state") + // replicationLagSeconds varies till 7200 so setting safe limit + assert.True(t, replicationLagSeconds < 10000, "replica should not be behind primary") + } else { + assert.False(t, serving, "Tablet should not be in serving state") + assert.False(t, replicationLagSeconds < 10000, "replica should be behind primary") + } } func TestHealthCheckDrainedStateDoesNotShutdownQueryService(t *testing.T) { From c5143d84164ab80571b603c00716b7ecc1248bc8 Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Thu, 2 Dec 2021 15:53:34 -0500 Subject: [PATCH 3/4] Calculate replica lag ourselves when it's unknown in MySQL This gives us the most accurate estimate possible -- and seconds_behind_master is already fuzzy anyway -- of the replica lag when the replica cannot talk to its source and is constantly retrying and in a state where Slave_IO_Running == Connecting which we consider to be equivalent to running (see parseReplicationStatus() in flavor.go) as reconnects happen regularly anyway e.g. due to slave_net_timeout. Signed-off-by: Matt Lord --- go/mysql/flavor.go | 11 ++++++----- go/mysql/replication_status.go | 4 ++++ go/vt/vttablet/tabletserver/repltracker/poller.go | 7 ++++++- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/go/mysql/flavor.go b/go/mysql/flavor.go index 97265888bec..7204a36a916 100644 --- a/go/mysql/flavor.go +++ b/go/mysql/flavor.go @@ -19,14 +19,12 @@ package mysql import ( "errors" "fmt" - "math" "strconv" "strings" "context" "vitess.io/vitess/go/sqltypes" - "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/proto/vtrpc" "vitess.io/vitess/go/vt/vterrors" ) @@ -319,10 +317,13 @@ func parseReplicationStatus(fields map[string]string) ReplicationStatus { status.ConnectRetry = int(parseInt) parseUint, err := strconv.ParseUint(fields["Seconds_Behind_Master"], 10, 0) if err != nil { - log.Errorf("Could not compute replica lag from seconds_behind_master value of '%s', this means that replication is unhealthy; setting replica lag to max value to prevent the serving of queries", fields["Seconds_Behind_Master"]) - parseUint = math.MaxUint32 + // we could not parse the value into a valid uint -- most commonly because the value is NULL from the + // database -- so let's reflect that the underlying value was unknown on our last check + status.ReplicationLagUnknown = true + } else { + status.ReplicationLagUnknown = false + status.ReplicationLagSeconds = uint(parseUint) } - status.ReplicationLagSeconds = uint(parseUint) parseUint, _ = strconv.ParseUint(fields["Master_Server_Id"], 10, 0) status.SourceServerID = uint(parseUint) diff --git a/go/mysql/replication_status.go b/go/mysql/replication_status.go index 801d38a680f..5decc99033e 100644 --- a/go/mysql/replication_status.go +++ b/go/mysql/replication_status.go @@ -30,6 +30,9 @@ type ReplicationStatus struct { // were to finish executing everything that's currently in its relay log. // However, some MySQL flavors don't expose this information, // in which case RelayLogPosition.IsZero() will be true. + // If ReplicationLagUnknown is true then we should not rely on the seconds + // behind value and we can instead try to calcuate the lag ourselves when + // appropriate. RelayLogPosition Position FilePosition Position FileRelayLogPosition Position @@ -37,6 +40,7 @@ type ReplicationStatus struct { IOThreadRunning bool SQLThreadRunning bool ReplicationLagSeconds uint + ReplicationLagUnknown bool SourceHost string SourcePort int ConnectRetry int diff --git a/go/vt/vttablet/tabletserver/repltracker/poller.go b/go/vt/vttablet/tabletserver/repltracker/poller.go index 995223dc28d..5a9112be82b 100644 --- a/go/vt/vttablet/tabletserver/repltracker/poller.go +++ b/go/vt/vttablet/tabletserver/repltracker/poller.go @@ -50,7 +50,12 @@ func (p *poller) Status() (time.Duration, error) { return 0, err } - if !status.ReplicationRunning() { + // If replication is not currently running or we don't know what the lag is -- most commonly + // because the replica mysqld is in the process of trying to start replicating from its source + // but it hasn't yet reached the point where it can calculate the seconds_behind_master + // value and it's thus NULL -- then we will estimate the lag ourselves using the last seen + // value + the time elapsed since. + if !status.ReplicationRunning() || status.ReplicationLagUnknown { if p.timeRecorded.IsZero() { return 0, vterrors.Errorf(vtrpcpb.Code_UNAVAILABLE, "replication is not running") } From 31560fdb23c6170dcc16e07748c086596a3e72fc Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Thu, 2 Dec 2021 16:08:32 -0500 Subject: [PATCH 4/4] Update tablet healthcheck test based on new replica lag logic Signed-off-by: Matt Lord --- go/mysql/replication_status.go | 2 +- go/test/endtoend/tabletmanager/main_test.go | 7 ++++++- go/test/endtoend/tabletmanager/tablet_health_test.go | 8 ++++---- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/go/mysql/replication_status.go b/go/mysql/replication_status.go index 5decc99033e..2c818dcb537 100644 --- a/go/mysql/replication_status.go +++ b/go/mysql/replication_status.go @@ -31,7 +31,7 @@ type ReplicationStatus struct { // However, some MySQL flavors don't expose this information, // in which case RelayLogPosition.IsZero() will be true. // If ReplicationLagUnknown is true then we should not rely on the seconds - // behind value and we can instead try to calcuate the lag ourselves when + // behind value and we can instead try to calculate the lag ourselves when // appropriate. RelayLogPosition Position FilePosition Position diff --git a/go/test/endtoend/tabletmanager/main_test.go b/go/test/endtoend/tabletmanager/main_test.go index 5ac97fe4f20..9c118465edc 100644 --- a/go/test/endtoend/tabletmanager/main_test.go +++ b/go/test/endtoend/tabletmanager/main_test.go @@ -50,6 +50,7 @@ var ( username = "vt_dba" cell = "zone1" tabletHealthcheckRefreshInterval = 5 * time.Second + tabletUnhealthyThreshold = tabletHealthcheckRefreshInterval * 2 sqlSchema = ` create table t1( id bigint, @@ -94,13 +95,17 @@ func TestMain(m *testing.M) { } // List of users authorized to execute vschema ddl operations - clusterInstance.VtGateExtraArgs = []string{"-vschema_ddl_authorized_users=%"} + clusterInstance.VtGateExtraArgs = []string{ + "-vschema_ddl_authorized_users=%", + "-discovery_low_replication_lag", tabletUnhealthyThreshold.String(), + } // Set extra tablet args for lock timeout clusterInstance.VtTabletExtraArgs = []string{ "-lock_tables_timeout", "5s", "-watch_replication_stream", "-enable_replication_reporter", "-health_check_interval", tabletHealthcheckRefreshInterval.String(), + "-unhealthy_threshold", tabletUnhealthyThreshold.String(), } // We do not need semiSync for this test case. clusterInstance.EnableSemiSync = false diff --git a/go/test/endtoend/tabletmanager/tablet_health_test.go b/go/test/endtoend/tabletmanager/tablet_health_test.go index d7afdb44615..a27ab5a3d47 100644 --- a/go/test/endtoend/tabletmanager/tablet_health_test.go +++ b/go/test/endtoend/tabletmanager/tablet_health_test.go @@ -153,11 +153,12 @@ func TestHealthCheck(t *testing.T) { } // stop the replica's source mysqld instance to break replication - // and test that the tablet becomes unhealthy and non-serving + // and test that the replica tablet becomes unhealthy and non-serving after crossing + // the tablet's -unhealthy_threshold and the gateway's -discovery_low_replication_lag err = primaryTablet.MysqlctlProcess.Stop() require.NoError(t, err) - time.Sleep(tabletHealthcheckRefreshInterval) + time.Sleep(tabletUnhealthyThreshold + tabletHealthcheckRefreshInterval) // now the replica's VtTabletStreamHealth should show it as unhealthy result, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("VtTabletStreamHealth", "-count", "1", rTablet.Alias) @@ -238,8 +239,7 @@ func verifyStreamHealth(t *testing.T, result string, expectHealthy bool) { // replicationLagSeconds varies till 7200 so setting safe limit assert.True(t, replicationLagSeconds < 10000, "replica should not be behind primary") } else { - assert.False(t, serving, "Tablet should not be in serving state") - assert.False(t, replicationLagSeconds < 10000, "replica should be behind primary") + assert.True(t, (!serving || replicationLagSeconds >= uint32(tabletUnhealthyThreshold.Seconds())), "Tablet should not be in serving and healthy state") } }